In [33]:
!pip install lime
!pip install gradio
!pip install skipthoughts
!pip install vaderSentiment
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [34]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import nltk
import pandas as pd
from nltk.util import ngrams
import pickle
from torch.autograd import Variable
import sys
from nltk.corpus import stopwords
import re
from sklearn import feature_extraction
from tqdm import tqdm

In [35]:
# Mount Google Drive so the files under MyDrive/Data can be read.
# Later cells open them through relative paths ('gdrive/MyDrive/Data/...'),
# which resolve under this mount point only when the working directory is /content.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [36]:
# Load the word -> id mapping and the pickled `biskip` encoder that
# neural_features() calls on batches of word ids.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load these files when they come from a trusted source.
# `with` guarantees the file handles are closed once the objects are loaded.
with open('gdrive/MyDrive/Data/my_dict.pkl', 'rb') as dict_file:
  my_dict = pickle.load(dict_file)
with open('gdrive/MyDrive/Data/biskip.pkl', 'rb') as encoder_file:
  biskip = pickle.load(encoder_file)

In [37]:
def neural_features(headlines,bodies,batch_size=50):
  """Encode headline/body pairs with `biskip` and combine them pairwise.

  Each text is split on whitespace, terminated with '<eos>' and mapped to ids
  through `my_dict` (words missing from the mapping keep id 0, which is also
  the padding id). The id matrices are pushed through `biskip` in batches and
  the resulting vectors are combined into two feature groups.

  Args:
    headlines: indexable sequence of headline strings.
    bodies: indexable sequence of body strings; bodies[i] is paired with
      headlines[i].
    batch_size: number of rows sent to `biskip` per call (default 50).

  Returns:
    np.ndarray of shape (len(headlines), 4800): the element-wise product of
    the two encodings followed by their element-wise absolute difference.
    An empty input yields an array of shape (0, 4800).
  """
  encoding_dim = 2400  # width of one biskip sentence vector
  n_pairs = len(headlines)
  if n_pairs == 0:
    # Nothing to encode; avoid calling max() on an empty sequence.
    return np.zeros((0, 2 * encoding_dim))

  # Row width is the longest text's *character* count + 1: a safe upper bound
  # on its token count plus the '<eos>' marker.
  # NOTE(review): the padded width is what biskip receives, so it is kept as is.
  max_head = max(len(ele) for ele in headlines)
  max_body = max(len(ele) for ele in bodies)
  # Integer dtype up front so the tensors below need no float -> long cast.
  headlines_to_ids = np.zeros((n_pairs, max_head + 1), dtype=np.int64)
  bodies_to_ids = np.zeros((len(bodies), max_body + 1), dtype=np.int64)

  def _fill_ids(row, text):
    # Write the ids of `text`'s tokens (plus '<eos>') into `row` in place.
    for j, word in enumerate(text.split() + ['<eos>']):
      try:
        row[j] = my_dict[word]
      except KeyError:
        pass  # out-of-vocabulary word: leave id 0

  for i in range(n_pairs):
    _fill_ids(headlines_to_ids[i], headlines[i])
    _fill_ids(bodies_to_ids[i], bodies[i])

  headlines_encodings = np.zeros((n_pairs, encoding_dim))
  bodies_encodings = np.zeros((len(bodies), encoding_dim))
  # The strided range already covers a final partial batch (slices clamp at
  # the end of the array), so no separate remainder pass is needed.
  with torch.no_grad():
    for start in range(0, n_pairs, batch_size):
      stop = start + batch_size
      head_batch = torch.tensor(headlines_to_ids[start:stop], dtype=torch.long)
      body_batch = torch.tensor(bodies_to_ids[start:stop], dtype=torch.long)
      headlines_encodings[start:stop] = biskip(head_batch).detach().numpy()
      bodies_encodings[start:stop] = biskip(body_batch).detach().numpy()

  # Only the first n_pairs body rows are paired with a headline.
  paired_bodies = bodies_encodings[:n_pairs]
  product = headlines_encodings * paired_bodies
  abs_difference = np.abs(headlines_encodings - paired_bodies)
  return np.concatenate((product, abs_difference), axis=1)


In [38]:

# Single WordNet lemmatizer instance, created once and reused by normalize_word.
_wnl = nltk.WordNetLemmatizer()


def normalize_word(w):
    """Return the WordNet lemma of ``w``, lower-cased.

    The token is lemmatized exactly as given; lower-casing is applied to the
    lemma afterwards.
    """
    lemma = _wnl.lemmatize(w)
    return lemma.lower()


def get_tokenized_lemmas(s):
    """Tokenize ``s`` with ``nltk.word_tokenize`` and normalize each token.

    Returns a list with one ``normalize_word`` result per token, in order.
    """
    lemmas = []
    for token in nltk.word_tokenize(s):
        lemmas.append(normalize_word(token))
    return lemmas


def clean(s):
    """Reduce ``s`` to its word-character runs, space-joined and lower-cased.

    Every maximal run matching ``\\w+`` is kept; everything between runs
    (punctuation, whitespace) collapses to a single space.
    """
    word_runs = re.findall(r'\w+', s, flags=re.UNICODE)
    joined = " ".join(word_runs)
    return joined.lower()


def remove_stopwords(l):
    """Return the tokens of ``l`` that are not scikit-learn English stop words.

    Order and duplicates of the surviving tokens are preserved.
    """
    stop_words = feature_extraction.text.ENGLISH_STOP_WORDS
    kept = []
    for token in l:
        if token not in stop_words:
            kept.append(token)
    return kept

def preprocess(headlines,bodies,apply_cleaning=False):
  """Pair up headlines and bodies and return them as two pandas Series.

  Args:
    headlines: iterable of headline strings.
    bodies: iterable of body strings; iteration stops at the shorter input.
    apply_cleaning: when True, each text is passed through clean(),
      get_tokenized_lemmas() and remove_stopwords() and re-joined with
      single spaces. Defaults to False, in which case the texts are returned
      unmodified.

  Returns:
    (headline_series, body_series), named 'Headline' and 'Body'.

  NOTE(review): this function used to run the cleaning pipeline and then
  discard its result, always returning the raw text. The raw-text output is
  kept as the default because the TF-IDF vocabulary fitted from this output
  feeds a fixed-width model input; presumably the saved checkpoint was trained
  on the same raw-text features - confirm before enabling `apply_cleaning`.
  The discarded (and expensive) lemmatisation pass is no longer executed.
  """
  def _normalise(text):
    # clean -> tokenize/lemmatize -> drop stop words -> back to one string
    tokens = remove_stopwords(get_tokenized_lemmas(clean(text)))
    return " ".join(tokens)

  n_headlines, n_bodies = [], []
  for headline, body in zip(headlines, bodies):
    if apply_cleaning:
      headline, body = _normalise(headline), _normalise(body)
    n_headlines.append(headline)
    n_bodies.append(body)
  n_headlines_df = pd.DataFrame(n_headlines, columns=['Headline'])
  n_bodies_df = pd.DataFrame(n_bodies, columns=['Body'])
  return n_headlines_df['Headline'], n_bodies_df['Body']


In [39]:
# Fit the headline TF-IDF vocabulary on the training set.
# statistical_features() reads `h1` (the fitted headline vectorizer) and
# `org_bs` (the training bodies) from module scope.
df = pd.read_csv('gdrive/MyDrive/Data/train_Set.csv')
org_hs, org_bs = preprocess(df['Headline'], df['Body'])
headline_vectorizer = TfidfVectorizer()
headline_vectorizer.fit(org_hs)
h1 = headline_vectorizer  # fit() returns the estimator itself, so both names share one object
# Fitted body vectorizers, keyed by their `max_features` budget. Fitting on the
# whole training corpus is expensive and its inputs do not change between
# calls, so it is done once and reused. Re-running this cell resets the cache.
_body_vectorizer_cache = {}

def statistical_features(headlines,bodies):
  """Build TF-IDF features for headline/body pairs.

  Headlines are transformed with the module-level fitted vectorizer `h1`.
  Bodies are transformed with a TfidfVectorizer fitted on `org_bs`, whose
  vocabulary is capped so that headline and body columns together total at
  most 10000 (the input width of MyModel's second branch).

  Args:
    headlines: iterable of headline strings.
    bodies: iterable of body strings, aligned with `headlines`.

  Returns:
    Dense np.ndarray with the headline TF-IDF columns followed by the body
    TF-IDF columns, one row per input pair.
  """
  total_width = 10000
  h = h1.transform(headlines)
  body_budget = total_width - h.shape[1]

  body_vectorizer = _body_vectorizer_cache.get(body_budget)
  if body_vectorizer is None:
    # First call for this budget: fit once on the training bodies.
    body_vectorizer = TfidfVectorizer(max_features=body_budget).fit(org_bs)
    _body_vectorizer_cache[body_budget] = body_vectorizer

  b = body_vectorizer.transform(bodies)
  return np.concatenate((h.toarray(), b.toarray()), axis=1)

In [40]:
def external_features(headline,body):
  """Compute hand-crafted overlap and sentiment features per headline/body pair.

  For every pair the feature vector holds, in order:
    * character n-grams, n = 2..16 (45 values): for each n, the number of
      distinct lower-cased headline n-grams that also occur in the full body,
      in its first 255 characters, and in its first 100 characters;
    * word n-grams, n = 2..6 (10 values): for each n, the overlap with the
      full body and with the words of its first 255 characters;
    * 1 value: shared whitespace tokens divided by the size of the token
      union (case-sensitive); 0.0 when neither text has any token;
    * 4 values: absolute difference of the 'neg', 'neu', 'pos' and 'compound'
      polarity scores of the two texts.

  Args:
    headline: iterable of headline strings.
    body: iterable of body strings, aligned with `headline`.

  Returns:
    np.ndarray with one 60-value row per pair.
  """
  # The analyzer does not depend on the pair, so build it once per call.
  sid_obj = SentimentIntensityAnalyzer()

  eng_ext = []
  for sent1, sent2 in zip(headline, body):
    head_lower = sent1.lower()
    body_lower = sent2.lower()
    vec = []

    # character n-grams against the full body and two leading windows
    body_windows = (body_lower, body_lower[:255], body_lower[:100])
    for n in range(2, 17):
      head_grams = set(ngrams(head_lower, n))
      for window in body_windows:
        vec.append(len(head_grams.intersection(ngrams(window, n))))

    # word n-grams against the full body and its first 255 characters
    head_words = head_lower.split()
    body_word_windows = (body_lower.split(), body_lower[:255].split())
    for n in range(2, 7):
      head_grams = set(ngrams(head_words, n))
      for window in body_word_windows:
        vec.append(len(head_grams.intersection(ngrams(window, n))))

    # share of common words between headline and body relative to all words
    head_tokens = set(sent1.split())
    body_tokens = set(sent2.split())
    all_tokens = head_tokens | body_tokens
    if all_tokens:
      vec.append(len(head_tokens & body_tokens) / float(len(all_tokens)))
    else:
      vec.append(0.0)  # both texts empty: avoid dividing by zero

    # sentiment polarity gaps
    d1 = sid_obj.polarity_scores(sent1)
    d2 = sid_obj.polarity_scores(sent2)
    for key in ('neg', 'neu', 'pos', 'compound'):
      vec.append(np.absolute(d1[key] - d2[key]))

    eng_ext.append(vec)

  return np.array(eng_ext)


In [41]:
class MyModel(nn.Module):
    """Three-branch classifier over neural, statistical and external features.

    Each feature group goes through its own small feed-forward branch; the
    branch outputs are concatenated and mapped to a softmax over 4 classes.
    """

    def __init__(self):
        super().__init__()

        # Branch for the 4800-wide neural (sentence-encoder) features.
        self.x_model = nn.Sequential(
            nn.Linear(4800, 500),
            nn.Sigmoid(),
            nn.Dropout(0.2),
            nn.Linear(500, 100),
            nn.Sigmoid(),
        )

        # Branch for the 10000-wide statistical (TF-IDF) features.
        self.y_model = nn.Sequential(
            nn.Linear(10000, 500),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(500, 50),
            nn.ReLU(),
        )

        # Branch for the 60 external (hand-crafted) features.
        self.z_model = nn.Sequential(
            nn.Linear(60, 60),
            nn.ReLU(),
        )

        # Classification head over the concatenated branch outputs.
        self.fc_combined = nn.Sequential(
            nn.Linear(100 + 50 + 60, 4),
            nn.Softmax(dim=1),
        )

    def forward(self, x, y, z):
        """Return per-class probabilities of shape (batch, 4).

        x, y and z are the inputs of the first, second and third branch
        (last dimension 4800, 10000 and 60 respectively).
        """
        branch_outputs = (self.x_model(x), self.y_model(y), self.z_model(z))
        merged = torch.cat(branch_outputs, dim=1)
        return self.fc_combined(merged)

# Module-level model instance used by the rest of the notebook.
model = MyModel()

In [42]:
# Inference only: switch the Dropout layers off so predictions are deterministic.
model.eval()
# map_location='cpu' lets the checkpoint load on a CPU-only runtime even if it
# was saved from a GPU; the model and the feature tensors in check() stay on CPU.
model.load_state_dict(torch.load('gdrive/MyDrive/Data/final_model.pth', map_location='cpu'))

<All keys matched successfully>

In [43]:
def check(headline,body):
  """Predict the stance of `body` towards `headline`.

  Builds the neural, statistical and external feature groups for the pair,
  runs the module-level `model` on them and maps the arg-max class index to
  its label.

  Args:
    headline: headline text.
    body: article body text.

  Returns:
    One of 'agree', 'disagree', 'discuss', 'unrelated'.
  """
  index_to_stance = {0: 'agree', 1: 'disagree', 2: 'discuss', 3: 'unrelated'}

  # The pair is submitted twice so the feature builders receive a batch of two.
  # NOTE(review): presumably a workaround for single-row batches - confirm
  # before reducing this to one row.
  headlines_org = [headline, headline]
  bodies_org = [body, body]

  n_feats = neural_features(headlines_org,bodies_org)
  s_feats = statistical_features(headlines_org,bodies_org)
  e_feats = external_features(headlines_org,bodies_org)
  e_feats = torch.tensor(e_feats, dtype=torch.float32)
  n_feats = torch.tensor(n_feats, dtype=torch.float32)
  s_feats = torch.tensor(s_feats, dtype=torch.float32)

  # Dropout must be disabled for inference; without eval() the same input can
  # yield different predictions from call to call.
  model.eval()
  with torch.no_grad():
      outputs = model(n_feats,s_feats,e_feats)
      _, predicted = torch.max(outputs, 1)

  # Both rows describe the same pair; report the first.
  return index_to_stance[predicted[0].item()]

In [44]:
# Smoke test: run the full pipeline on one headline/body pair.
head = 'ISIL Beheads American Photojournalist in Iraq' # stance noted for this pair: discuss
body = 'James Foley, an American journalist who went missing in Syria more than a year ago, has reportedly been executed by the Islamic State, a militant group formerly known as ISIS.Video and photos purportedly of Foley emerged on Tuesday. A YouTube video -- entitled "A Message to #America (from the #IslamicState)" -- identified a man on his knees as "James Wright Foley," and showed his execution.This is a developing story. Check back here for updates.'
check(head,body)

'discuss'

In [45]:

# !pip install -U gradio
# import gradio as gr

# iface = gr.Interface(
#     fn=check,
#     inputs=["text", "text"],
#     outputs="text",
#     live=True
# )

# iface.launch()
