In [None]:
# Mount Google Drive at /content/gdrive; later cells read and write files
# through 'gdrive/MyDrive/...' paths.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
import re
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import torch
from torch.autograd import Variable
import sys
!pip install skipthoughts
!pip install vaderSentiment
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from skipthoughts import BiSkip



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the training article bodies and the headline/stance pairs from Drive,
# printing the first rows of each table as a quick sanity check.
training_body = pd.read_csv('gdrive/MyDrive/fnc_data/train_bodies.csv')
print(training_body.head())
training_stance = pd.read_csv('gdrive/MyDrive/fnc_data/train_stances.csv')
print(training_stance.head())

   Body ID                                        articleBody
0        0  A small meteorite crashed into a wooded area i...
1        4  Last week we hinted at what was to come as Ebo...
2        5  (NEWSER) – Wonder how long a Quarter Pounder w...
3        6  Posting photos of a gun-toting child online, I...
4        7  At least 25 suspected Boko Haram insurgents we...
                                            Headline  Body ID     Stance
0  Police find mass graves with at least '15 bodi...      712  unrelated
1  Hundreds of Palestinians flee floods in Gaza a...      158      agree
2  Christian Bale passes on role of Steve Jobs, a...      137  unrelated
3  HBO and Apple in Talks for $15/Month Apple TV ...     1034  unrelated
4  Spider burrowed through tourist's stomach and ...     1923   disagree


In [None]:
def id2body(body_id,body_data):
    """Look up the article text that belongs to a body ID.

    Args:
        body_id: Value to search for in the ``'Body ID'`` column.
        body_data: DataFrame with ``'Body ID'`` and ``'articleBody'`` columns.

    Returns:
        The ``'articleBody'`` value of the first row whose ``'Body ID'``
        equals ``body_id``, or ``None`` when no row matches.
    """
    # A boolean mask selects rows by position, so the lookup works for any
    # DataFrame index (label-based ``[i]`` access needs a default 0..n-1 index)
    # and avoids a Python-level loop over every row.
    matches = body_data.loc[body_data['Body ID'] == body_id, 'articleBody']
    if matches.empty:
        return None
    return matches.iloc[0]

def preprocess(s):
    """Delete newlines and the apostrophe fragments ``'s`` / ``'t`` from ``s``.

    Each fragment is removed outright (nothing is inserted in its place) and
    the resulting string is returned.
    """
    for fragment in ("\n", "\'s", "\'t"):
        s = s.replace(fragment, "")
    return s

In [None]:
# Build parallel per-row lists for the training pairs: the cleaned headline,
# the cleaned body (looked up through its Body ID), and the stance label.
headline = []
body = []
stance = []
for raw_headline, body_id, label in zip(training_stance['Headline'],
                                        training_stance['Body ID'],
                                        training_stance['Stance']):
    headline.append(preprocess(raw_headline))
    body.append(preprocess(id2body(body_id, training_body)))
    stance.append(label)

# Whitespace-delimited word counts of the cleaned texts.
headline_wordcount = [len(text.split()) for text in headline]
body_wordcount = [len(text.split()) for text in body]

In [None]:
# Assemble the processed training pairs into one table and persist it.
train_set = pd.DataFrame()
train_set['Headline'] = headline
train_set['Body'] = body
train_set['Stance'] = stance
train_set['Headline Word Count'] = headline_wordcount
train_set['Body Word Count'] = body_wordcount
# index=False: otherwise pandas writes the row index as an extra unnamed
# column, which comes back as "Unnamed: 0" whenever the CSV is re-read.
train_set.to_csv("gdrive/MyDrive/Data/train_Set.csv", index=False)

In [None]:
# Read the saved training table back from Drive and preview its first rows.
checking = pd.read_csv("gdrive/MyDrive/Data/train_Set.csv")
checking.head()

Unnamed: 0.1,Unnamed: 0,Headline,Body,Stance,Headline Word Count,Body Word Count
0,0,Police find mass graves with at least '15 bodi...,Danny Boyle is directing the untitled filmSeth...,unrelated,19,189
1,1,Hundreds of Palestinians flee floods in Gaza a...,Hundreds of Palestinians were evacuated from t...,agree,11,417
2,2,"Christian Bale passes on role of Steve Jobs, a...",30-year-old Moscow resident was hospitalized w...,unrelated,16,189
3,3,HBO and Apple in Talks for $15/Month Apple TV ...,(Reuters) - A Canadian soldier was shot at the...,unrelated,14,78
4,4,Spider burrowed through tourist stomach and up...,"Fear not arachnophobes, the story of Bunbury ""...",disagree,10,588


In [None]:
# Load the competition test bodies and headline/stance pairs from Drive and
# print the (rows, columns) shape of each table.
testing_body = pd.read_csv('gdrive/MyDrive/fnc_data/competition_test_bodies.csv')
print(testing_body.shape)
testing_stance = pd.read_csv('gdrive/MyDrive/fnc_data/competition_test_stances.csv')
print(testing_stance.shape)

(904, 2)
(25413, 3)


In [None]:
# Build parallel per-row lists for the test pairs: the cleaned headline,
# the cleaned body (looked up through its Body ID), and the stance label.
headline = []
body = []
stance = []
for raw_headline, body_id, label in zip(testing_stance['Headline'],
                                        testing_stance['Body ID'],
                                        testing_stance['Stance']):
    headline.append(preprocess(raw_headline))
    body.append(preprocess(id2body(body_id, testing_body)))
    stance.append(label)

# Whitespace-delimited word counts of the cleaned texts.
headline_wordcount = [len(text.split()) for text in headline]
body_wordcount = [len(text.split()) for text in body]

In [None]:
# Assemble the processed test pairs into one table and persist it.
test_set = pd.DataFrame()
test_set['Headline'] = headline
test_set['Body'] = body
test_set['Stance'] = stance
test_set['Headline Word Count'] = headline_wordcount
test_set['Body Word Count'] = body_wordcount
# index=False: otherwise pandas writes the row index as an extra unnamed
# column, which comes back as "Unnamed: 0" whenever the CSV is re-read.
test_set.to_csv("gdrive/MyDrive/Data/test_Set.csv", index=False)

In [None]:
# Single WordNet lemmatizer instance, created once and reused by normalize_word.
_wnl = nltk.WordNetLemmatizer()


def normalize_word(w):
    """Lemmatize one token with the shared lemmatizer, then lowercase the result."""
    lemma = _wnl.lemmatize(w)
    return lemma.lower()


def get_tokenized_lemmas(s):
    """Tokenize ``s`` with ``nltk.word_tokenize`` and return each token normalized."""
    lemmas = []
    for token in nltk.word_tokenize(s):
        lemmas.append(normalize_word(token))
    return lemmas


def clean(s):
    """Keep only the ``\\w+`` runs of ``s``, join them with single spaces, and lowercase."""
    word_runs = re.findall(r'\w+', s, flags=re.UNICODE)
    joined = " ".join(word_runs)
    return joined.lower()


def remove_stopwords(l):
    """Return the tokens of ``l``, in order, that are not in ``ENGLISH_STOP_WORDS``."""
    kept = []
    for token in l:
        if token in feature_extraction.text.ENGLISH_STOP_WORDS:
            continue
        kept.append(token)
    return kept

def preprocess(headlines,bodies):
  """Normalize paired headline/body texts before vectorization.

  Every text is passed through ``clean``, ``get_tokenized_lemmas`` and
  ``remove_stopwords``; the surviving tokens are re-joined with single spaces
  so the result is still a plain string.

  Note: this definition shares its name with the one-argument
  ``preprocess(s)`` defined earlier in the notebook and replaces it in the
  namespace once this cell has run.

  Args:
    headlines: Iterable of headline strings.
    bodies: Iterable of body strings, paired positionally with ``headlines``.

  Returns:
    Tuple ``(headline_series, body_series)`` of pandas Series named
    ``'Headline'`` and ``'Body'`` that hold the normalized strings.
  """
  def _normalize(text):
    # clean -> tokenize/lemmatize -> drop stop words -> back to one string
    return " ".join(remove_stopwords(get_tokenized_lemmas(clean(text))))

  n_headlines, n_bodies =[],[]
  # Give tqdm a total when the input is sized so it can show real progress.
  total = len(headlines) if hasattr(headlines, '__len__') else None
  for headline, body in tqdm(zip(headlines, bodies), total=total):
    # Store the processed text. Appending the raw inputs here would discard
    # all of the cleaning above and return the texts unchanged.
    n_headlines.append(_normalize(headline))
    n_bodies.append(_normalize(body))
  n_headlines_df=pd.DataFrame(n_headlines,columns=['Headline'])
  n_bodies_df=pd.DataFrame(n_bodies,columns=['Body'])
  return n_headlines_df['Headline'], n_bodies_df['Body']

In [None]:
def statistical_features(dataset_loc , train=True, prev_head_trained=None, prev_body_trained=None, total_features=10000):
  """Build dense TF-IDF features for the headline/body pairs of a CSV file.

  Args:
    dataset_loc: Path of a CSV file with ``'Headline'`` and ``'Body'`` columns.
    train: When truthy, fit new vectorizers on this dataset. When falsy,
      reuse ``prev_head_trained`` / ``prev_body_trained``.
    prev_head_trained: Fitted headline vectorizer (required when ``train`` is
      falsy).
    prev_body_trained: Fitted body vectorizer (required when ``train`` is
      falsy).
    total_features: Upper bound on the combined feature width when fitting;
      the body vectorizer is capped at ``total_features`` minus the headline
      vocabulary size.

  Returns:
    Tuple ``(features, head_vectorizer, body_vectorizer)``. ``features`` is
    the headline TF-IDF matrix followed column-wise by the body TF-IDF
    matrix, as a dense array. The vectorizers are the ones actually used, so
    in transform-only mode the supplied vectorizers are returned and the
    caller can safely rebind them.

  Raises:
    ValueError: If ``train`` is falsy and a previous vectorizer is missing,
      or if the headline vocabulary leaves no room for body features.
  """
  df = pd.read_csv(dataset_loc)

  # Tokenize, lemmatize and strip stop words via the notebook's preprocess().
  headlines, bodies = preprocess(df['Headline'], df['Body'])

  if train:
    head_trained = TfidfVectorizer()
    h = head_trained.fit_transform(headlines)

    # Whatever the headline vocabulary does not use is the body's budget.
    body_budget = total_features - h.shape[1]
    if body_budget <= 0:
      raise ValueError(
          "headline vocabulary (%d terms) leaves no room for body features "
          "within total_features=%d" % (h.shape[1], total_features))
    body_trained = TfidfVectorizer(max_features=body_budget)
    b = body_trained.fit_transform(bodies)
  else:
    if prev_head_trained is None or prev_body_trained is None:
      raise ValueError(
          "prev_head_trained and prev_body_trained are required when train is false")
    # Hand the same fitted vectorizers back instead of None.
    head_trained, body_trained = prev_head_trained, prev_body_trained
    h = head_trained.transform(headlines)
    b = body_trained.transform(bodies)

  # Headline columns first, then body columns.
  features = np.concatenate((h.toarray(), b.toarray()), axis=1)

  return features, head_trained, body_trained

In [None]:
# Fit the headline/body vectorizers on the training set and keep them for the test set.
statistical_features_train, head_vect, body_vect = statistical_features(
    'gdrive/MyDrive/Data/train_Set.csv', train=True)

49972it [03:18, 251.72it/s]
49972it [03:02, 273.71it/s]


In [None]:
# Transform the test set with the vectorizers fitted on the training set.
# Only the feature matrix is kept: head_vect/body_vect are not rebound here,
# so the fitted training vectorizers stay available no matter what the call
# returns in those two positions.
statistical_features_test = statistical_features('gdrive/MyDrive/Data/test_Set.csv',0,head_vect, body_vect)[0]

25413it [01:55, 220.30it/s]
49972it [03:32, 235.45it/s]


In [None]:
# Spot check: number of non-zero entries in row 500 of each feature matrix.
print(np.count_nonzero(statistical_features_train[500]))
print(np.count_nonzero(statistical_features_test[500]))

314
179


In [None]:
# Persist both statistical feature matrices to Drive as .npy files.
np.save(arr=statistical_features_train,file='gdrive/MyDrive/Data/train_statistical_features.npy')
np.save(arr=statistical_features_test,file='gdrive/MyDrive/Data/test_statistical_features.npy')

In [None]:
# Report the (rows, columns) shape of each statistical feature matrix.
print(statistical_features_train.shape)
print(statistical_features_test.shape)

(49972, 10000)
(25413, 10000)


In [None]:
def _ngram_set(sequence, n):
    """Return the set of n-grams of ``sequence``.

    ``ngrams`` hands back a one-shot iterator, so the result is materialized
    into a set that can be intersected any number of times.
    """
    return set(ngrams(sequence, n))


def external_features(dataset_loc):
    """Compute hand-crafted overlap and sentiment features per headline/body pair.

    For every row of the CSV the feature vector contains, in this order:

    * character n-grams, n = 2..16: for each n, the number of distinct
      n-grams the lowercased headline shares with (a) the whole lowercased
      body, (b) its first 255 characters, (c) its first 100 characters;
    * word n-grams, n = 2..6: for each n, the number of distinct n-grams
      shared with (a) the whole lowercased body and (b) the words of its
      first 255 characters;
    * the Jaccard similarity of the whitespace-split (case-sensitive) word
      sets, defined as 0.0 when both texts have no words;
    * the absolute headline/body difference of the VADER ``neg``, ``neu``,
      ``pos`` and ``compound`` scores.

    Args:
        dataset_loc: Path of a CSV file with ``'Headline'`` and ``'Body'``
            columns.

    Returns:
        ``numpy`` array with one row of 60 features per dataset row.
    """
    english_dataset = pd.read_csv(dataset_loc)

    headline = english_dataset['Headline']
    body = english_dataset['Body']

    # One analyzer for the whole run instead of one per row.
    sid_obj = SentimentIntensityAnalyzer()

    eng_ext = []  # one feature vector per headline/body pair

    # tqdm progress bar instead of printing a counter line for every row.
    for sent1, sent2 in tqdm(zip(headline, body), total=len(headline)):
        vec = []

        head_lower = sent1.lower()
        body_lower = sent2.lower()
        body_255 = body_lower[:255]
        body_100 = body_lower[:100]

        # Character n-grams. The headline set is built once per n and reused
        # for all three comparisons; re-reading an exhausted n-gram iterator
        # would make the truncated-body overlaps always come out as 0.
        for n in range(2, 17):
            head_grams = _ngram_set(head_lower, n)
            vec.append(len(head_grams & _ngram_set(body_lower, n)))
            vec.append(len(head_grams & _ngram_set(body_255, n)))
            vec.append(len(head_grams & _ngram_set(body_100, n)))

        # Word n-grams, same reuse of the headline set.
        head_words = head_lower.split()
        body_words = body_lower.split()
        body_255_words = body_255.split()
        for n in range(2, 7):
            head_grams = _ngram_set(head_words, n)
            vec.append(len(head_grams & _ngram_set(body_words, n)))
            vec.append(len(head_grams & _ngram_set(body_255_words, n)))

        # Jaccard similarity on the original-case word sets.
        s1 = set(sent1.split())
        s2 = set(sent2.split())
        union = s1 | s2
        vec.append(len(s1 & s2) / float(len(union)) if union else 0.0)

        # Sentiment: absolute difference of each polarity score.
        d1 = sid_obj.polarity_scores(sent1)
        d2 = sid_obj.polarity_scores(sent2)
        for key in ('neg', 'neu', 'pos', 'compound'):
            vec.append(np.absolute(d1[key] - d2[key]))

        eng_ext.append(vec)

    eng_ext = np.array(eng_ext)
    return eng_ext

# Example usage:
# features = external_features('your_dataset.csv')


In [None]:
# Compute the n-gram overlap / Jaccard / sentiment features for the training pairs.
train_external_features = external_features('gdrive/MyDrive/Data/train_Set.csv')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
44972
44973
44974
44975
44976
44977
44978
44979
44980
44981
44982
44983
44984
44985
44986
44987
44988
44989
44990
44991
44992
44993
44994
44995
44996
44997
44998
44999
45000
45001
45002
45003
45004
45005
45006
45007
45008
45009
45010
45011
45012
45013
45014
45015
45016
45017
45018
45019
45020
45021
45022
45023
45024
45025
45026
45027
45028
45029
45030
45031
45032
45033
45034
45035
45036
45037
45038
45039
45040
45041
45042
45043
45044
45045
45046
45047
45048
45049
45050
45051
45052
45053
45054
45055
45056
45057
45058
45059
45060
45061
45062
45063
45064
45065
45066
45067
45068
45069
45070
45071
45072
45073
45074
45075
45076
45077
45078
45079
45080
45081
45082
45083
45084
45085
45086
45087
45088
45089
45090
45091
45092
45093
45094
45095
45096
45097
45098
45099
45100
45101
45102
45103
45104
45105
45106
45107
45108
45109
45110
45111
45112
45113
45114
45115
45116
45117
45118
45119
45120
45121
45122
45123
45124
45125
45126
45127

In [None]:
# Compute the n-gram overlap / Jaccard / sentiment features for the test pairs.
test_external_features = external_features('gdrive/MyDrive/Data/test_Set.csv')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
20413
20414
20415
20416
20417
20418
20419
20420
20421
20422
20423
20424
20425
20426
20427
20428
20429
20430
20431
20432
20433
20434
20435
20436
20437
20438
20439
20440
20441
20442
20443
20444
20445
20446
20447
20448
20449
20450
20451
20452
20453
20454
20455
20456
20457
20458
20459
20460
20461
20462
20463
20464
20465
20466
20467
20468
20469
20470
20471
20472
20473
20474
20475
20476
20477
20478
20479
20480
20481
20482
20483
20484
20485
20486
20487
20488
20489
20490
20491
20492
20493
20494
20495
20496
20497
20498
20499
20500
20501
20502
20503
20504
20505
20506
20507
20508
20509
20510
20511
20512
20513
20514
20515
20516
20517
20518
20519
20520
20521
20522
20523
20524
20525
20526
20527
20528
20529
20530
20531
20532
20533
20534
20535
20536
20537
20538
20539
20540
20541
20542
20543
20544
20545
20546
20547
20548
20549
20550
20551
20552
20553
20554
20555
20556
20557
20558
20559
20560
20561
20562
20563
20564
20565
20566
20567
20568

In [None]:
# Persist both external feature matrices to Drive as .npy files.
np.save(arr=train_external_features,file='gdrive/MyDrive/Data/train_external_features.npy')
np.save(arr=test_external_features,file='gdrive/MyDrive/Data/test_external_features.npy')

In [None]:
from skipthoughts import BiSkip

# Read the vocabulary file; the context manager closes the handle once the
# contents are in memory.
with open("gdrive/MyDrive/fnc_data/dictionary.txt", "r") as my_file:
    data = my_file.read()
# Newlines become spaces, then the text is split on single spaces.
data_into_list_temp = data.replace('\n', ' ').split(" ")
# Deduplicate, then sort so the vocabulary order (and every word id derived
# from it) is the same on each run; plain set iteration order for strings is
# not stable across interpreter sessions.
data_into_list = sorted(set(data_into_list_temp))

dir_st = 'data/skip-thoughts'
vocab = data_into_list
# Build the skip-thoughts encoder over this vocabulary.
biskip = BiSkip(dir_st, vocab)

In [None]:
# Map each vocabulary word to its 1-based position in `vocab`.
my_dict = {word: position for position, word in enumerate(vocab, start=1)}
my_dict['ok']

155233

In [None]:
import pickle

In [None]:
# Save the word->id mapping and the encoder together so they stay consistent.
# Context managers guarantee each file is flushed and closed after writing.
with open('gdrive/MyDrive/Data/my_dict.pkl', 'wb') as dict_file:
    pickle.dump(my_dict, dict_file)
with open('gdrive/MyDrive/Data/biskip.pkl', 'wb') as model_file:
    pickle.dump(biskip, model_file)

In [None]:

# Restore the word->id mapping and the encoder saved by this notebook.
# NOTE: pickle.load executes code embedded in the file, so only load pickles
# this notebook wrote itself.
with open('gdrive/MyDrive/Data/my_dict.pkl', 'rb') as dict_file:
    my_dict = pickle.load(dict_file)
with open('gdrive/MyDrive/Data/biskip.pkl', 'rb') as model_file:
    biskip = pickle.load(model_file)

In [None]:
def _texts_to_id_matrix(texts, word_to_id):
  """Turn whitespace-tokenized texts into a zero-filled int64 matrix of word ids.

  Each text is split on whitespace and an empty-string token is appended.
  Row ``r``, column ``c`` holds the id of the ``c``-th token of text ``r``.
  Tokens missing from ``word_to_id`` and unused trailing cells are 0.
  """
  token_rows = []
  for text in texts:
    tokens = text.split()
    tokens.append('')
    token_rows.append(tokens)

  # Wide enough for the longest token row; at least one column.
  width = max((len(tokens) for tokens in token_rows), default=1)
  ids = np.zeros((len(token_rows), width), dtype=np.int64)
  for row, tokens in enumerate(token_rows):
    for col, word in enumerate(tokens):
      # Explicit default instead of swallowing KeyError: unknown words stay 0.
      ids[row, col] = word_to_id.get(word, 0)
  return ids


def _encode_in_batches(id_matrix, encoder, batch_size, dim=2400):
  """Run ``encoder`` over ``id_matrix`` in row batches and stack the outputs."""
  encodings = np.zeros((len(id_matrix), dim))
  with torch.no_grad():  # feature extraction only, no gradients needed
    for start in tqdm(range(0, len(id_matrix), batch_size)):
      stop = start + batch_size
      # Slicing past the end just yields a shorter final batch, so no
      # separate pass over the remainder is needed.
      batch = torch.from_numpy(id_matrix[start:stop])
      encodings[start:stop] = encoder(batch).detach().numpy()
  return encodings


def neural_features(dataset_location, batch_size=50):
  """Build skip-thought pair features for the headline/body rows of a CSV file.

  Headlines and bodies are mapped to word-id matrices with the module-level
  ``my_dict`` and encoded with the module-level ``biskip`` model into
  2400-dimensional vectors ``h`` and ``b``. The feature row for a pair is the
  element-wise product ``h * b`` followed by ``|h - b|``.

  NOTE(review): id 0 is used for both unknown words and trailing fill. This
  presumably matches the padding id expected by ``biskip``; confirm against
  the skipthoughts package. Also confirm whether ``biskip`` should be put in
  eval mode before encoding.

  Args:
    dataset_location: Path of a CSV file with ``'Headline'`` and ``'Body'``
      columns.
    batch_size: Number of rows sent through the encoder at once.

  Returns:
    Array with one row per dataset row and 4800 columns.
  """
  dataset = pd.read_csv(dataset_location)

  # Allocate the id matrices here. Without this the encoding step depends on
  # arrays left over in the kernel and fails with NameError on a fresh run.
  headlines_to_ids = _texts_to_id_matrix(dataset['Headline'], my_dict)
  bodies_to_ids = _texts_to_id_matrix(dataset['Body'], my_dict)

  headlines_encodings = _encode_in_batches(headlines_to_ids, biskip, batch_size)
  bodies_encodings = _encode_in_batches(bodies_to_ids, biskip, batch_size)

  # Vectorized over all rows at once.
  feat1 = np.multiply(headlines_encodings, bodies_encodings)
  feat2 = np.absolute(headlines_encodings - bodies_encodings)

  final_neural_features = np.concatenate((feat1,feat2),axis = 1)
  return final_neural_features

In [None]:
# Encode the training headline/body pairs into skip-thought pair features.
neural_features_train = neural_features('gdrive/MyDrive/Data/train_Set.csv')

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
4150
4200
4250
4300
4350
4400
4450
4500
4550
4600
4650
4700
4750
4800
4850
4900
4950
5000
5050
5100
5150
5200
5250
5300
5350
5400
5450
5500
5550
5600
5650
5700
5750
5800
5850
5900
5950
6000
6050
6100
6150
6200
6250
6300
6350
6400
6450
6500
6550
6600
6650
6700
6750
6800
6850
6900
6950
7000
7050
7100
7150
7200
7250
7300
7350
7400
7450
7500
7550
7600
7650
7700
7750
7800
7850
7900
7950
8000
8050
8100
8150
8200
8250
8300
8350
8400
8450
8500
8550
8600
8650
8700
8750
8800
8850
8900
8950
9000
9050
9100
9150
9200
9250
9300
9350
9400
9450
9500
9550
9600
9650
9700
9750
9800
9850
9900
9950
10000
10050
10100
10150

In [None]:
# Persist the training neural feature matrix to Drive as a .npy file.
np.save(arr=neural_features_train,file='gdrive/MyDrive/Data/train_neural_features.npy')

In [None]:
# Encode the test headline/body pairs into skip-thought pair features.
neural_features_test = neural_features('gdrive/MyDrive/Data/test_Set.csv')

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
4150
4200
4250
4300
4350
4400
4450
4500
4550
4600
4650
4700
4750
4800
4850
4900
4950
5000
5050
5100
5150
5200
5250
5300
5350
5400
5450
5500
5550
5600
5650
5700
5750
5800
5850
5900
5950
6000
6050
6100
6150
6200
6250
6300
6350
6400
6450
6500
6550
6600
6650
6700
6750
6800
6850
6900
6950
7000
7050
7100
7150
7200
7250
7300
7350
7400
7450
7500
7550
7600
7650
7700
7750
7800
7850
7900
7950
8000
8050
8100
8150
8200
8250
8300
8350
8400
8450
8500
8550
8600
8650
8700
8750
8800
8850
8900
8950
9000
9050
9100
9150
9200
9250
9300
9350
9400
9450
9500
9550
9600
9650
9700
9750
9800
9850
9900
9950
10000
10050
10100
10150

In [None]:
# Persist the test neural feature matrix to Drive, then print entries
# 100-149 of row 25402 as a spot check.
np.save(arr=neural_features_test,file='gdrive/MyDrive/Data/test_neural_features.npy')
print(neural_features_test[25402][100:150])