# Perform Necessary Imports

In [1]:
import pandas as pd
import xgboost
import re
import numpy as np
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.metrics import f1_score


# Read filtered Dataset

In [3]:
totalData = pd.read_csv('../datasets/nela-gt/nela10.csv')

# How it looks like?

In [4]:
totalData

Unnamed: 0,id,date,source,title,content,author,url,published,published_utc,collection_utc,Reliability
0,abcnews--2019-01-30--Who is Stacey Abrams the ...,2019-01-30,abcnews,"Who is Stacey Abrams, the Democrat who will re...","Last November, Stacey Abrams, a Democrat who r...",Cheyenne Haslett,https://abcnews.go.com/Politics/stacey-abrams-...,2019-01-30 19:10:47+00:00,1548893447,1567550233,0
1,abcnews--2019-01-30--Ban government shutdowns ...,2019-01-30,abcnews,Ban government shutdowns? Some Republicans and...,As lawmakers scramble to negotiate a border se...,"Benjamin Siegel,\nTrish Turner\n",https://abcnews.go.com/Politics/ban-government...,2019-01-30 17:29:04+00:00,1548887344,1567550233,0
2,abcnews--2019-01-30--Democrats renew push for ...,2019-01-30,abcnews,Democrats renew push for equal pay for equal work,Ten years after President Barack Obama signed ...,John Parkinson,https://abcnews.go.com/Politics/dems-renew-pus...,2019-01-30 21:35:45+00:00,1548902145,1567550233,0
3,abcnews--2019-01-30--Trump attacks intel chief...,2019-01-30,abcnews,Trump attacks intel chiefs after they contradi...,"President Donald Trump on Wednesday declared ""...",Meridith Mcgraw,https://abcnews.go.com/Politics/trump-attacks-...,2019-01-30 17:03:40+00:00,1548885820,1567550233,0
4,abcnews--2019-01-30--Border security battle be...,2019-01-30,abcnews,Border security battle begins as Congress grap...,Republicans and Democrats on the exclusive com...,"John Parkinson,\nBenjamin Siegel\n",https://abcnews.go.com/Politics/border-securit...,2019-01-30 15:09:47+00:00,1548878987,1567550233,0
...,...,...,...,...,...,...,...,...,...,...,...
57152,zerohedge--2019-11-01--Hong Kong Officer Faces...,2019-11-01,zerohedge,Hong Kong Officer Faces Death Threats After Fi...,It's unclear exactly when it happened (the BBG...,Tyler Durden,http://feedproxy.google.com/~r/zerohedge/feed/...,"Fri, 01 Nov 2019 22:45:00 +0000",1572662700,1572648625,2
57153,"zerohedge--2019-11-01--Enough ""Quid Pro Quo"" G...",2019-11-01,zerohedge,"Enough ""Quid Pro Quo"" Gaslighting!",Horse trading is the oxygen of politics; it is...,Tyler Durden,http://feedproxy.google.com/~r/zerohedge/feed/...,"Fri, 01 Nov 2019 22:25:00 +0000",1572661500,1572648625,2
57154,"zerohedge--2019-11-01--""Born For This? I Don't...",2019-11-01,zerohedge,"""Born For This? I Don't Think So"" - Trump Mock...",Having plunged from over 10% to just 1% in the...,Tyler Durden,http://feedproxy.google.com/~r/zerohedge/feed/...,"Fri, 01 Nov 2019 21:56:35 +0000",1572659795,1572648626,2
57155,zerohedge--2019-11-01--How Iran Used Google To...,2019-11-01,zerohedge,How Iran Used Google To Disrupt 5% Of Global O...,Officials at Saudi Aramco believe that Iran us...,Tyler Durden,http://feedproxy.google.com/~r/zerohedge/feed/...,"Fri, 01 Nov 2019 21:45:00 +0000",1572659100,1572648627,2


# Keep only content, Reliability as columns

In [5]:
#Keep only content, Reliability as columns
totalData = totalData.drop(['id','date','source','title','author','url','published','published_utc','collection_utc'],axis=1)

In [6]:
totalData

Unnamed: 0,content,Reliability
0,"Last November, Stacey Abrams, a Democrat who r...",0
1,As lawmakers scramble to negotiate a border se...,0
2,Ten years after President Barack Obama signed ...,0
3,"President Donald Trump on Wednesday declared ""...",0
4,Republicans and Democrats on the exclusive com...,0
...,...,...
57152,It's unclear exactly when it happened (the BBG...,2
57153,Horse trading is the oxygen of politics; it is...,2
57154,Having plunged from over 10% to just 1% in the...,2
57155,Officials at Saudi Aramco believe that Iran us...,2


In [7]:
#find out unique Reliability labels type
totalData.Reliability.unique()

array([0, 2])

In [8]:
#Drop off mixed reliability(label=1) type
totalData = totalData[totalData.Reliability != 1]

In [9]:
#find out unique Reliability labels type
totalData.Reliability.unique()

array([0, 2])

# Cleaning dataset

In [10]:
totalData = totalData.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = []

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
#    text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text
totalData.content = totalData.content.apply(clean_text)
totalData.content = totalData.content.str.replace('\d+', '')

In [35]:
totalData.sample(10)

Unnamed: 0,content,Reliability
32637,the governments flagship eu settlementstatus s...,0
12968,uc san diego professor shirley mengs laborator...,0
16019,footballer collin martin who came out as gay l...,0
25692,torrential rain forces authorities to open ros...,0
30719,atletico madrid and the spanish authorities ar...,0
32800,at the moment it seems as if you cant go onlin...,0
46948,january am est,2
52730,turkish leader recep tayyip erdogan has once a...,2
43756,hoda muthana was born in the us in as a yearo...,0
3093,there should be a strong conservative influenc...,0


# Perform test train split

In [12]:
# split the dataset into training and validation datasets 
from sklearn.model_selection import train_test_split
# train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train['tweet'], train['label'])
train_x, valid_x, train_y, valid_y = train_test_split(totalData['content'], totalData['Reliability'], test_size=0.20)

# Develop Unigram and Bigram Tf-Idf feature vectors from data

In [13]:
# unigram level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=None)
tfidf_vect.fit(totalData['content'])
xtrain_tfidf_unigram =  tfidf_vect.transform(train_x)
xvalid_tfidf_unigram =  tfidf_vect.transform(valid_x)

# bigram level tf-idf (bigram in this case)
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,2), max_features=None)
tfidf_vect_ngram.fit(totalData['content'])
xtrain_tfidf_bigram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_bigram =  tfidf_vect_ngram.transform(valid_x)


In [23]:
import pickle
with open('/scratch/tathagataraha/nela-gt-tfidf-uni.pkl', 'wb+') as f:
    pickle.dump(tfidf_vect, f)
with open('/scratch/tathagataraha/nela-gt-tfidf-bi.pkl', 'wb+') as f:
    pickle.dump(tfidf_vect_ngram, f)

# Build Model functions

In [30]:
def train_model(classifier, feature_vector_train, label,  feature_vector_valid, valid_y,is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    #print("In Validation Data",metrics.accuracy_score(predictions, valid_y))
    print("f1 score: ",f1_score(valid_y,predictions,average='weighted'))        
    return classifier, metrics.accuracy_score(valid_y,predictions)
    

# Base Line Model Used
## 1. Naive Bayes
## 2. Linear Classifier
## 3. Bagging
## 4. Boosting
## 5. SVM

#  Naive Bayes Model 

In [25]:
# Naive Bayes on Word Level TF IDF Vectors
print("For Unigram Tf-IDF features vectors using Naive Bayes")
model, accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_unigram, train_y, xvalid_tfidf_unigram, valid_y)
print ("Accuracy: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
print("For Bigram Tf-IDF features vectors using Naive Bayes")
model, accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_bigram, train_y, xvalid_tfidf_bigram, valid_y)
print ("Accuracy: ", accuracy)

For Unigram Tf-IDF features vectors using Naive Bayes
f1 score:  0.6913070889811321
Accuracy:  0.7824527641707488
For Bigram Tf-IDF features vectors using Naive Bayes
f1 score:  0.7026079809424893
Accuracy:  0.7874387683694891


# Linear Classifier

In [31]:
# Linear Classifier on Word Level TF IDF Vectors
print("For Unigram TF-Idf feature vectors using Logistic Regression")
model, accuracy = train_model(linear_model.LogisticRegression(max_iter=500), xtrain_tfidf_unigram, train_y, xvalid_tfidf_unigram, valid_y)
print("accuracy: ", accuracy)
with open('/scratch/tathagataraha/nela-gt-linear-uni.pkl', 'wb+') as f:
    pickle.dump(model, f)
# Linear Classifier on Ngram Level TF IDF Vectors
print("For Biigram TF-Idf feature vectors using Logistic Regression")
model, accuracy = train_model(linear_model.LogisticRegression(max_iter=500),  xtrain_tfidf_bigram, train_y, xvalid_tfidf_bigram, valid_y)
print("Accuracy: ", accuracy)
with open('/scratch/tathagataraha/nela-gt-linear-bi.pkl', 'wb+') as f:
    pickle.dump(model, f)

For Unigram TF-Idf feature vectors using Logistic Regression
f1 score:  0.8676557517790607
accuracy:  0.8788488453463961
For Biigram TF-Idf feature vectors using Logistic Regression
f1 score:  0.8122216671036123
Accuracy:  0.8449090272918125


In [34]:
model.predict(tfidf_vect_ngram.transform(['i am a good boy', 'i am a bad boy']))

array([0, 0])

# Bagging Model

In [27]:
# RF on Word Level TF IDF Vectors
print("For Unigram Tf-Idf feature vectors using Random Forest Classifier")
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_unigram, train_y, xvalid_tfidf_unigram, valid_y)
print ("Accuracy: ", accuracy)

# RF on ngram Level TF IDF Vectors
print("For Bigram Tf-Idf feature vectors using Random Forest Classifier")
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_bigram, train_y, xvalid_tfidf_bigram, valid_y)
print ("Accuracy: ", accuracy)

For Unigram Tf-Idf feature vectors using Random Forest Classifier
f1 score:  0.7824865699144518
Accuracy:  0.8279391182645206
For Bigram Tf-Idf feature vectors using Random Forest Classifier


KeyboardInterrupt: 

# Boosting Model

In [49]:
print("For Unigram Tf-Idf feature vectors using Extreme Gradient Boosting")
# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_unigram.tocsc(), train_y, xvalid_tfidf_unigram.tocsc(), valid_y)
print("Accuracy: ", accuracy)

# Extereme Gradient Boosting on ngram Level TF IDF Vectors
print("For Bigram Tf-Idf feature vectors using Extreme Gradient Boosting")
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_bigram.tocsc(), train_y, xvalid_tfidf_bigram.tocsc(), valid_y)
print("Accuracy: ", accuracy)

For Unigram Tf-Idf feature vectors using Extreme Gradient Boosting
f1 score:  0.8889110797087082
Accuracy:  0.8964310706787963
For Bigram Tf-Idf feature vectors using Extreme Gradient Boosting
f1 score:  0.8775083946497313
Accuracy:  0.8889958012596221


# SVM Model

In [50]:
print("For Unigram Tf-Idf feature vectors using SVM")
#SVM Model on Unigram TF-IDF
accuracy = train_model(svm.SVC(), xtrain_tfidf_unigram.tocsc(), train_y, xvalid_tfidf_unigram.tocsc(), valid_y)
print("Accuracy: ", accuracy)

# SVM Model on Bigram TF-IDF
print("For Bigram Tf-Idf feature vectors using SVM")
accuracy = train_model(svm.SVC(), xtrain_tfidf_bigram.tocsc(), train_y, xvalid_tfidf_bigram.tocsc(), valid_y)
print("Accuracy: ", accuracy)

For Unigram Tf-Idf feature vectors using SVM
f1 score:  0.9001589355399741
Accuracy:  0.9065780265920224
For Bigram Tf-Idf feature vectors using SVM
f1 score:  0.84919268919277
Accuracy:  0.8704513645906228


# Using Tf-idf features of Title and the Body of data and predictions using Multi Layer Perceptron

# Read Data again

In [2]:
data = pd.read_csv('../datasets/nela10.csv')

# Keep only title,body and reliability as columns 

In [3]:
data = data.drop(['id','date','source','author','url','published','published_utc','collection_utc'],axis=1)

# Now, how it looks like

In [4]:
data

Unnamed: 0,title,content,Reliability
0,"Who is Stacey Abrams, the Democrat who will re...","Last November, Stacey Abrams, a Democrat who r...",0
1,Ban government shutdowns? Some Republicans and...,As lawmakers scramble to negotiate a border se...,0
2,Democrats renew push for equal pay for equal work,Ten years after President Barack Obama signed ...,0
3,Trump attacks intel chiefs after they contradi...,"President Donald Trump on Wednesday declared ""...",0
4,Border security battle begins as Congress grap...,Republicans and Democrats on the exclusive com...,0
...,...,...,...
57152,Hong Kong Officer Faces Death Threats After Fi...,It's unclear exactly when it happened (the BBG...,2
57153,"Enough ""Quid Pro Quo"" Gaslighting!",Horse trading is the oxygen of politics; it is...,2
57154,"""Born For This? I Don't Think So"" - Trump Mock...",Having plunged from over 10% to just 1% in the...,2
57155,How Iran Used Google To Disrupt 5% Of Global O...,Officials at Saudi Aramco believe that Iran us...,2


# Drop off mixed reliability types

In [5]:
data = data[data.Reliability != 1]

In [6]:
#find out unique Reliability labels type
data.Reliability.unique()

array([0, 2])

# Clean data 

In [8]:
data = data.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = []

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
#    text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text
data.content = data.content.apply(clean_text)
data.content = data.content.str.replace('\d+', '')


In [13]:
data['title']=data['title'].astype(str)

In [14]:
data.title = data.title.apply(clean_text)
data.title = data.title.str.replace('\d+', '')

In [16]:
#Cleaned data
data

Unnamed: 0,title,content,Reliability
0,who is stacey abrams the democrat who will res...,last november stacey abrams a democrat who ran...,0
1,ban government shutdowns some republicans and ...,as lawmakers scramble to negotiate a border se...,0
2,democrats renew push for equal pay for equal work,ten years after president barack obama signed ...,0
3,trump attacks intel chiefs after they contradi...,president donald trump on wednesday declared t...,0
4,border security battle begins as congress grap...,republicans and democrats on the exclusive com...,0
...,...,...,...
57152,hong kong officer faces death threats after fi...,its unclear exactly when it happened the bbg s...,2
57153,enough quid pro quo gaslighting,horse trading is the oxygen of politics it is ...,2
57154,born for this i dont think so trump mocks beto...,having plunged from over to just in the predic...,2
57155,how iran used google to disrupt of global oil...,officials at saudi aramco believe that iran us...,2


# Developing Tf-Idf features for title and content

In [19]:
# unigram level tf-idf
tfidf_vect1 = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=None)
tfidf_vect1.fit(data['content'])
tfidf_vect2 = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=None)
tfidf_vect2.fit(data['title'])
content_tfidf_unigram =  tfidf_vect1.transform(data['content'])
title_tfidf_unigram = tfidf_vect2.transform(data['title'])

# bigram level tf-idf (bigram in this case)
tfidf_vect_ngram1 = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,2), max_features=None)
tfidf_vect_ngram1.fit(data['content'])
tfidf_vect_ngram2 = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,2), max_features=None)
tfidf_vect_ngram2.fit(data['title'])
content_tfidf_bigram =  tfidf_vect_ngram1.transform(data['content'])
title_tfidf_bigram = tfidf_vect_ngram2.transform(data['title'])

In [22]:
#Check dimenstions
print(content_tfidf_unigram.shape)
print(title_tfidf_unigram.shape)
print(content_tfidf_bigram.shape)
print(title_tfidf_bigram.shape)

(57157, 737804)
(57157, 43040)
(57157, 6115724)
(57157, 311055)


In [25]:
fullData_unigram = np.empty((57157,780844))
fullData_bigram = np.empty((57157,6426779))

MemoryError: Unable to allocate 333. GiB for an array with shape (57157, 780844) and data type float64