In [61]:
import pandas as pd
import numpy as np
import time
import eli5

np.random.seed(1234)

In [62]:
news_train_df = pd.read_csv('Preprocessed_Train.csv')
news_train_df.head(2)

Unnamed: 0,Text,Label
0,Report War Looms Hundreds American Troops CIA ...,0
1,I walked preview artist Pedro Reyes Creative T...,1


In [63]:
news_valid_df = pd.read_csv('Preprocessed_Valid.csv')
news_valid_df.head(2)

Unnamed: 0,Text,Label
0,Monsantos Roundup Herbicide Toxic To Life In P...,0
1,Obama Support Gives Cop Serious Attitude Immed...,0


In [64]:
news_test_df = pd.read_csv('Preprocessed_Test.csv')
news_test_df.head(2)

Unnamed: 0,Text,Label
0,The son Louisiana man whose father shot killed...,1
1,Copies William Shakespeare first four books du...,1


In [65]:
news_train, label_train = news_train_df['Text'], news_train_df['Label']
news_valid, label_valid = news_valid_df['Text'], news_valid_df['Label']
news_test, label_test = news_test_df['Text'], news_test_df['Label']

In [66]:
print(len(news_train), len(label_train))
print(len(news_valid), len(label_valid))
print(len(news_test), len(label_test))

63616 63616
15905 15905
7011 7011


In [67]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.pipeline import Pipeline

In [8]:
#Naive-Bayes classification
pipe1 = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('model', MultinomialNB())])

In [9]:
start_time = time.time()

print("Fitting started...")

model_nb = pipe1.fit(news_train, label_train)

print("Fitting took")
print("--- %s seconds ---" % (time.time() - start_time))

Fitting started...
Fitting took
--- 21.142714023590088 seconds ---


In [10]:
nb_pred = model_nb.predict(news_valid)
target = label_valid

print("Accuracy of Naive Bayes Classifier: {}%".format(round(accuracy_score(target, nb_pred)*100,2)))
print("\nConfusion Matrix of Naive Bayes Classifier:\n")
print(confusion_matrix(target, nb_pred))
print("\nClassification Report of Naive Bayes Classifier:\n")
print(classification_report(target, nb_pred, digits=4))

Accuracy of Naive Bayes Classifier: 90.75%

Confusion Matrix of Naive Bayes Classifier:

[[6611 1223]
 [ 249 7822]]

Classification Report of Naive Bayes Classifier:

              precision    recall  f1-score   support

           0     0.9637    0.8439    0.8998      7834
           1     0.8648    0.9691    0.9140      8071

    accuracy                         0.9075     15905
   macro avg     0.9142    0.9065    0.9069     15905
weighted avg     0.9135    0.9075    0.9070     15905



In [11]:
nb_pred = model_nb.predict(news_test)

target = label_test

print("Accuracy of Naive Bayes Classifier: {}%".format(round(accuracy_score(target, nb_pred)*100,2)))
print("\nConfusion Matrix of Naive Bayes Classifier:\n")
print(confusion_matrix(target, nb_pred))
print("\nClassification Report of Naive Bayes Classifier:\n")
print(classification_report(target, nb_pred, digits=4))

Accuracy of Naive Bayes Classifier: 88.39%

Confusion Matrix of Naive Bayes Classifier:

[[1855  387]
 [ 427 4342]]

Classification Report of Naive Bayes Classifier:

              precision    recall  f1-score   support

           0     0.8129    0.8274    0.8201      2242
           1     0.9182    0.9105    0.9143      4769

    accuracy                         0.8839      7011
   macro avg     0.8655    0.8689    0.8672      7011
weighted avg     0.8845    0.8839    0.8842      7011



In [68]:
print(pipe1)
clf = pipe1['model']
vec = pipe1['vect']

eli5.show_weights(clf, vec=vec, top=20)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('model', MultinomialNB())])


In [12]:
#Support Vector classification
pipe2 = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('model', LinearSVC())])

In [13]:
print("Fitting started...")

model_svc = pipe2.fit(news_train, label_train)

print("Fitting took")
print("--- %s seconds ---" % (time.time() - start_time))

Fitting started...
Fitting took
--- 50.27299618721008 seconds ---


In [14]:
svc_pred = model_svc.predict(news_valid)
target = label_valid

print("Accuracy of SVM Classifier: {}%".format(round(accuracy_score(target, svc_pred)*100,2)))
print("\nConfusion Matrix of SVM Classifier:\n")
print(confusion_matrix(target, svc_pred))
print("\nClassification Report of SVM Classifier:\n")
print(classification_report(target, svc_pred, digits=4))

Accuracy of SVM Classifier: 97.37%

Confusion Matrix of SVM Classifier:

[[7640  194]
 [ 224 7847]]

Classification Report of SVM Classifier:

              precision    recall  f1-score   support

           0     0.9715    0.9752    0.9734      7834
           1     0.9759    0.9722    0.9741      8071

    accuracy                         0.9737     15905
   macro avg     0.9737    0.9737    0.9737     15905
weighted avg     0.9737    0.9737    0.9737     15905



In [15]:
svc_pred = model_svc.predict(news_test)
target = label_test

print("Accuracy of SVM Classifier: {}%".format(round(accuracy_score(target, svc_pred)*100,2)))
print("\nConfusion Matrix of SVM Classifier:\n")
print(confusion_matrix(target, svc_pred))
print("\nClassification Report of SVM Classifier:\n")
print(classification_report(target, svc_pred, digits=4))

Accuracy of SVM Classifier: 90.83%

Confusion Matrix of SVM Classifier:

[[1928  314]
 [ 329 4440]]

Classification Report of SVM Classifier:

              precision    recall  f1-score   support

           0     0.8542    0.8599    0.8571      2242
           1     0.9340    0.9310    0.9325      4769

    accuracy                         0.9083      7011
   macro avg     0.8941    0.8955    0.8948      7011
weighted avg     0.9085    0.9083    0.9084      7011



In [125]:
print(pipe2)
clf_svc = pipe2['model']
vec_svc = pipe2['vect']

eli5.show_weights(clf_svc, vec=vec_svc, top=30)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('model', LinearSVC())])


Weight?,Feature
+5.624,but
+4.911,related
+4.317,it
+4.204,photograph
+4.076,editor
+3.222,taiwan
+3.202,mr
… 82116 more positive …,… 82116 more positive …
… 72355 more negative …,… 72355 more negative …
-2.920,entire


In [70]:
news_test_df['predicted_label_svc'] = svc_pred

misclassified_examples_true_as_fake = news_test_df[(news_test_df['Label']!=news_test_df['predicted_label_svc'])&(news_test_df['Label']==1)]
misclassified_examples_fake_as_true = news_test_df[(news_test_df['Label']!=news_test_df['predicted_label_svc'])&(news_test_df['Label']==0)]

In [71]:
misclassified_examples_true_as_fake.head(3)

Unnamed: 0,Text,Label,predicted_label_svc
2,Debt 20 000 Source College credit cards Estima...,1,0
93,After NBC coverage Olympic closing ceremony Ri...,1,0
151,The atmosphere aura surrounding Mexico quadren...,1,0


In [72]:
misclassified_examples_fake_as_true.head(3)

Unnamed: 0,Text,Label,predicted_label_svc
4772,In aftermath Scotlands vote referendum becomin...,0,1
4785,Fresh 2012 Republican National Convention Rep....,0,1
4789,As preparations get way first Republican Presi...,0,1


In [16]:
#Logistic regression classification
pipe3 = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('model', LogisticRegression())])

In [17]:
print("Fitting started...")

model_lr = pipe3.fit(news_train, label_train)

print("Fitting took")
print("--- %s seconds ---" % (time.time() - start_time))

Fitting started...
Fitting took
--- 87.97050619125366 seconds ---


In [18]:
lr_pred = model_lr.predict(news_valid)
target = label_valid

print("Accuracy of Logistic Regression Classifier: {}%".format(round(accuracy_score(target, lr_pred)*100,2)))
print("\nConfusion Matrix of Logistic Regression Classifier:\n")
print(confusion_matrix(target, lr_pred))
print("\nCLassification Report of Logistic Regression Classifier:\n")
print(classification_report(target, lr_pred, digits=4))

Accuracy of Logistic Regression Classifier: 96.57%

Confusion Matrix of Logistic Regression Classifier:

[[7583  251]
 [ 295 7776]]

CLassification Report of Logistic Regression Classifier:

              precision    recall  f1-score   support

           0     0.9626    0.9680    0.9652      7834
           1     0.9687    0.9634    0.9661      8071

    accuracy                         0.9657     15905
   macro avg     0.9656    0.9657    0.9657     15905
weighted avg     0.9657    0.9657    0.9657     15905



In [19]:
lr_pred = model_lr.predict(news_test)
target = label_test

print("Accuracy of Logistic Regression Classifier: {}%".format(round(accuracy_score(target, lr_pred)*100,2)))
print("\nConfusion Matrix of Logistic Regression Classifier:\n")
print(confusion_matrix(target, lr_pred))
print("\nCLassification Report of Logistic Regression Classifier:\n")
print(classification_report(target, lr_pred, digits=4))

Accuracy of Logistic Regression Classifier: 89.72%

Confusion Matrix of Logistic Regression Classifier:

[[1918  324]
 [ 397 4372]]

CLassification Report of Logistic Regression Classifier:

              precision    recall  f1-score   support

           0     0.8285    0.8555    0.8418      2242
           1     0.9310    0.9168    0.9238      4769

    accuracy                         0.8972      7011
   macro avg     0.8798    0.8861    0.8828      7011
weighted avg     0.8982    0.8972    0.8976      7011



In [126]:
print(pipe3)
clf_lr = pipe3['model']
vec_lr = pipe3['vect']

eli5.show_weights(clf_lr, vec=vec_lr, top=30)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('model', LogisticRegression())])


Weight?,Feature
+11.453,but
+10.255,mr
+9.095,it
+8.252,related
+7.589,photograph
+6.956,taiwan
+6.326,says
+6.213,the
+6.094,ms
+5.969,guardian


In [124]:
# compare against SVC earlier

eli5.show_weights(clf_svc, vec=vec_svc, top=30)

Weight?,Feature
+5.624,but
+4.911,related
+4.317,it
+4.204,photograph
+4.076,editor
+3.222,taiwan
+3.202,mr
… 82116 more positive …,… 82116 more positive …
… 72355 more negative …,… 72355 more negative …
-2.920,entire


In [74]:
news_test_df['predicted_label_lr'] = lr_pred

misclassified_examples_true_as_fake = news_test_df[(news_test_df['Label']!=news_test_df['predicted_label_lr'])&(news_test_df['Label']==1)]
misclassified_examples_fake_as_true = news_test_df[(news_test_df['Label']!=news_test_df['predicted_label_lr'])&(news_test_df['Label']==0)]

In [75]:
misclassified_examples_true_as_fake.head()

Unnamed: 0,Text,Label,predicted_label_svc,predicted_label_lr
2,Debt 20 000 Source College credit cards Estima...,1,0,0
8,The FBI arrested National Security Agency cont...,1,1,0
11,Name Pamela Anderson. Age 49. Occupation Defin...,1,1,0
27,When Elon Musk outlining plans use massive roc...,1,1,0
48,Jurors awarded University Virginia administrat...,1,1,0


In [76]:
misclassified_examples_fake_as_true.head()

Unnamed: 0,Text,Label,predicted_label_svc,predicted_label_lr
4772,In aftermath Scotlands vote referendum becomin...,0,1,1
4785,Fresh 2012 Republican National Convention Rep....,0,1,1
4794,Former Secretary State Hillary Clinton said to...,0,0,1
4795,Speaker House John Boehner R Ohio said disappo...,0,0,1
4800,In first term office President Barack Obama br...,0,0,1


In [83]:
# visualise an example where SVC labelled a True article as Fake

eli5.show_prediction(clf_svc, misclassified_examples_true_as_fake['Text'].values[0], vec=vec_svc)

Contribution?,Feature
12.214,Highlighted in text (sum)
-0.466,<BIAS>


In [84]:
# visualise an example where LR labelled True article as Fake

eli5.show_prediction(clf_lr, misclassified_examples_true_as_fake['Text'].values[0], vec=vec_lr)

Contribution?,Feature
18.836,Highlighted in text (sum)
-1.219,<BIAS>


In [86]:
# visualise an example where SVC labelled a Fake article as True

eli5.show_prediction(clf_svc, misclassified_examples_fake_as_true['Text'].values[0], vec=vec_svc)

Contribution?,Feature
20.207,Highlighted in text (sum)
-0.466,<BIAS>


In [87]:
# visualise an example where LR labelled Fake article as True

eli5.show_prediction(clf_lr, misclassified_examples_fake_as_true['Text'].values[0], vec=vec_lr)

Contribution?,Feature
60.519,Highlighted in text (sum)
-1.219,<BIAS>


In [91]:
# visualise an example where SVC labelled True article as True

eli5.show_prediction(clf_svc, misclassified_examples_true_as_fake['Text'].values[2], vec=vec_svc)

Contribution?,Feature
25.131,Highlighted in text (sum)
-0.466,<BIAS>


In [123]:
# visualise an example where LR labelled True article as Fake

eli5.show_prediction(clf_lr, misclassified_examples_true_as_fake['Text'].values[2], top=(5,5), vec=vec_lr)

Contribution?,Feature
+52.115,Highlighted in text (sum)
… 85 more positive …,… 85 more positive …
… 80 more negative …,… 80 more negative …


In [95]:
help(eli5.show_prediction)

Help on function show_prediction in module eli5.ipython:

show_prediction(estimator, doc, **kwargs)
    Return an explanation of estimator prediction
    as an IPython.display.HTML object. Use this function
    to show information about classifier prediction in IPython.
    
    :func:`show_prediction` accepts all
    :func:`eli5.explain_prediction` arguments and all
    :func:`eli5.formatters.html.format_as_html`
    keyword arguments, so it is possible to get explanation and
    customize formatting in a single call.
    
    
    If :func:`explain_prediction` returns an :class:`base.Explanation` object with
    the ``image`` attribute not set to None, i.e. if explaining image based models,
    then formatting is dispatched to an image display implementation, 
    and image explanations are shown in an IPython cell.
    Extra keyword arguments are passed to :func:`eli5.format_as_image`.
    
    Note that this image display implementation 
    requires ``matplotlib`` and ``Pillow`` a