In [21]:
# General packages and dictionary analysis
import os
import tarfile
import bz2
import urllib.request
import re
import pickle
import nltk
import eli5
import joblib
import requests
import random
import pandas as pd
import numpy as np
from nltk.tokenize import TreebankWordTokenizer
import matplotlib.pyplot as plt

# Supervised text classification
from sklearn.feature_extraction.text import (
    CountVectorizer, TfidfVectorizer)
from sklearn.linear_model import (
    LogisticRegression)
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import (
    make_pipeline, Pipeline)
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, train_test_split
from sklearn.compose import make_column_transformer
from sklearn import metrics
import joblib
import eli5
from nltk.sentiment import vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer 

In [11]:
random.seed(10)

In [57]:
df = pd.read_csv("WELFake_Dataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


### We observe that, we have large number of observations so we drop the NA values from the dataset

In [15]:
df = df.dropna()

In [14]:
df.label.value_counts()

1    37106
0    35028
Name: label, dtype: int64

### Our target values are balanced, so ready to start the classification pipeline

# Naive Bayes Algorithm with Count Vector Features

#### 80% train test split

In [6]:
y = df.label
X = df.text
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, shuffle = True)

In [7]:
vectorizer = CountVectorizer(stop_words="english")
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

nb = MultinomialNB()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

rep=metrics.classification_report(y_test, y_pred)
print(rep)

              precision    recall  f1-score   support

           0       0.88      0.91      0.89      7081
           1       0.91      0.87      0.89      7227

    accuracy                           0.89     14308
   macro avg       0.89      0.89      0.89     14308
weighted avg       0.89      0.89      0.89     14308



### Although Naive Bayes is a very simple algorithm we have good classification accuracies.

# Logistic Regression with TF-IDF Features

#### We make test(60%) - validation (20%) - test (20%) split

In [29]:
y = df.label
X = df.text
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2, random_state=1) #test set

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # validation set

#### Using 2 word ngrams to capture the information from group of words.

In [52]:
text_transformer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), lowercase=True, max_features=150000)
X_train_text = text_transformer.fit_transform(X_train)
X_val_text = text_transformer.transform(X_val)
X_test_text = text_transformer.transform(X_test)

In [31]:
X_train_text.shape, X_val_text.shape

((42921, 150000), (14308, 150000))

In [32]:
logit = LogisticRegression(multi_class='multinomial', random_state=42, n_jobs=4)

#### We perform 5-fold cross validation to find the best fit.

In [43]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [44]:
cv_results = cross_val_score(logit, X_train_text, y_train, cv=skf, scoring='f1_micro')

#### It's nice to see that cross-validation is more or less stable across folds.

In [50]:
cv_results, cv_results.mean()

(array([0.9436226 , 0.9471109 , 0.95013979, 0.94664492, 0.94780988]),
 0.9470656192311846)

In [48]:
%%time
logit.fit(X_train_text, y_train)

Wall time: 8.55 s


LogisticRegression(multi_class='multinomial', n_jobs=4, random_state=42)

### Logistic regression using tf-idf features achieved around 95 percent accuracy. We pick it as our final model.

In [54]:
y_pred = logit.predict(X_test_text)

rep=metrics.classification_report(y_test, y_pred)
print(rep)

              precision    recall  f1-score   support

           0       0.96      0.94      0.95      6969
           1       0.95      0.97      0.96      7339

    accuracy                           0.95     14308
   macro avg       0.95      0.95      0.95     14308
weighted avg       0.95      0.95      0.95     14308



#### We use Eli5 feature importance package to interpret the most important word/group of words. 
#### Positive weights indicate that the score got worse when the feature was removed (i.e. score decreased). 

#### Therefore the most important words that are predicting the news as fake or not are featured image, image, twitter and hillary.
#### The fact that the name Hillary is one of the top predictors might show that there could be number of fake news going about Hillary Clinton

#### Also articles sourcing images could be associated with not fake news. Furthermore, twitter sources could be related with the fake news.

In [49]:
eli5.show_weights(estimator=logit, 
                  feature_names= list(text_transformer.get_feature_names()),
                 top=(50, 5))



Weight?,Feature
+5.745,featured image
+5.360,image
+5.277,featured
+4.439,twitter com
+4.204,hillary
+3.667,getty
+3.637,com
+3.461,getty images
+3.415,october
+3.367,read
