# Classifying fake news using supervised learning with NLP

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [3]:
data = pd.read_csv('fake_or_real_news.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


### Creating a sparse text vectorizer that can be used to train and test a simple supervised model

In [6]:
# Creating a series to store the labels
y = data.label
X_train, X_test, y_train, y_test = train_test_split(data['text'], y, test_size=0.33, random_state=53)

# Initializing a CountVectorizer object
count_vectorizer = CountVectorizer(stop_words='english')

# Transforming the training data using only the 'text' column values 
count_train = count_vectorizer.fit_transform(X_train)

# Transforming the test data
count_test = count_vectorizer.transform(X_test)

# Printing the first 10 features of the count_vectorizer
print(count_vectorizer.get_feature_names()[:10])

['00', '000', '0000', '00000031', '000035', '00006', '0001', '0001pt', '000ft', '000km']


### Creating tf-idf vectors 

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer 

# Initializing a TfidfVectorizer object
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)

# Transforming the training data and test data 
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

# Printing the first 10 features
print(tfidf_vectorizer.get_feature_names()[:10])

# Printing the first 5 vectors of the tfidf training data
print(tfidf_train.A[:5])

['00', '000', '0000', '00000031', '000035', '00006', '0001', '0001pt', '000ft', '000km']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### Converting the word vectors to dataframes

In [8]:
# Creating the CountVectorizer DataFrame
count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())

# Creating the TfidfVectorizer DataFrame
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())

print(count_df.head())
print(tfidf_df.head())

# Calculating the difference in columns: difference
difference = set(tfidf_df.columns) - set(count_df.columns)
print(difference)

# Checking whether the DataFrames are equal
print(count_df.equals(tfidf_df))

   00  000  0000  00000031  000035  00006  0001  0001pt  000ft  000km  ...  \
0   0    0     0         0       0      0     0       0      0      0  ...   
1   0    0     0         0       0      0     0       0      0      0  ...   
2   0    0     0         0       0      0     0       0      0      0  ...   
3   0    0     0         0       0      0     0       0      0      0  ...   
4   0    0     0         0       0      0     0       0      0      0  ...   

   حلب  عربي  عن  لم  ما  محاولات  من  هذا  والمرضى  ยงade  
0    0     0   0   0   0        0   0    0        0      0  
1    0     0   0   0   0        0   0    0        0      0  
2    0     0   0   0   0        0   0    0        0      0  
3    0     0   0   0   0        0   0    0        0      0  
4    0     0   0   0   0        0   0    0        0      0  

[5 rows x 56922 columns]
    00  000  0000  00000031  000035  00006  0001  0001pt  000ft  000km  ...  \
0  0.0  0.0   0.0       0.0     0.0    0.0   0.0     0.0    

### Training and testing the "fake news" model with CountVectorizer

In [9]:
from sklearn import metrics 
from sklearn.naive_bayes import MultinomialNB

# Instantiating a Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()

# Fitting the classifier to the training data
nb_classifier.fit(count_train, y_train)

# Creating the predicted tags
pred = nb_classifier.predict(count_test)

# Calculating the accuracy score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculating the confusion matrix
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
print(cm)


0.893352462936394
[[ 865  143]
 [  80 1003]]


### Training and testing the "fake news" model with TfidfVectorizer

In [10]:
nb_classifier = MultinomialNB()

nb_classifier.fit(tfidf_train, y_train)

pred = nb_classifier.predict(tfidf_test)

score = metrics.accuracy_score(y_test, pred)
print(score)

cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
print(cm)

0.8565279770444764
[[ 739  269]
 [  31 1052]]


### Improving the model

Here we will test a few different alpha levels using the Tfidf vectors to determine if there is a better performing combination.

In [12]:
import numpy as np
alphas = np.arange(0,1,0.1)

def train_and_predict(alpha):
    nb_classifier = MultinomialNB(alpha=alpha)
    nb_classifier.fit(tfidf_train, y_train)
    pred = nb_classifier.predict(tfidf_test)
    score = metrics.accuracy_score(y_test, pred)
    return score

for alpha in alphas:
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha))
    print()

Alpha:  0.0
Score:  0.8813964610234337

Alpha:  0.1
Score:  0.8976566236250598

Alpha:  0.2
Score:  0.8938307030129125

Alpha:  0.30000000000000004
Score:  0.8900047824007652

Alpha:  0.4
Score:  0.8857006217120995

Alpha:  0.5
Score:  0.8842659014825442

Alpha:  0.6000000000000001
Score:  0.874701099952176

Alpha:  0.7000000000000001


  'setting alpha = %.1e' % _ALPHA_MIN)


Score:  0.8703969392635102

Alpha:  0.8
Score:  0.8660927785748446

Alpha:  0.9
Score:  0.8589191774270684



### Inspecting the model

Now that we have built a "fake news" classifier, we'll investigate what it has learned. We can map the important vector weights back to actual words using some simple inspection techniques.

In [13]:
# Getting the class labels
class_labels = nb_classifier.classes_

# Extracting the features
feature_names = tfidf_vectorizer.get_feature_names()

# Zipping the feature names together with the coefficient array and sorting by weights
feat_with_weights = sorted(zip(nb_classifier.coef_[0], feature_names))

#first class label and the top 20 feat_with_weights entries
print(class_labels[0], feat_with_weights[:20])

#second class label and the bottom 20 feat_with_weights entries
print(class_labels[1], feat_with_weights[-20:])

FAKE [(-11.316312804238807, '0000'), (-11.316312804238807, '000035'), (-11.316312804238807, '0001'), (-11.316312804238807, '0001pt'), (-11.316312804238807, '000km'), (-11.316312804238807, '0011'), (-11.316312804238807, '006s'), (-11.316312804238807, '007'), (-11.316312804238807, '007s'), (-11.316312804238807, '008s'), (-11.316312804238807, '0099'), (-11.316312804238807, '00am'), (-11.316312804238807, '00p'), (-11.316312804238807, '00pm'), (-11.316312804238807, '014'), (-11.316312804238807, '015'), (-11.316312804238807, '018'), (-11.316312804238807, '01am'), (-11.316312804238807, '020'), (-11.316312804238807, '023')]
REAL [(-7.742481952533027, 'states'), (-7.717550034444668, 'rubio'), (-7.703583809227384, 'voters'), (-7.654774992495461, 'house'), (-7.649398936153309, 'republicans'), (-7.6246184189367, 'bush'), (-7.616556675728881, 'percent'), (-7.545789237823644, 'people'), (-7.516447881078008, 'new'), (-7.448027933291952, 'party'), (-7.411148410203476, 'cruz'), (-7.410910239085596, 'st