In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Read the data: each source CSV is loaded into its own frame.
first = pd.read_csv('fake_or_real_news.csv')
second = pd.read_csv('fake_news_manual.csv')

boom_live = pd.read_csv('dataset/boomlive_fake.csv')
the_hindu_real = pd.read_csv('dataset/the_hindu_real.csv')

# Stack the four sources into one frame. ignore_index=True gives the result a
# fresh 0..n-1 index; without it every source keeps its own 0-based index, so
# index labels repeat and a label lookup such as df.loc[0] returns several rows.
df = pd.concat([first, second, boom_live, the_hindu_real], ignore_index=True)
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [3]:
# Sanity check: (rows, columns) of the combined frame.
df.shape

(14083, 4)

In [4]:
# Target column shared by every model below; the preview shows the values
# 'FAKE' and 'REAL'.
labels=df['label']
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

#  IMPLEMENTATION

### Of the four fields in the dataset, we first use the title as the only source of information for a Naïve Bayes classifier, and then repeat the experiment using the text. The titles of the news articles are first retrieved from the combined DataFrame.

In [5]:
# Headline column: the only input feature of the first experiment.
title = df['title']

In [6]:
# Training Data - word to vector

In [7]:
# Bag-of-n-grams vectorizer for the titles: counts every n-gram from single
# tokens up to five-token sequences.
from sklearn.feature_extraction.text import CountVectorizer

title_ngram_span = (1, 5)
count_vector = CountVectorizer(ngram_range=title_ngram_span)

In [8]:
# Separate the titles into training and testing data: 20% is held out for
# evaluation, and the fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split

title_split = train_test_split(title, labels, test_size=0.2, random_state=7)
x_train, x_test, y_train, y_test = title_split

In [9]:
# Learn the n-gram vocabulary from the training titles and return their
# document-term count matrix.
training_data = count_vector.fit_transform(x_train)

# Encode the test titles with the vocabulary learned above. The vectorizer is
# deliberately NOT re-fitted here, so the test split cannot influence the
# feature space.
testing_data = count_vector.transform(x_test)

In [10]:
# Train a multinomial Naive Bayes classifier on the title count matrix.
# fit() returns the estimator itself, so construction and training are chained.
from sklearn.naive_bayes import MultinomialNB

naive_bayes = MultinomialNB().fit(training_data, y_train)
naive_bayes

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [11]:
# Predict a label for every held-out title.
predictions = naive_bayes.predict(testing_data)
predictions

array(['REAL', 'FAKE', 'FAKE', ..., 'FAKE', 'REAL', 'REAL'], dtype='<U4')

In [12]:
# Score the title model on the held-out split. Precision, recall and F1 are
# computed for the binary case with "REAL" as the positive class.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
for caption, scorer in (
    ('Precision score: ', precision_score),
    ('Recall score: ', recall_score),
    ('F1 score: ', f1_score),
):
    print(caption, format(scorer(y_test, predictions, average="binary", pos_label="REAL")))

Accuracy score:  0.873269435569755
Precision score:  0.8469773299748111
Recall score:  0.9218642906100069
F1 score:  0.8828355759763703


# Implementation for text

In [13]:
# Article body column: the input feature of the second experiment.
text = df['text']

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Fresh 1- to 5-gram vectorizer for the article bodies. Rebinding
# `count_vector` replaces the vectorizer that was fitted on the titles.
count_vector = CountVectorizer(ngram_range=(1, 5))

# Separate the article bodies into training and testing data (80/20 split with
# a fixed random_state).
text_split = train_test_split(text, labels, test_size=0.2, random_state=7)
x_train_text, x_test_text, y_train_text, y_test_text = text_split

In [15]:
# Learn the n-gram vocabulary from the training article bodies and return
# their document-term count matrix.
training_data_text = count_vector.fit_transform(x_train_text)

# Encode the test article bodies with the vocabulary learned above. The
# vectorizer is deliberately NOT re-fitted on the test split.
testing_data_text = count_vector.transform(x_test_text)

In [16]:
# Train a multinomial Naive Bayes classifier on the article-body count matrix.
# Note that this rebinds `naive_bayes`, replacing the title model.
from sklearn.naive_bayes import MultinomialNB

naive_bayes = MultinomialNB().fit(training_data_text, y_train_text)
naive_bayes

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
# Predict a label for every held-out article body.
predictions_text = naive_bayes.predict(testing_data_text)
predictions_text

array(['REAL', 'FAKE', 'FAKE', ..., 'REAL', 'REAL', 'REAL'], dtype='<U4')

In [18]:
# Score the text model on the held-out split. Precision, recall and F1 are
# computed for the binary case with "REAL" as the positive class.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

print('Accuracy score: ', format(accuracy_score(y_test_text, predictions_text)))
for caption, scorer in (
    ('Precision score: ', precision_score),
    ('Recall score: ', recall_score),
    ('F1 score: ', f1_score),
):
    print(caption, format(scorer(y_test_text, predictions_text, average="binary", pos_label="REAL")))

Accuracy score:  0.9059282925097621
Precision score:  0.8807397959183674
Recall score:  0.9465387251542152
F1 score:  0.9124545754872811


# Checking Confusion Matrix

In [19]:
from sklearn.datasets import make_classification
from sklearn.metrics import plot_confusion_matrix

In [20]:
# Confusion matrix for the title model, flattened row by row.
# scikit-learn orders the classes by sorted label, so 'FAKE' is the negative
# class and 'REAL' the positive class in the tn/fp/fn/tp naming below.
title_cm = confusion_matrix(y_test, predictions)
tn, fp, fn, tp = title_cm.ravel()
(tn, fp, fn, tp)

(1115, 243, 114, 1345)

In [21]:
# Confusion matrix for the text model, flattened row by row.
# scikit-learn orders the classes by sorted label, so 'FAKE' is the negative
# class and 'REAL' the positive class in the tnt/fpt/fnt/tpt naming below.
text_cm = confusion_matrix(y_test_text, predictions_text)
tnt, fpt, fnt, tpt = text_cm.ravel()
(tnt, fpt, fnt, tpt)

(1171, 187, 78, 1381)

# Making It generalized

In [22]:
# Re-select both input columns so this section does not rely on the cells above.
title, text = df['title'], df['text']

In [23]:
# One independent vectorizer per field, each restricted to single-token
# features (unigrams).
from sklearn.feature_extraction.text import CountVectorizer

unigrams_only = (1, 1)
count_vector = CountVectorizer(ngram_range=unigrams_only)
count_vector_text = CountVectorizer(ngram_range=unigrams_only)

In [24]:
# Separate the titles and the article bodies into training and testing data.
# Both calls use the same test fraction and the same random_state.
from sklearn.model_selection import train_test_split

split_options = dict(test_size=0.2, random_state=7)
x_train, x_test, y_train, y_test = train_test_split(title, labels, **split_options)
x_train_text, x_test_text, y_train_text, y_test_text = train_test_split(text, labels, **split_options)

In [25]:
# Fit each vectorizer on the training split of its own field and keep the
# resulting document-term count matrices.
training_data = count_vector.fit_transform(x_train)
training_data_text = count_vector_text.fit_transform(x_train_text)

In [26]:
# Sample news snippet to classify; edit this string to try other inputs.
input_news = """

Just in | Maharashtra govt has withdrawn all cases against protesters who opposed metro carshed at Aarey. Maharashtra govt declares 800 acre of Aarey land as forest. Aarey metro carshed will be constructed in Kanjur Marg: CM Uddhav Thackeray. 


"""

In [27]:
# Classify the single input snippet with both models.
# Each model must receive features produced by the vectorizer it was trained
# with: the title model pairs with `count_vector`, the text model with
# `count_vector_text`. Previously the title vectorizer and title model were
# used for both paths, so the text model was fitted but never consulted.
testing_data = count_vector.transform([input_news])
testing_data_text = count_vector_text.transform([input_news])

# Making the model and fit it
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes_text = MultinomialNB()

# Fitting the model
naive_bayes.fit(training_data, y_train)
naive_bayes_text.fit(training_data_text, y_train_text)

# Predicting: first the title-trained model, then the text-trained model.
predictions = naive_bayes.predict(testing_data)
print(predictions)
predictions_text = naive_bayes_text.predict(testing_data_text)
print(predictions_text)

['REAL']
['REAL']
