In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string
from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/drive; the CSV files
# read by the next cell live under that mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

# Folder that holds the three input CSV files. Change this single value to run
# the notebook against a different location (e.g. a local data directory).
DATA_DIR = '/content/drive/MyDrive'

fake_path = f'{DATA_DIR}/Fake.csv'
true_path = f'{DATA_DIR}/True.csv'
test_path = f'{DATA_DIR}/test.csv'

# fake / true are the two labelled article sets used for training;
# test is the separate file that receives the final prediction.
fake = pd.read_csv(fake_path)
true = pd.read_csv(true_path)
test = pd.read_csv(test_path)


In [None]:
# Tag each source frame with its target label: fake articles -> 0, true articles -> 1
fake['class'], true['class'] = 0, 1

# Row/column counts of both frames after the label column was added
fake.shape, true.shape




((23481, 5), (21417, 5))

In [None]:
# Stack the fake and true articles row-wise into a single labelled frame
labelled_frames = [fake, true]
merged_data = pd.concat(labelled_frames, axis=0)

merged_data.head()


Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [None]:
# Show the column names of the merged frame (the next cell drops some of them).
merged_data.columns

Index(['title', 'text', 'subject', 'date', 'class'], dtype='object')

In [None]:
# Drop the columns that are not used further, both from the merged training
# frame and from the test frame that is used for the final prediction

new_data = merged_data.drop(columns=['title', 'subject', 'date'])
new_test = test.drop(columns=['title', 'author'])

In [None]:
# Report the per-column null counts of both frames. A notebook cell only displays
# its last expression, so the two counts are printed explicitly; as bare
# expressions they were computed and silently discarded.
print(new_data.isnull().sum())
print(new_test.isnull().sum())

# Per the original author's note the test file contained null values, so those
# rows are dropped. Rebinding (instead of inplace=True) keeps the cell easy to re-run.
new_test = new_test.dropna()
new_test.isnull().sum()

id      0
text    0
dtype: int64

In [None]:
# Shuffle all rows so fake and true articles are interleaved. The fixed
# random_state makes the shuffle, and every result derived from it,
# reproducible after a kernel restart.
new_data = new_data.sample(frac=1, random_state=42)
new_data.head()

Unnamed: 0,text,class
10668,James Comey testified this morning that Lorett...,0
9727,NFL reporter Jason LaCanfora updated football ...,0
6462,WASHINGTON (Reuters) - Repealing and replacing...,1
2452,"Even from the beginning of his hate-fueled, bi...",0
3488,The White House on Wednesday disclosed a group...,1


In [None]:
# Throw away the old row labels of both frames so that each one is numbered
# 0, 1, 2, ... again (drop=True avoids materialising an 'index' column first)
new_data = new_data.reset_index(drop=True)

new_test = new_test.reset_index(drop=True)


In [None]:
# Preview the first ten rows of the test frame after null rows were dropped and
# the index was reset.
new_test.head(10)

Unnamed: 0,id,text
0,20800,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...
2,20802,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"If at first you don’t succeed, try a different..."
4,20804,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
5,20805,Trump is USA's antique hero. Clinton will be n...
6,20806,"Sunday on NBC’s “Meet the Press,” House Minori..."
7,20807,You are here: Home / *Articles of the Bound* /...
8,20808,Urban Population Booms Will Make Climate Chang...
9,20809,don't we have the receipt?


In [None]:
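#text-cleaning helper: lowercases the text, then removes or blanks out unwanted substrings (bracketed spans, punctuation, tokens containing digits) using regex substitutions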

def words(text):
  """Normalise one raw article string before it is vectorised.

  Steps, in order: lowercase; delete [bracketed] spans; delete URLs and HTML
  tags; turn every remaining non-word character into a space; delete
  underscores; delete every token that contains a digit.

  Args:
    text: the article body as a str.

  Returns:
    The cleaned, lowercase string. Word boundaries are kept as spaces, so runs
    of several spaces can remain where characters were removed.
  """
  text = text.lower()
  text = re.sub(r'\[.*?\]', '', text)
  # URLs and tags must be removed while '://', '.', '<' and '>' are still in the
  # text; once the \W pass below has blanked them out, these two patterns can
  # never match and URL fragments ("https example com") would leak into the text.
  text = re.sub(r'https?://\S+|www\.\S+', '', text)
  text = re.sub(r'<.*?>+', '', text)
  # Replace every non-word character (punctuation, newlines, ...) with a space
  # so neighbouring words are not glued together.
  text = re.sub(r'\W', ' ', text)
  # After the \W pass the only punctuation character left is '_' (it counts as \w).
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
  # Drop any token that contains at least one digit.
  text = re.sub(r'\w*\d\w*', '', text)
  return text

In [None]:
# Run the text-cleaning function over the text column of the training frame
# and of the test frame
for frame in (new_data, new_test):
  frame['text'] = frame['text'].apply(words)

In [None]:
# Preview the first ten training rows after the text column was cleaned.
new_data.head(10)

Unnamed: 0,text,class
0,james comey testified this morning that lorett...,0
1,nfl reporter jason lacanfora updated football ...,0
2,washington reuters repealing and replacing...,1
3,even from the beginning of his hate fueled bi...,0
4,the white house on wednesday disclosed a group...,1
5,republicans in oklahoma just took their war on...,0
6,we love this guy if you re on twitter and not...,0
7,abuja reuters the united states has formal...,1
8,by now it s pretty clear that bernie doesn t ...,0
9,washington reuters u s president donald t...,1


In [None]:
# x holds the cleaned article text (features), y the 0/1 class labels (targets)

x = new_data['text']
y = new_data['class']

# Text of the separate test file, kept aside for the final prediction
test_data = new_test['text']

# Hold out 25% of the labelled rows for evaluation. The fixed random_state makes
# the split, and therefore every reported score, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# TF-IDF weighting: the vocabulary and weights are learned from the training
# text only; the fitted vectorizer is then reused, unchanged, for the held-out
# split and for the final test file
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test, test_data_v = (vectorization.transform(corpus) for corpus in (x_test, test_data))

In [None]:
#predict using Logistic Regression
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with default settings, fitted on the TF-IDF training matrix
LR = LogisticRegression()
LR.fit(X=xv_train, y=y_train)


In [None]:
# Predict once and score those same predictions. LR.score(xv_test, y_test) would
# run a second prediction pass over the test matrix to compute the same accuracy.
pred_lr = LR.predict(xv_test)
accuracy_score(y_test, pred_lr)

0.9877951002227171

In [None]:
# Per-class precision / recall / F1 of the logistic-regression predictions on the held-out split.
print(classification_report(y_test, pred_lr))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5849
           1       0.99      0.99      0.99      5376

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



In [None]:
#predict using Decision Tree
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree fitted on the TF-IDF training matrix. scikit-learn permutes the
# candidate features randomly at each split, so without a fixed random_state the
# fitted tree (and its score) can differ from run to run.
DT = DecisionTreeClassifier(random_state=42)
DT.fit(xv_train, y_train)


In [None]:
# Class predictions of the decision tree for the held-out split.
pred_dt = DT.predict(xv_test)


In [None]:
# Accuracy of the decision tree on the held-out split, computed from the
# predictions already stored in pred_dt instead of predicting a second time.
accuracy_score(y_test, pred_dt)

0.9951002227171493

In [None]:
#predict using svm
from sklearn import svm

In [None]:
# Fitting the SVM on the full training matrix took too long, so only the first
# SVM_TRAIN_ROWS rows are used.
SVM_TRAIN_ROWS = 1000
new_x_train = xv_train[:SVM_TRAIN_ROWS]
# .iloc makes the slice explicitly positional: y_train keeps the integer row
# labels it had before the split, and it must stay row-aligned with the matrix
# slice above.
new_y_train = y_train.iloc[:SVM_TRAIN_ROWS]

classifier = svm.SVC(kernel='linear')
classifier.fit(new_x_train, new_y_train)




# Class predictions of the linear SVM for the held-out split.
pred_svm = classifier.predict(xv_test)


In [None]:
# Accuracy of the SVM on the held-out split, computed from the stored pred_svm
# predictions; classifier.score would repeat the whole SVM prediction pass.
print(accuracy_score(y_test, pred_svm))

0.9663251670378619


In [None]:
# Final step: label the separate test file with the fitted decision tree and
# print the first 500 predicted classes.
# NOTE(review): the recorded output is almost entirely 0 - worth confirming that
# test.csv resembles the training data before trusting these labels.

final_prediction = DT.predict(X=test_data_v)
preview = final_prediction[:500]
print(preview)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 