# Fake News Detection

## Importing the libraries

In [76]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re 


## Importing the dataset

In [55]:
# Raw strings keep the Windows backslashes literal. In a normal string,
# sequences such as "\d" or "\T" are invalid escapes (flagged with a warning on
# recent Python versions), and a folder starting with "t" or "n" would silently
# turn into a tab/newline.
# NOTE(review): this is an absolute, machine-specific location - point DATA_DIR
# at your own copy of the data before running.
DATA_DIR = r'H:\ML\Git\Fake-News-Detection\data'
true_df = pd.read_csv(DATA_DIR + r'\True.csv')
fake_df = pd.read_csv(DATA_DIR + r'\Fake.csv')

## Modify the Dataset 

In [56]:
# Show the column index of each source frame so they can be compared
for frame in (true_df, fake_df):
    print(frame.columns)


Index(['title', 'text', 'subject', 'date'], dtype='object')
Index(['title', 'text', 'subject', 'date'], dtype='object')


In [57]:
# Add the 'True/Fake' label column: 1 for rows of true_df, 0 for rows of fake_df
for frame, label in ((true_df, 1), (fake_df, 0)):
    frame['True/Fake'] = label




In [58]:
# Drop the columns that are not used by the model (only the title and its
# label are kept).
# The frames are reassigned instead of mutated in place, and missing columns
# are ignored, so this cell can be executed again without raising KeyError
# once 'text', 'subject' and 'date' are already gone.
IRRELEVANT_COLUMNS = ['text', 'subject', 'date']
true_df = true_df.drop(columns=IRRELEVANT_COLUMNS, errors='ignore')
fake_df = fake_df.drop(columns=IRRELEVANT_COLUMNS, errors='ignore')
print(true_df.columns)
print(fake_df.columns)

Index(['title', 'True/Fake'], dtype='object')
Index(['title', 'True/Fake'], dtype='object')


## Merge and Shuffle the datasets

In [59]:
# Stack both frames row-wise into one table with a fresh 0..n-1 index
frames = [true_df, fake_df]
dataset = pd.concat(frames, ignore_index=True)



In [60]:
# First rows of the merged frame (before shuffling)
dataset.head()

Unnamed: 0,title,True/Fake
0,"As U.S. budget fight looms, Republicans flip t...",1
1,U.S. military to accept transgender recruits o...,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,1
3,FBI Russia probe helped by Australian diplomat...,1
4,Trump wants Postal Service to charge 'much mor...,1


In [61]:
# Last rows of the merged frame (before shuffling)
dataset.tail()

Unnamed: 0,title,True/Fake
44893,McPain: John McCain Furious That Iran Treated ...,0
44894,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,0
44895,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,0
44896,How to Blow $700 Million: Al Jazeera America F...,0
44897,10 U.S. Navy Sailors Held by Iranian Military ...,0


In [67]:
# Shuffle all rows with a fixed seed, then renumber the index from 0
shuffled = dataset.sample(frac=1, random_state=69)
dataset = shuffled.reset_index(drop=True)


In [68]:
# First rows after shuffling
dataset.head()

Unnamed: 0,title,True/Fake
0,No Irish border deal before EU trade agreement...,1
1,Exclusive: Trump team seeks names of officials...,1
2,"Trump, Clinton blast each other on character; ...",1
3,THE CARNAGE AND THE KINDNESS Of Good Samaritan...,0
4,WATCH: Barbra Streisand BURNS Donald Trump On...,0


In [69]:
# Last rows after shuffling
dataset.tail()

Unnamed: 0,title,True/Fake
44893,"FEDS, CA STATE OFFICIALS IGNORED OROVILLE DAM ...",0
44894,SHOCKING VIDEO: Congressman Caught In Meltdown...,0
44895,Trump keeping 'open mind' on pulling out of cl...,1
44896,OOPS: Conservatives Lose BIGLY In UK Election...,0
44897,U.N. judicial rights expert decries Poland's '...,1


In [70]:
# Summary of row count, non-null counts and dtypes of the shuffled frame
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      44898 non-null  object
 1   True/Fake  44898 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 701.7+ KB


## Cleaning the texts

In [82]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import re

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()

# Build the stop-word collection ONCE. The previous version reloaded the list
# for every title and rebuilt a set from it for every single word, which is
# pure loop-invariant work repeated for every word of every title.
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # Keep "not" if needed
stop_words = set(all_stopwords)

# Anything that is not an ASCII letter is replaced by a space.
non_letters = re.compile('[^a-zA-Z]')


def clean_title(raw_title):
    """Return a normalised version of one headline.

    Steps: replace non-letters with spaces, lower-case, split on whitespace,
    drop stop words (except "not"), lemmatize each remaining word and join
    the words back together with single spaces.
    """
    words = non_letters.sub(' ', raw_title).lower().split()
    kept = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(kept)


# One cleaned string per row, in the same order as the rows of `dataset`.
corpus = [clean_title(raw_title) for raw_title in dataset['title']]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hasin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hasin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hasin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [83]:
print(corpus[0:10])

['irish border deal eu trade agreement british minister', 'exclusive trump team seek name official working counter violent extremism', 'trump clinton blast character clinton rise poll', 'carnage kindness good samaritan london terror attack video', 'watch barbra streisand burn donald trump stage concert', 'famous rhode island dancing cop fired protesting cop hating terror group video', 'crucial vote hillary clinton fails rally woman history making bid', 'trump refugee ban solution problem turkey pm say', 'trump warns rogue regime north korea grave danger', 'australia police arrest man accused plotting nye attack melbourne']


## Creating the Bag of Words model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Vocabulary size cap for the bag-of-words model.
# This cell was saved un-executed with max_features=3000, but the recorded
# output of the next cell is 4000, so the scores recorded further down were
# produced with 4000 features. 4000 is restored so that a clean
# "Restart & Run All" matches the recorded results.
# NOTE(review): the dense matrix built below grows with this value - lower it
# (and re-run everything after this cell) if memory is a concern.
MAX_FEATURES = 4000

cv = CountVectorizer(max_features=MAX_FEATURES)
X = cv.fit_transform(corpus).toarray()
# Select the label column by name rather than by position.
y = dataset['True/Fake'].values

In [109]:
# Number of columns (features) in the bag-of-words matrix
X.shape[1]

4000

## Splitting the dataset into the Training set and Test set

In [110]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable
split = train_test_split(X, y, test_size=0.20, random_state=0)
X_train, X_test, y_train, y_test = split

## Logistic Regression

In [118]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression model with the library's default settings on the
# training split; the fitted estimator is the cell's displayed value.
LR = LogisticRegression().fit(X_train, y_train)
LR

In [119]:
# Predicted labels of the logistic-regression model for the held-out test split
pred_lr=LR.predict(X_test)


In [122]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix (printed) and accuracy (displayed) of the
# logistic-regression predictions on the test split
cm = confusion_matrix(y_true=y_test, y_pred=pred_lr)
print(cm)
accuracy_score(y_true=y_test, y_pred=pred_lr)

[[4410  262]
 [ 220 4088]]


0.946325167037862

##  Naive Bayes 

In [111]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes model on the training split; the fitted
# estimator is the cell's displayed value.
classifier = GaussianNB().fit(X_train, y_train)
classifier

In [124]:
# Predict the test split and print predictions (left column) next to the
# actual labels (right column)
pred_NB = classifier.predict(X_test)
side_by_side = np.column_stack((pred_NB, y_test))
print(side_by_side)

[[1 1]
 [1 1]
 [0 1]
 ...
 [1 1]
 [1 1]
 [0 0]]


In [126]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix (printed) and accuracy (displayed) of the Naive Bayes
# predictions on the test split
cm = confusion_matrix(y_true=y_test, y_pred=pred_NB)
print(cm)
accuracy_score(y_true=y_test, y_pred=pred_NB)

[[3808  864]
 [ 238 4070]]


0.8772828507795101

## Decision Tree