# Fake News Detector Machine Learning Model 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_recall_fscore_support
import re
import string


# Importing Function from script.py file
from scripts import word_drop

# Loading The Datasets

For this exploratory Pipeline, I am using two separated datasets I found on Google, one CSV file is the **true news dataset** and the other file is the ****fake news dataset****.

In [2]:
df_fake = pd.read_csv('Datasets/Public_Data/corona_fake.csv')
df_fake = df_fake.drop(['text', 'source', 'label'], axis=1)
df_fake.rename(columns = {'title':'Tweets'}, inplace = True)
df_fake = df_fake.dropna()
print(df_fake.shape)
df_fake.head(3)


(1082, 1)


Unnamed: 0,Tweets
0,Due to the recent outbreak for the Coronavirus...
5,CORONA UNMASKED: Chinese Intelligence Officer ...
9,Basic protective measures against the new coro...


In [3]:
df_true = pd.read_csv('Datasets/Old_Collected_Data/COVID-19-Truth.csv')
df_true = df_true.drop(['Tweet URL', 'Tweet ID', 'User ID', 'Unnamed: 0'], axis=1)
print(df_true.shape)
df_true.head(3)

(703, 1)


Unnamed: 0,Tweets
0,The global Covid-19 death toll hits a new grim...
1,The White House unveiled a plan to move the na...
2,Britain's Queen Elizabeth II cancels virtual e...


# Removing Empty Rows

# Adding Lables to each Dataset

In [4]:
df_fake["class"] = 0
df_true["class"] = 1

# Dataset Shapes

In [5]:
df_fake.shape, df_true.shape

((1082, 2), (703, 2))

# Taking some rows from the two datasets

I am creating two variables to store the last 10 rows of each dataset to then export it to a new csv file.

Our Goal is to use those news for manual testing after we build Our model.

In [6]:
# Storing last 10 rows of df_fake into a variable
df_fake_manual_testing = df_fake.tail(10)

# Deleting the last 10 rows from dataset
df_fake.drop(df_fake.tail(10).index,
        inplace = True)

# Storing last 10 rows of df_true into a variable
df_true_manual_testing = df_true.tail(10)

# Deleting the last 10 rows from dataset
df_true.drop(df_true.tail(10).index,
        inplace = True)

# Dataset Shapes Now

In [7]:
# df_fake.shape, df_true.shape
df_true_manual_testing


Unnamed: 0,Tweets,class
693,Joe Rogan has been yammering about the COVID-1...,1
694,"An estimated 900,000 people have now died from...",1
695,The global death toll from Covid-19 surpassed ...,1
696,The Senate voted to end the Covid-19 emergency...,1
697,The CDC says most Americans can now take off t...,1
698,"New York City fired 1,430 municipal workers wh...",1
699,Blue states are dropping mask mandates and the...,1
700,California could be the first state to impose ...,1
701,"Eric Lander, the head of the Office of Science...",1
702,Mayors across the country sounded off in a new...,1


# Creating a Dataframe to store the 10 rows from df_true and df_fake

Also I am exporting that datafram in an CSV file out of this notebook.

In [8]:
df_manual_testing = pd.concat([df_fake_manual_testing, df_true_manual_testing], axis=0)
df_manual_testing = df_manual_testing.sample(frac=1)
df_manual_testing["Tweets"] = df_manual_testing["Tweets"].apply(word_drop)
df_manual_testing.head(3)

Unnamed: 0,Tweets,class
693,joe rogan has been yammering about the covid ...,1
1159,could the power of the sun slow the coronavirus,0
1156,will warm weather slow coronavirus,0


In [9]:
df_manual_testing.to_csv("Datasets/manual_testing.csv") 

# Feature Engineering 
## Mergin the Datasets into One

In [10]:
df_merge = pd.concat([df_fake, df_true], axis=0)
df_merge.head(10)

Unnamed: 0,Tweets,class
0,Due to the recent outbreak for the Coronavirus...,0
5,CORONA UNMASKED: Chinese Intelligence Officer ...,0
9,Basic protective measures against the new coro...,0
14,Exposing yourself to the sun or to temperature...,0
15,You can recover from the coronavirus disease (...,0
16,Being able to hold your breath for 10 seconds ...,0
17,Drinking alcohol does not protect you against ...,0
18,COVID-19 virus can be transmitted in areas wit...,0
19,Cold weather and snow CANNOT kill the new coro...,0
20,Taking a hot bath does not prevent the new cor...,0


## Deleting Titlte, subject and date columns

In [11]:
# df = df_merge.drop(['title', 'subject', 'date'], axis=1)
# df.head(3)

## Randomizing rows in the Merged Dataset

In [12]:
df = df_merge.sample(frac=1)
df.head(10)

Unnamed: 0,Tweets,class
480,Bobby Kennedy Jr. Claims Dr. Fauci and Gates F...,0
34,"What's the ""new normal""? \n\nIn the Sunday edi...",1
1138,What is it?,0
408,"False claim: ""the Coronavirus"" is designed and...",0
55,"Aaron Rodgers has apologized to his ""loved one...",1
49,At least 5.2 million children globally have lo...,1
753,"The Coronavirus in America, the year ahead",0
423,'Coronavirus may have origins in China''s biol...,0
685,Should Grandma still come visit?,0
154,"THE INTRODUCTION OF 5G, DIGITAL MICROCHIPS AND...",0


# Checking for Null values

No null values as you can see in the results below

In [13]:
df.isnull().sum()

Tweets    0
class     0
dtype: int64

## Calling funtion on the text column

In [14]:
df["Tweets"] = df["Tweets"].apply(word_drop)


In [15]:
df.head(10)

Unnamed: 0,Tweets,class
480,bobby kennedy jr claims dr fauci and gates f...,0
34,what s the new normal in the sunday editi...,1
1138,what is it,0
408,false claim the coronavirus is designed and...,0
55,aaron rodgers has apologized to his loved one...,1
49,at least million children globally have lost...,1
753,the coronavirus in america the year ahead,0
423,coronavirus may have origins in china s biol...,0
685,should grandma still come visit,0
154,the introduction of digital microchips and e...,0


# Defining our Dependent and Independent Variables

In [16]:
x = df['Tweets']
y = df['class']

# Spliting the Data in Train and Test

In [17]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= .25)

print('df', df.shape)
print('x_train', x_train.shape)
print('x_test', x_test.shape)
print('y_train', y_train.shape)
print('y_test', y_test.shape)

df (1765, 2)
x_train (1323,)
x_test (442,)
y_train (1323,)
y_test (442,)


# Converting Train data text into Vector
Using sklearn.feature_extraction.text.TfidfVectorizer
// from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
vectorization = TfidfVectorizer()
x_train_vectors = vectorization.fit_transform(x_train)
x_test_vectors = vectorization.transform(x_test)

# Model Selection

## Training Logistic Regression Model

In [19]:
model_1 = LogisticRegression()
model_1.fit(x_train_vectors, y_train)


LogisticRegression()

## Testing the Model

In [20]:
y_predicted = model_1.predict(x_test_vectors)
accuracy = accuracy_score(y_test, y_predicted)
precision, recall, f1, support = precision_recall_fscore_support(y_test, y_predicted) 

print("Accuracy = {:.2f}".format(accuracy))
print("Precision = ", precision)
print("Recall = ", recall)
print('F1-Score', f1)

Accuracy = 1.00
Precision =  [0.99642857 0.99382716]
Recall =  [0.99642857 0.99382716]
F1-Score [0.99642857 0.99382716]


In [21]:
df_manual = pd.read_csv('Datasets/manual_testing.csv')
df_manual = df_manual.drop(['Unnamed: 0'], axis=1)
print(df_manual.shape)
df_manual.head(10)

(20, 2)


Unnamed: 0,Tweets,class
0,joe rogan has been yammering about the covid ...,1
1,could the power of the sun slow the coronavirus,0
2,will warm weather slow coronavirus,0
3,did a mutation turbocharge the coronavirus no...,0
4,mayors across the country sounded off in a new...,1
5,the global death toll from covid surpassed m...,1
6,the cdc says most americans can now take off t...,1
7,why funding the covid response could be the b...,0
8,warmer weather may slow but not halt coronav...,0
9,an estimated people have now died from covid...,1


In [22]:
df_manual_vect = vectorization.transform(df_manual.loc[8, ['Tweets']]).todense()



In [24]:
manual_prediction = model_1.predict(np.asarray(df_manual_vect))

In [25]:
manual_prediction

array([0])