# Fake News Detector Machine Learning Model 

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_recall_fscore_support
import re
import string


# Importing Function from script.py file
from scripts import word_drop

# Loading The Datasets

For this exploratory Pipeline, I am using two separated datasets I found on Google, one CSV file is the **true news dataset** and the other file is the ****fake news dataset****.

In [3]:
df_fake = pd.read_csv('Datasets/Public_Data/corona_fake.csv')
df_fake = df_fake.drop(['text', 'source', 'label'], axis=1)
df_fake.rename(columns = {'title':'Tweets'}, inplace = True)
df_fake = df_fake.dropna()
print(df_fake.shape)
df_fake.head(3)


(1082, 1)


Unnamed: 0,Tweets
0,Due to the recent outbreak for the Coronavirus...
5,CORONA UNMASKED: Chinese Intelligence Officer ...
9,Basic protective measures against the new coro...


In [4]:
df_true = pd.read_csv('Datasets/Old_Collected_Data/COVID-19-Truth.csv')
df_true = df_true.drop(['Tweet URL', 'Tweet ID', 'User ID', 'Unnamed: 0'], axis=1)
print(df_true.shape)
df_true.head(3)

(703, 1)


Unnamed: 0,Tweets
0,The global Covid-19 death toll hits a new grim...
1,The White House unveiled a plan to move the na...
2,Britain's Queen Elizabeth II cancels virtual e...


# Removing Empty Rows

# Adding Lables to each Dataset

In [5]:
df_fake["class"] = 0
df_true["class"] = 1

# Dataset Shapes

In [6]:
df_fake.shape, df_true.shape

((1082, 2), (703, 2))

# Taking some rows from the two datasets

I am creating two variables to store the last 10 rows of each dataset to then export it to a new csv file.

Our Goal is to use those news for manual testing after we build Our model.

In [7]:
# Storing last 10 rows of df_fake into a variable
df_fake_manual_testing = df_fake.tail(10)

# Deleting the last 10 rows from dataset
df_fake.drop(df_fake.tail(10).index,
        inplace = True)

# Storing last 10 rows of df_true into a variable
df_true_manual_testing = df_true.tail(10)

# Deleting the last 10 rows from dataset
df_true.drop(df_true.tail(10).index,
        inplace = True)

# Dataset Shapes Now

In [8]:
# df_fake.shape, df_true.shape
df_true_manual_testing


Unnamed: 0,Tweets,class
693,Joe Rogan has been yammering about the COVID-1...,1
694,"An estimated 900,000 people have now died from...",1
695,The global death toll from Covid-19 surpassed ...,1
696,The Senate voted to end the Covid-19 emergency...,1
697,The CDC says most Americans can now take off t...,1
698,"New York City fired 1,430 municipal workers wh...",1
699,Blue states are dropping mask mandates and the...,1
700,California could be the first state to impose ...,1
701,"Eric Lander, the head of the Office of Science...",1
702,Mayors across the country sounded off in a new...,1


# Creating a Dataframe to store the 10 rows from df_true and df_fake

Also I am exporting that datafram in an CSV file out of this notebook.

In [9]:
df_manual_testing = pd.concat([df_fake_manual_testing, df_true_manual_testing], axis=0)
df_manual_testing = df_manual_testing.sample(frac=1)
df_manual_testing["Tweets"] = df_manual_testing["Tweets"].apply(word_drop)
df_manual_testing.head(3)

Unnamed: 0,Tweets,class
1154,did a mutation turbocharge the coronavirus no...,0
699,blue states are dropping mask mandates and the...,1
1158,summer is coming but the virus won t be going,0


In [10]:
df_manual_testing.to_csv("Datasets/manual_testing.csv") 

# Feature Engineering 
## Mergin the Datasets into One

In [11]:
df_merge = pd.concat([df_fake, df_true], axis=0)
df_merge.head(10)

Unnamed: 0,Tweets,class
0,Due to the recent outbreak for the Coronavirus...,0
5,CORONA UNMASKED: Chinese Intelligence Officer ...,0
9,Basic protective measures against the new coro...,0
14,Exposing yourself to the sun or to temperature...,0
15,You can recover from the coronavirus disease (...,0
16,Being able to hold your breath for 10 seconds ...,0
17,Drinking alcohol does not protect you against ...,0
18,COVID-19 virus can be transmitted in areas wit...,0
19,Cold weather and snow CANNOT kill the new coro...,0
20,Taking a hot bath does not prevent the new cor...,0


## Deleting Titlte, subject and date columns

In [None]:
# df = df_merge.drop(['title', 'subject', 'date'], axis=1)
# df.head(3)

## Randomizing rows in the Merged Dataset

In [12]:
df = df_merge.sample(frac=1)
df.head(10)

Unnamed: 0,Tweets,class
82,The state of Texas has filed a lawsuit seeking...,1
533,"Perspective: “I survived covid-19, but my care...",1
462,HOW LONG CAN THE VIRUS THAT CAUSES COVID-19 LI...,0
272,Dr. Anthony Fauci said that the United States ...,1
197,Did China Steal Coronavirus From Canada And We...,0
258,“What are the odds?” – A timeline of facts lin...,0
261,Rush Limbaugh makes obvious point that Wuhan c...,0
347,The New York State health commissioner announc...,1
188,Pandemic Reveals Alarming Absence of Ethics in...,0
572,Plandemic,0


# Checking for Null values

No null values as you can see in the results below

In [13]:
df.isnull().sum()

Tweets    0
class     0
dtype: int64

# Creating a function that will take the text from the dataset and remove special characters

In [None]:
# def word_drop(text):
#     text = text.lower()
#     text = re.sub('\[.*?\]', '', text)
#     text = re.sub('\\W', " ", text)
#     text = re.sub('https?://\S+|www\.\S+', '', text)
#     text = re.sub('<.*?>+', '', text)
#     text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
#     text = re.sub('\n', '', text)
#     text = re.sub('\w*\d\w*', '', text)
#     return text

## Calling funtion on the text column

In [14]:
df["Tweets"] = df["Tweets"].apply(word_drop)


In [15]:
df.head(10)

Unnamed: 0,Tweets,class
82,the state of texas has filed a lawsuit seeking...,1
533,perspective i survived covid but my career...,1
462,how long can the virus that causes covid live...,0
272,dr anthony fauci said that the united states ...,1
197,did china steal coronavirus from canada and we...,0
258,what are the odds a timeline of facts lin...,0
261,rush limbaugh makes obvious point that wuhan c...,0
347,the new york state health commissioner announc...,1
188,pandemic reveals alarming absence of ethics in...,0
572,plandemic,0


# Defining our Dependent and Independent Variables

In [16]:
x = df['Tweets']
y = df['class']

# Spliting the Data in Train and Test

In [17]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= .25)

print('df', df.shape)
print('x_train', x_train.shape)
print('x_test', x_test.shape)
print('y_train', y_train.shape)
print('y_test', y_test.shape)

df (1765, 2)
x_train (1323,)
x_test (442,)
y_train (1323,)
y_test (442,)


# Converting Train data text into Vector
Using sklearn.feature_extraction.text.TfidfVectorizer
// from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
vectorization = TfidfVectorizer()
x_train_vectors = vectorization.fit_transform(x_train)
x_test_vectors = vectorization.transform(x_test)

# Model Selection

## Training Logistic Regression Model

In [19]:
model_1 = LogisticRegression()
model_1.fit(x_train_vectors, y_train)


LogisticRegression()

## Testing the Model

In [20]:
y_predicted = model_1.predict(x_test_vectors)
accuracy = accuracy_score(y_test, y_predicted)
precision, recall, f1, support = precision_recall_fscore_support(y_test, y_predicted) 

print("Accuracy = {:.2f}".format(accuracy))
print("Precision = ", precision)
print("Recall = ", recall)
print('F1-Score', f1)

Accuracy = 0.99
Precision =  [0.99253731 0.99425287]
Recall =  [0.99625468 0.98857143]
F1-Score [0.99439252 0.99140401]


In [21]:
df_2 = pd.read_csv('Datasets/manual_testing.csv')
df_2 = df_2.drop(['Unnamed: 0'], axis=1)
print(df_2.shape)
df_2.head(10)

(20, 2)


Unnamed: 0,Tweets,class
0,did a mutation turbocharge the coronavirus no...,0
1,blue states are dropping mask mandates and the...,1
2,summer is coming but the virus won t be going,0
3,an estimated people have now died from covid...,1
4,summer heat may not diminish coronavirus strength,0
5,joe rogan has been yammering about the covid ...,1
6,will hot weather kill the coronavirus where yo...,0
7,new york city fired municipal workers who ei...,1
8,warmer weather may slow but not halt coronav...,0
9,mayors across the country sounded off in a new...,1


In [23]:
input_ = df_2
# input_
input_trans = vectorization.transform(input_)

In [36]:
a = vectorization.transform(df_2.loc[8, ['Tweets']]).todense()



In [37]:
b = model_1.predict(np.asarray(a))

In [38]:
b

array([0])