# Fake News Detector Machine Learning Model 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_recall_fscore_support
import re
import string

# Loading The Datasets

For this exploratory Pipeline, I am using two separated datasets I found on Google, one CSV file is the **true news dataset** and the other file is the ****fake news dataset****.

In [8]:
df_fake = pd.read_csv('Datasets/Public_Data/corona_fake.csv')
df_fake = df_fake.drop(['text', 'source', 'label'], axis=1)
df_fake.rename(columns = {'title':'Tweets'}, inplace = True)
df_fake = df_fake.dropna()
print(df_fake.shape)
df_fake.head(3)


(1082, 1)


Unnamed: 0,Tweets
0,Due to the recent outbreak for the Coronavirus...
5,CORONA UNMASKED: Chinese Intelligence Officer ...
9,Basic protective measures against the new coro...


In [9]:
df_true = pd.read_csv('Datasets/Old_Collected_Data/COVID-19-Truth.csv')
df_true = df_true.drop(['Tweet URL', 'Tweet ID', 'User ID', 'Unnamed: 0'], axis=1)
print(df_true.shape)
df_true.head(3)

(703, 1)


Unnamed: 0,Tweets
0,The global Covid-19 death toll hits a new grim...
1,The White House unveiled a plan to move the na...
2,Britain's Queen Elizabeth II cancels virtual e...


# Removing Empty Rows

# Adding Lables to each Dataset

In [10]:
df_fake["class"] = 0
df_true["class"] = 1

# Dataset Shapes

In [11]:
df_fake.shape, df_true.shape

((1082, 2), (703, 2))

# Taking some rows from the two datasets

I am creating two variables to store the last 10 rows of each dataset to then export it to a new csv file.

Our Goal is to use those news for manual testing after we build Our model.

In [12]:
# Storing last 10 rows of df_fake into a variable
df_fake_manual_testing = df_fake.tail(10)

# Deleting the last 10 rows from dataset
df_fake.drop(df_fake.tail(10).index,
        inplace = True)

# Storing last 10 rows of df_true into a variable
df_true_manual_testing = df_true.tail(10)

# Deleting the last 10 rows from dataset
df_true.drop(df_true.tail(10).index,
        inplace = True)

# Dataset Shapes Now

In [13]:
df_fake.shape, df_true.shape


((1072, 2), (693, 2))

# Creating a Dataframe to store the 10 rows from df_true and df_fake

Also I am exporting that datafram in an CSV file out of this notebook.

In [14]:
df_manual_testing = pd.concat([df_fake_manual_testing, df_true_manual_testing], axis=0)
df_manual_testing.to_csv("Datasets/manual_testing.csv") 

# Feature Engineering 
## Mergin the Datasets into One

In [15]:
df_merge = pd.concat([df_fake, df_true], axis=0)
df_merge.head(10)

Unnamed: 0,Tweets,class
0,Due to the recent outbreak for the Coronavirus...,0
5,CORONA UNMASKED: Chinese Intelligence Officer ...,0
9,Basic protective measures against the new coro...,0
14,Exposing yourself to the sun or to temperature...,0
15,You can recover from the coronavirus disease (...,0
16,Being able to hold your breath for 10 seconds ...,0
17,Drinking alcohol does not protect you against ...,0
18,COVID-19 virus can be transmitted in areas wit...,0
19,Cold weather and snow CANNOT kill the new coro...,0
20,Taking a hot bath does not prevent the new cor...,0


## Deleting Titlte, subject and date columns

In [None]:
# df = df_merge.drop(['title', 'subject', 'date'], axis=1)
# df.head(3)

## Randomizing rows in the Merged Dataset

In [17]:
df = df_merge.sample(frac=1)
df.head(10)

Unnamed: 0,Tweets,class
425,Experts debunk fringe theory linking China’s c...,0
870,A Banana A Day Keeps The Coronavirus Away,0
890,The new coronavirus CANNOT be transmitted thro...,0
276,Multiple vaccine corporations are working on a...,0
597,"For two years, Hong Kong successfully insulate...",1
458,Will holding your breath for 10 seconds reveal...,0
81,"Most Americans are weary of Covid-19, recent p...",1
952,VITAMIN C AND ITS APPLICATION TO THE TREATMENT...,0
1036,Vitamin C can cure coronavirus,0
78,Pfizer/BioNTech's Covid-19 mRNA vaccine provid...,1


# Checking for Null values

No null values as you can see in the results below

In [18]:
df.isnull().sum()

Tweets    0
class     0
dtype: int64

# Creating a function that will take the text from the dataset and remove special characters

In [19]:
def word_drop(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('\\W', " ", text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

## Calling funtion on the text column

In [21]:
df["Tweets"] = df["Tweets"].apply(word_drop)

In [22]:
df.head(10)

Unnamed: 0,Tweets,class
425,experts debunk fringe theory linking china s c...,0
870,a banana a day keeps the coronavirus away,0
890,the new coronavirus cannot be transmitted thro...,0
276,multiple vaccine corporations are working on a...,0
597,for two years hong kong successfully insulate...,1
458,will holding your breath for seconds reveal i...,0
81,most americans are weary of covid recent pol...,1
952,vitamin c and its application to the treatment...,0
1036,vitamin c can cure coronavirus,0
78,pfizer biontech s covid mrna vaccine provides...,1


# Defining our Dependent and Independent Variables

In [23]:
x = df['Tweets']
y = df['class']

# Spliting the Data in Train and Test

In [24]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= .25)

print('df', df.shape)
print('x_train', x_train.shape)
print('x_test', x_test.shape)
print('y_train', y_train.shape)
print('y_test', y_test.shape)

df (1765, 2)
x_train (1323,)
x_test (442,)
y_train (1323,)
y_test (442,)


# Converting Train data text into Vector
Using sklearn.feature_extraction.text.TfidfVectorizer
// from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
vectorization = TfidfVectorizer()
x_train_vectors = vectorization.fit_transform(x_train)
x_test_vectors = vectorization.transform(x_test)

# Model Selection

## Training Logistic Regression Model

In [26]:
model_1 = LogisticRegression()
model_1.fit(x_train_vectors, y_train)


LogisticRegression()

## Testing the Model

In [27]:
y_predicted = model_1.predict(x_test_vectors)
accuracy = accuracy_score(y_test, y_predicted)
precision, recall, f1, support = precision_recall_fscore_support(y_test, y_predicted) 

print("Accuracy = {:.2f}".format(accuracy))
print("Precision = ", precision)
print("Recall = ", recall)
print('F1-Score', f1)

Accuracy = 0.99
Precision =  [0.99209486 0.99470899]
Recall =  [0.99603175 0.98947368]
F1-Score [0.99405941 0.99208443]


1091    use of herbal drugs to treat covid  should be ...
790     is a lost sense of smell a symptom of covid   ...
132                        how does the coronavirus work 
1135    the coronavirus isn t alive  that s why it s s...
185     here is everything you need to know to protect...
                              ...                        
247     as more and more countries ease covid restrict...
1060     coronavirus  deaths in italy might have been ...
1055    who received orders from  above  to declare a ...
560                                 how is covid  spread 
284     johnson  amp  johnson temporarily halted produ...
Name: Tweets, Length: 442, dtype: object

In [43]:
df_2 = pd.read_csv('Datasets/manual_testing.csv')
df_2 = df_2.drop(['Unnamed: 0', 'class'], axis=1)
print(df_2.shape)
df_2.head(2)

(20, 1)


Unnamed: 0,Tweets
0,Did a Mutation Turbocharge the Coronavirus? No...
1,Will Hot Weather Kill the Coronavirus Where Yo...


In [56]:
input_ = df_2['Tweets'][3]
# input_
input_trans = vectorization.fit_transform(input_)

ValueError: Iterable over raw text documents expected, string object received.