# Fake News Detector Machine Learning Model 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_recall_fscore_support
import re
import string

# Loading The Datasets

For this exploratory Pipeline, I am using two separated datasets I found on Google, one CSV file is the **true news dataset** and the other file is the ****fake news dataset****.

In [3]:
df_fake = pd.read_csv('Data_1/True.csv')
df_fake.head(3)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"


In [5]:
df_true = pd.read_csv('Data_1/Fake.csv')
df_true.head(3)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"


# Adding Lables to each Dataset

In [6]:
df_fake["class"] = 0
df_true["class"] = 1

# Dataset Shapes

In [7]:
df_fake.shape, df_true.shape

((21417, 5), (23481, 5))

# Taking some rows from the two datasets

I am creating two variables to store the last 10 rows of each dataset to then export it to a new csv file.

Our Goal is to use those news for manual testing after we build Our model.

In [8]:
# Storing last 10 rows of df_fake into a variable
df_fake_manual_testing = df_fake.tail(10)

# Deleting the last 10 rows from dataset
df_fake.drop(df_fake.tail(10).index,
        inplace = True)

# Storing last 10 rows of df_true into a variable
df_true_manual_testing = df_true.tail(10)

# Deleting the last 10 rows from dataset
df_true.drop(df_true.tail(10).index,
        inplace = True)

# Dataset Shapes Now

In [9]:
df_fake.shape, df_true.shape


((21407, 5), (23471, 5))

# Creating a Dataframe to store the 10 rows from df_true and df_fake

Also I am exporting that datafram in an CSV file out of this notebook.

In [10]:
df_manual_testing = pd.concat([df_fake_manual_testing, df_true_manual_testing], axis=0)
df_manual_testing.to_csv("manual_testing.csv") 

# Feature Engineering 
## Mergin the Datasets into One

In [11]:
df_merge = pd.concat([df_fake, df_true], axis=0)
df_merge.head(10)

Unnamed: 0,title,text,subject,date,class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,"December 29, 2017",0
6,"Trump says Russia probe will be fair, but time...","WEST PALM BEACH, Fla (Reuters) - President Don...",politicsNews,"December 29, 2017",0
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,politicsNews,"December 29, 2017",0
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,politicsNews,"December 29, 2017",0
9,Alabama official to certify Senator-elect Jone...,WASHINGTON (Reuters) - Alabama Secretary of St...,politicsNews,"December 28, 2017",0


## Deleting Titlte, subject and date columns

In [12]:
df = df_merge.drop(['title', 'subject', 'date'], axis=1)
df.head(3)

Unnamed: 0,text,class
0,WASHINGTON (Reuters) - The head of a conservat...,0
1,WASHINGTON (Reuters) - Transgender people will...,0
2,WASHINGTON (Reuters) - The special counsel inv...,0


## Randomizing rows in the Merged Dataset

In [13]:
df = df.sample(frac=1)
df.head(10)

Unnamed: 0,text,class
991,WASHINGTON (Reuters) - The United States wants...,0
6201,Ever since Target decided to stand up to trans...,1
8654,Can you imagine if Republicans interrogated ev...,1
19622,Portland rioters have been chasing a Trump sup...,1
11514,The cash flows through the State Department an...,1
13499,NAIROBI (Reuters) - The U.S. military did not ...,0
10590,Fox News Channel s Jeanine Pirro went after th...,1
5715,WASHINGTON (Reuters) - The U.S. Congress moved...,0
13118,,1
17387,BISHKEK (Reuters) - Kyrgyz opposition leader O...,0


# Checking for Null values

No null values as you can see in the results below

In [14]:
df.isnull().sum()

text     0
class    0
dtype: int64

# Creating a function that will take the text from the dataset and remove special characters

In [15]:
def word_drop(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('\\W', " ", text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

## Calling funtion on the text column

In [16]:
df["text"] = df["text"].apply(word_drop)

In [17]:
df.head(10)

Unnamed: 0,text,class
991,washington reuters the united states wants...,0
6201,ever since target decided to stand up to trans...,1
8654,can you imagine if republicans interrogated ev...,1
19622,portland rioters have been chasing a trump sup...,1
11514,the cash flows through the state department an...,1
13499,nairobi reuters the u s military did not ...,0
10590,fox news channel s jeanine pirro went after th...,1
5715,washington reuters the u s congress moved...,0
13118,,1
17387,bishkek reuters kyrgyz opposition leader o...,0


# Defining our Dependent and Independent Variables

In [18]:
x = df['text']
y = df['class']

# Spliting the Data in Train and Test

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= .25)

print('df', df.shape)
print('x_train', x_train.shape)
print('x_test', x_test.shape)
print('y_train', y_train.shape)
print('y_test', y_test.shape)

df (44878, 2)
x_train (33658,)
x_test (11220,)
y_train (33658,)
y_test (11220,)


# Converting Train data text into Vector
Using sklearn.feature_extraction.text.TfidfVectorizer
// from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
vectorization = TfidfVectorizer()
x_train_vectors = vectorization.fit_transform(x_train)
x_test_vectors = vectorization.transform(x_test)

# Model Selection

## Training Logistic Regression Model

In [21]:
model_1 = LogisticRegression()
model_1.fit(x_train_vectors, y_train)


LogisticRegression()

## Testing the Model

In [22]:
y_predicted = model_1.predict(x_test_vectors)
accuracy = accuracy_score(y_test, y_predicted)
precision, recall, f1, support = precision_recall_fscore_support(y_test, y_predicted) 

print("Accuracy = {:.2f}".format(accuracy))
print("Precision = ", precision)
print("Recall = ", recall)
print('F1-Score', f1)

Accuracy = 0.99
Precision =  [0.98336798 0.99072356]
Recall =  [0.98953975 0.98523985]
F1-Score [0.98644421 0.9879741 ]
