## Detecting fake news with machine learning

In [1]:
conda update --all

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/katie/opt/anaconda3


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    anaconda-navigator-1.10.0  |           py37_0         6.1 MB
    argon2-cffi-20.1.0         |   py37h9ed2024_1          44 KB
    backcall-0.2.0             |     pyhd3eb1b0_0          13 KB
    backports.tempfile-1.0     |     pyhd3eb1b0_1          11 KB
    beautifulsoup4-4.9.3       |     pyha847dfd_0          86 KB
    bleach-3.3.0               |     pyhd3eb1b0_0         113 KB
    blosc-1.21.0               |       h2842e9f_0          49 KB
    boto3-1.17.11              |     pyhd3eb1b0_0          70 KB
    botocore-1.20.12           |     pyhd3eb1b0_1         3.5 MB
    brunsli-0.1                |       h23ab428_0         142 KB
    cctools-927.0.2            |       h5ba7a2e_

ld64-450.3           | 858 KB    | ##################################### | 100% 
psutil-5.8.0         | 336 KB    | ##################################### | 100% 
kiwisolver-1.3.1     | 52 KB     | ##################################### | 100% 
nest-asyncio-1.5.1   | 10 KB     | ##################################### | 100% 
bleach-3.3.0         | 113 KB    | ##################################### | 100% 
openssl-1.1.1j       | 2.2 MB    | ##################################### | 100% 
packaging-20.9       | 37 KB     | ##################################### | 100% 
setuptools-52.0.0    | 721 KB    | ##################################### | 100% 
parsel-1.5.2         | 262 KB    | ##################################### | 100% 
urllib3-1.26.3       | 105 KB    | ##################################### | 100% 
tifffile-2021.1.14   | 126 KB    | ##################################### | 100% 
conda-build-3.21.4   | 545 KB    | ##################################### | 100% 
nbformat-5.1.2       | 68 KB

In [2]:
#import libraries
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [3]:
#load datasets
df_fake = pd.read_csv("Fake.csv")
df_true = pd.read_csv("True.csv")

In [4]:
#initial exploration
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [5]:
df_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [6]:
df_fake['class'] = 0
df_true['class'] = 1

In [7]:
df_fake.shape, df_true.shape

((23481, 5), (21417, 5))

In [8]:
df_fake_manual_testing = df_fake.tail(10)
for i in range(23480, 23470, -1):
    df_fake.drop([i], axis=0, inplace=True)
    
df_true_manual_testing = df_true.tail(10)
for i in range(21416, 21406, -1):
    df_true.drop([i], axis=0, inplace=True)

In [9]:
#combine dfs in single df
df_manual_testing = pd.concat([df_fake_manual_testing, df_true_manual_testing], axis=0)

In [10]:
#save as csv
df_manual_testing.to_csv('manual_testing.csv')

In [11]:
#merge original dfs
df_merge = pd.concat([df_fake, df_true], axis=0)
df_merge.head()

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [12]:
#drop unnecessary columns
df = df_merge.drop(['title', 'subject', 'date'], axis=1)
df.head()

Unnamed: 0,text,class
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [13]:
#shuffle df
df = df.sample(frac=1)
df.head()

Unnamed: 0,text,class
15394,Career criminals professional grifters the ult...,0
19488,"PARIS (Reuters) - Marine Le Pen, the leader of...",1
5062,"In a continuation of the DNC hack, Democratic ...",0
18149,Pro-Trump supporters have known the truth for ...,0
11825,ISTANBUL (Reuters) - Turkish President Tayyip ...,1


In [14]:
df.reset_index(inplace = True)
df.drop(['index'], axis = 1, inplace = True)

In [15]:
#check for null values
df.isnull().sum()

text     0
class    0
dtype: int64

In [16]:
#remove special characters/punctuation
def word_drop(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

In [17]:
df['text'] = df['text'].apply(word_drop)

In [18]:
df.head(10)

Unnamed: 0,text,class
0,career criminals professional grifters the ult...,0
1,paris reuters marine le pen the leader of...,1
2,in a continuation of the dnc hack democratic ...,0
3,pro trump supporters have known the truth for ...,0
4,istanbul reuters turkish president tayyip ...,1
5,what is it that they don t get does anyone ev...,0
6,washington reuters the presidents of the u...,1
7,washington reuters a revised criminal just...,1
8,paul ryan nervously giggled during the republi...,0
9,aden reuters security forces in the southe...,1


In [19]:
#define independent and dependent variables
x = df['text']
y = df['class']

In [20]:
#split dataset into train/test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [21]:
#vectorize x variable
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [23]:
#apply logistic regression classification
from sklearn.linear_model import LogisticRegression

In [24]:
LR = LogisticRegression()
LR.fit(xv_train,y_train)

LogisticRegression()

In [25]:
pred_lr = LR.predict(xv_test)

In [26]:
LR.score(xv_test, y_test)

0.9871657754010695

In [27]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5902
           1       0.98      0.99      0.99      5318

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [28]:
#apply decision tree classification
from sklearn.tree import DecisionTreeClassifier

In [29]:
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

DecisionTreeClassifier()

In [30]:
pred_dt = DT.predict(xv_test)

In [31]:
DT.score(xv_test, y_test)

0.9952762923351158

In [32]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5902
           1       1.00      0.99      1.00      5318

    accuracy                           1.00     11220
   macro avg       1.00      1.00      1.00     11220
weighted avg       1.00      1.00      1.00     11220



In [33]:
#apply gradient boosting classification
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(xv_train, y_train)

In [35]:
pred_gbc = GBC.predict(xv_test)

In [36]:
GBC.score(xv_test, y_test)

0.9945632798573975

In [37]:
print(classification_report(y_test, pred_gbc))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      5902
           1       0.99      1.00      0.99      5318

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [38]:
#apply random forest classification
from sklearn.ensemble import RandomForestClassifier

In [39]:
RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)

RandomForestClassifier(random_state=0)

In [40]:
pred_rfc = RFC.predict(xv_test)

In [41]:
RFC.score(xv_test, y_test)

0.9903743315508021

In [42]:
print(classification_report(y_test, pred_rfc))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5902
           1       0.99      0.99      0.99      5318

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [43]:
#model testing with the manual dataset
def output_lable(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Not A Fake News"
    
def manual_testing(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(word_drop) 
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_LR = LR.predict(new_xv_test)
    pred_DT = DT.predict(new_xv_test)
    pred_GBC = GBC.predict(new_xv_test)
    pred_RFC = RFC.predict(new_xv_test)

    return print("\n\nLR Prediction: {} \nDT Prediction: {} \nGBC Prediction: {} \nRFC Prediction: {}".format(output_lable(pred_LR[0]), 
                                                                                                              output_lable(pred_DT[0]), 
                                                                                                              output_lable(pred_GBC[0]), 
                                                                                                              output_lable(pred_RFC[0])))

In [None]:
#input text from csv file to blank
news = str(input())
manual_testing(news)