In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

In [2]:
#Import data sets
f_data = pd.read_csv("Fake.csv")
t_data = pd.read_csv("True.csv")

In [3]:
#Insert type identifier columns on data sets
f_data["type"] = 0
t_data["type"] = 1

In [4]:
#Check if type was properly added
f_data.head()

Unnamed: 0,title,text,subject,date,type
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [5]:
t_data.head()

Unnamed: 0,title,text,subject,date,type
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [6]:
#Check number of lines&columns on each data set
f_data.shape, t_data.shape

((23481, 5), (21417, 5))

In [7]:
#Move last 10 entries for manual testing
f_data_manual_testing = f_data.tail(10)
for i in range(23480, 23470, -1):
    f_data.drop([i], axis = 0, inplace = True)

t_data_manual_testing = t_data.tail(10)
for i in range(21416, 21406, -1):
    t_data.drop([i], axis = 0, inplace = True)

In [8]:
#Check if last 10 entries were moved
f_data.shape, t_data.shape

((23471, 5), (21407, 5))

In [9]:
f_data_manual_testing

Unnamed: 0,title,text,subject,date,type
23471,Seven Iranians freed in the prisoner swap have...,"21st Century Wire says This week, the historic...",Middle-east,"January 20, 2016",0
23472,#Hashtag Hell & The Fake Left,By Dady Chery and Gilbert MercierAll writers ...,Middle-east,"January 19, 2016",0
23473,Astroturfing: Journalist Reveals Brainwashing ...,Vic Bishop Waking TimesOur reality is carefull...,Middle-east,"January 19, 2016",0
23474,The New American Century: An Era of Fraud,Paul Craig RobertsIn the last years of the 20t...,Middle-east,"January 19, 2016",0
23475,Hillary Clinton: ‘Israel First’ (and no peace ...,Robert Fantina CounterpunchAlthough the United...,Middle-east,"January 18, 2016",0
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",0
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",0
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",0
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",0
23480,10 U.S. Navy Sailors Held by Iranian Military ...,21st Century Wire says As 21WIRE predicted in ...,Middle-east,"January 12, 2016",0


In [10]:
t_data_manual_testing

Unnamed: 0,title,text,subject,date,type
21407,"Mata Pires, owner of embattled Brazil builder ...","SAO PAULO (Reuters) - Cesar Mata Pires, the ow...",worldnews,"August 22, 2017",1
21408,"U.S., North Korea clash at U.N. forum over nuc...",GENEVA (Reuters) - North Korea and the United ...,worldnews,"August 22, 2017",1
21409,"U.S., North Korea clash at U.N. arms forum on ...",GENEVA (Reuters) - North Korea and the United ...,worldnews,"August 22, 2017",1
21410,Headless torso could belong to submarine journ...,COPENHAGEN (Reuters) - Danish police said on T...,worldnews,"August 22, 2017",1
21411,North Korea shipments to Syria chemical arms a...,UNITED NATIONS (Reuters) - Two North Korean sh...,worldnews,"August 21, 2017",1
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",1


In [11]:
#Merge fake and true data sets
merged_data = pd.concat([f_data, t_data], axis = 0)

#Check if merge was done
merged_data.shape

(44878, 5)

In [12]:
#Remove unneeded columns
data = merged_data.drop(['title', 'subject', 'date'], axis = 1)

In [13]:
#Check if null values exist
data.isnull().sum()

text    0
type    0
dtype: int64

In [14]:
#Shuffle the rows of the data
data = data.sample(frac = 1)

#Check if shuffle was done
data.head()

Unnamed: 0,text,type
11219,Zinke and Brandyburg accompanied Spicer to the...,0
302,(Reuters) - U.S Democratic Senator Al Franken ...,1
5601,WASHINGTON (Reuters) - Homeland Security Secre...,1
5899,BEIRUT (Reuters) - A Syrian Christian Orthodox...,1
3609,WASHINGTON (Reuters) - The leaders of the U.S....,1


In [15]:
#Reset index of the data and remove the old index values
data.reset_index(inplace = True)
data.drop(['index'], axis = 1, inplace = True)

In [16]:
#Check if reset of index was done
data.head()

Unnamed: 0,text,type
0,Zinke and Brandyburg accompanied Spicer to the...,0
1,(Reuters) - U.S Democratic Senator Al Franken ...,1
2,WASHINGTON (Reuters) - Homeland Security Secre...,1
3,BEIRUT (Reuters) - A Syrian Christian Orthodox...,1
4,WASHINGTON (Reuters) - The leaders of the U.S....,1


In [17]:
#Create function to clean and normalize text
def normalize(text):
    #Convert all characters in the text to lowercase
    text = text.lower()
    #Remove square brackets and their contents from the text
    text = re.sub('\[.*?\]', '', text)
    #Replace any non-alphanumeric characters (excluding underscore) with a space
    text = re.sub("\\W", " ", text)
    #Remove URLs starting with "http://" or "https://" or "www." from the text
    text = re.sub('https?://\S+|www\.\S+', '', text)
    #Remove HTML tags from the text
    text = re.sub('<.*?>+', '', text)
    #Remove punctuation marks
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    #Remove newline characters from the text
    text = re.sub('\n', '', text)
    #Remove any word containing digits from the text
    text = re.sub('\w*\d\w*', '', text)
    return text

In [18]:
#Apply normalize function to each element in the text column
data['text'] = data['text'].apply(normalize)

In [19]:
#Check if normalization was done
data.head()

Unnamed: 0,text,type
0,zinke and brandyburg accompanied spicer to the...,0
1,reuters u s democratic senator al franken ...,1
2,washington reuters homeland security secre...,1
3,beirut reuters a syrian christian orthodox...,1
4,washington reuters the leaders of the u s ...,1


In [20]:
#Define dependent and independent variables
x = data['text']
y = data['type']

In [21]:
#Split the input data (x) and target labels (y) into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25)

In [22]:
#Vectorize textual data into numerical features
from sklearn.feature_extraction.text import TfidfVectorizer

vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [23]:
#Train the model to learn relationship between training data and target labels
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(xv_train, y_train)

In [24]:
#Generate predictions from our trained LogisticRegression model
pred_lr = lr.predict(xv_test)

#Check accuracy score from the model
lr.score(xv_test, y_test)

0.9862745098039216

In [25]:
#Print classification report that summarizes the performance of the classifier on the test data
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      5845
           1       0.98      0.99      0.99      5375

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [26]:
#Train another model
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(xv_train, y_train)

In [27]:
#Generate predictions from our trained DecisionTreeClassifier model
pred_dt = dt.predict(xv_test)

#Check accuracy score from the model
dt.score(xv_test, y_test)

0.9967023172905526

In [28]:
#Print classification report that summarizes the performance of the classifier on the test data
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5845
           1       1.00      1.00      1.00      5375

    accuracy                           1.00     11220
   macro avg       1.00      1.00      1.00     11220
weighted avg       1.00      1.00      1.00     11220



In [29]:
#Train another model
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(random_state = 0)
gb.fit(xv_train, y_train)

In [30]:
#Generate predictions from our trained GradientBoostingClassifier model
pred_gb = gb.predict(xv_test)

#Check accuracy score from the model
gb.score(xv_test, y_test)

0.9958110516934047

In [31]:
#Print classification report that summarizes the performance of the classifier on the test data
print(classification_report(y_test, pred_gb))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      5845
           1       0.99      1.00      1.00      5375

    accuracy                           1.00     11220
   macro avg       1.00      1.00      1.00     11220
weighted avg       1.00      1.00      1.00     11220



In [32]:
#Train another model
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state = 0)
rf.fit(xv_train, y_train)

In [33]:
#Generate predictions from our trained RandomForestClassifier model
pred_rf = rf.predict(xv_test)

#Check accuracy score from the model
rf.score(xv_test, y_test)

0.9861853832442068

In [34]:
#Print classification report that summarizes the performance of the classifier on the test data
print(classification_report(y_test, pred_rf))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      5845
           1       0.99      0.98      0.99      5375

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [35]:
#Funtions for prediction
def result(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Real News"

def test(news):
    test_news = {"text":[news]}
    new_def_test = pd.DataFrame(test_news)
    new_def_test["text"] = new_def_test["text"].apply(normalize)
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_lr = lr.predict(new_xv_test)
    pred_dt = dt.predict(new_xv_test)
    pred_gb = gb.predict(new_xv_test)
    pred_rf = rf.predict(new_xv_test)
    return print("\n\nLR Prediction: {} \nDT Prediction: {} \nGB Prediction: {} \nRF Prediction: {}".format(result(pred_lr[0]), result(pred_dt[0]), result(pred_gb[0]), result(pred_rf[0])))

In [38]:
#News Input for prediction
news = str(input())
test(news)

 Former President Donald Trump said in a new interview with Time magazine that he doesn't think there will be political violence around the 2024 election because he believes he'll win — but that it "always depends on the fairness of an election."  The comments came along with a statement that Trump would "consider" pardoning every person who has been charged or convicted for rioting at the Capitol on Jan. 6, 2021, after the then-president rallied his followers against what he has repeatedly and baselessly called a "rigged" election.  Trump also answered questions digging into his campaign position on abortion policy being left up to the states — and deflecting questions pressing him on any potential federal action, including his position on whether abortion medication should be available. And Trump reinforced past statements he has made on Russia doing “whatever the hell they want” to NATO countries who don’t pay their “fair share” and the extent of a military crackdown he plans to ord



LR Prediction: Real News 
DT Prediction: Fake News 
GB Prediction: Fake News 
RF Prediction: Fake News
