In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re, string

In [2]:
true = pd.read_csv("true.csv")
fake = pd.read_csv("fake.csv")

In [3]:
fake["class"] = 0
true["class"] = 1

In [4]:
fake.head()

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [5]:
true.head()

Unnamed: 0,title,text,subject,date,class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [6]:
fake.shape, true.shape

((23481, 5), (21417, 5))

In [7]:
fake_manual_testing = fake.tail(10)
for i in range(23480, 23470, -1):
    fake.drop([i], axis=0, inplace=True)
true_manual_testing = true.tail(10)
for i in range(21416, 21406, -1):
    true.drop([i], axis=0, inplace=True)

In [8]:
manual_testing = pd.concat([fake_manual_testing, true_manual_testing], axis=0)
manual_testing.to_csv("News.csv")

In [9]:
news = pd.concat([fake, true], axis = 0)

In [10]:
news.drop(['title', 'subject', 'date'], axis=1, inplace=True)
news

Unnamed: 0,text,class
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0
...,...,...
21402,ON BOARD A U.S. MILITARY AIRCRAFT (Reuters) - ...,1
21403,WASHINGTON (Reuters) - The United States sugge...,1
21404,WASHINGTON (Reuters) - The United States has d...,1
21405,ISLAMABAD (Reuters) - Outlining a new strategy...,1


In [11]:
#shuffle dataframe
news = news.sample(frac=1)
news

Unnamed: 0,text,class
11662,BEIJING (Reuters) - Chinese tourists are still...,1
9205,(Reuters) - Legislation approved by Michigan l...,1
11800,(Reuters) - German Chancellor Angela Merkel s ...,1
20930,LONDON (Reuters) - U.S. President Donald Trump...,1
18887,DUBAI/PARIS (Reuters) - Saudi Arabia s decisio...,1
...,...,...
13846,The biggest thing a president can do for the p...,0
7428,Bernie Sanders has a knack for being able to l...,0
7944,DUBAI (Reuters) - Saudi Arabia said on Monday ...,1
13411,WASHINGTON (Reuters) - The U.S. State Departme...,1


In [12]:
news.isnull().sum()

text     0
class    0
dtype: int64

In [13]:
#removing special characters

def word_drop(text):
  text = text.lower()
  text = re.sub('\[.*?\]','', text)
  text = re.sub("\\W", " ", text)
  text = re.sub('https?://\S+|www\.\S+', '', text)
  text = re.sub('<.*?>+', '', text)
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
  text = re.sub('\w*\d\w*', '', text)
  return text

In [14]:
news["text"] = news["text"].apply(word_drop)

In [15]:
news

Unnamed: 0,text,class
11662,beijing reuters chinese tourists are still...,1
9205,reuters legislation approved by michigan l...,1
11800,reuters german chancellor angela merkel s ...,1
20930,london reuters u s president donald trump...,1
18887,dubai paris reuters saudi arabia s decisio...,1
...,...,...
13846,the biggest thing a president can do for the p...,0
7428,bernie sanders has a knack for being able to l...,0
7944,dubai reuters saudi arabia said on monday ...,1
13411,washington reuters the u s state departme...,1


In [16]:
x = news["text"]
y = news["class"]

In [17]:
# splitting into test and train

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .25)

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

### Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

logisticRegression = LogisticRegression()
logisticRegression.fit(xv_train, y_train)

LogisticRegression()

In [20]:
logisticRegression.score(xv_test, y_test)

0.9861853832442068

In [21]:
pred_logisticRegression = logisticRegression.predict(xv_test)

In [22]:
print(classification_report(y_test, pred_logisticRegression))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5861
           1       0.98      0.99      0.99      5359

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



### Decision Tree Classifier

In [23]:
from sklearn.tree import DecisionTreeClassifier

decisionTree = DecisionTreeClassifier()
decisionTree.fit(xv_train, y_train)

DecisionTreeClassifier()

In [24]:
decisionTree.score(xv_test, y_test)

0.9951871657754011

In [25]:
pred_decisionTree = decisionTree.predict(xv_test)

In [26]:
print(classification_report(y_test, pred_decisionTree))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5861
           1       1.00      0.99      0.99      5359

    accuracy                           1.00     11220
   macro avg       1.00      1.00      1.00     11220
weighted avg       1.00      1.00      1.00     11220



### Gradient Boosting Classifier

In [27]:
from sklearn.ensemble import GradientBoostingClassifier

gradientBoostingClassifier = GradientBoostingClassifier(random_state=0)
gradientBoostingClassifier.fit(xv_train, y_train)

GradientBoostingClassifier(random_state=0)

In [28]:
gradientBoostingClassifier.score(xv_test, y_test)

0.9949197860962566

In [29]:
pred_gradientBoostingClassifier = gradientBoostingClassifier.predict(xv_test)
print(classification_report(y_test, pred_gradientBoostingClassifier))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      5861
           1       0.99      1.00      0.99      5359

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



### Random Forest Classifier

In [30]:
from sklearn.ensemble import RandomForestClassifier

randomForestClassifier = RandomForestClassifier(random_state=0)
randomForestClassifier.fit(xv_train, y_train)

RandomForestClassifier(random_state=0)

In [31]:
randomForestClassifier.score(xv_test, y_test)

0.9900178253119429

In [32]:
pred_randomForestClassifier=  randomForestClassifier.predict(xv_test)

In [33]:
print(classification_report(y_test, pred_randomForestClassifier))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5861
           1       0.99      0.99      0.99      5359

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



### Manual Testing

In [34]:
def output(n):
  if n == 0:
    return "Fake"
  elif n == 1:
    return "True"


def manual_testing(news):
  testing_news = {"text": [news]}
  new_def_test = pd.DataFrame(testing_news)
  new_def_test["text"] = new_def_test["text"].apply(word_drop)
  new_x_test = new_def_test["text"]
  new_xv_test = vectorization.transform(new_x_test)
  pred_logisticRegression = logisticRegression.predict(new_xv_test)
  pred_decisionTree = decisionTree.predict(new_xv_test)
  pred_gradientBoostingClassifier = gradientBoostingClassifier.predict(new_xv_test)
  pred_randomForestClassifier = randomForestClassifier.predict(new_xv_test)

  # return print(
  #     "\n\nLogistic Regression: {} \nDecision Tree Classifier: {} \nGradient Boosting Classifier: {} \nRandom Forest Classifier: {}".format(
  #       output(pred_logisticRegression),
  #       output(pred_decisionTree),
  #       output(pred_gradientBoostingClassifier),
  #       output(pred_randomForestClassifier)  
  #     ))

  return print(output(pred_decisionTree))

In [35]:
try:
  while 1:
    print("Hey there i am KITT, Beware of fake news and don't send it across!")  
    headline = str(input("Tell me a news: "))
    manual_testing(headline)
except KeyboardInterrupt:
  print("Stopped!")

Hey there i am KITT, Beware of fake news and don't send it across!
Fake
Hey there i am KITT, Beware of fake news and don't send it across!
Fake
Hey there i am KITT, Beware of fake news and don't send it across!
Fake
Hey there i am KITT, Beware of fake news and don't send it across!
Stopped!
