In [None]:
# import statements
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import re
import nltk # natural language toolkit
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords # less semantic meaning while training (ex: it)
from nltk.stem import WordNetLemmatizer # stem - root words
# lemmatize - go to canonical/dictionary form of word
# all libraries used for text pre-processing

#User input passing
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset CSVs read
# in the next section live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Loading & Labeling the Datasets

In [None]:
# Load the Kaggle fake/real news CSVs from the mounted Drive folder.
# Order matters: the first path is the fake set, the second the real set.
df_fake, df_true = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/ai_project/Fake.csv',
        '/content/drive/MyDrive/ai_project/True.csv',
    )
)

In [None]:
# Preview the first rows of the fake-news frame (title, text, subject, date)
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [None]:
# Preview the first rows of the real-news frame (title, text, subject, date)
df_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [None]:
# Tag every row with its class: 0 = fake article, 1 = real article
df_fake['label'], df_true['label'] = 0, 1

# Stack both frames (fake rows first) and renumber the rows 0..n-1
df = pd.concat([df_fake, df_true], ignore_index=True)

In [None]:
# Discard the subject and date columns; title, text and label remain
df = df.drop(columns=['subject', 'date'])
df

Unnamed: 0,title,text,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,0
...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",1
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,1
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,1


In [None]:
# Count missing values per column. If any column reports a non-zero count,
# drop those rows (dropna) and reset the index before continuing.
df.isna().sum()

title    0
text     0
label    0
dtype: int64

In [None]:
# Report the number of exact duplicate rows. A bare expression in the middle
# of a cell is never displayed, so the count is printed explicitly.
n_duplicates = df.duplicated().sum()
print(f"Duplicate rows found: {n_duplicates}")

# Keep the first occurrence of each row and renumber the rows 0..n-1
df = df.drop_duplicates().reset_index(drop=True)
df

Unnamed: 0,title,text,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,0
...,...,...,...
39100,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1
39101,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",1
39102,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,1
39103,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,1


Data Preprocessing

In [None]:
#Text Preprocessing Function
# Resources shared across calls. They are created lazily on first use, so this
# cell can be defined before the NLTK corpora have been downloaded.
_PROCESS_TEXT_CACHE = {}


def _get_lemmatizer():
    """Return a shared WordNetLemmatizer, created on first use."""
    if 'lemmatizer' not in _PROCESS_TEXT_CACHE:
        _PROCESS_TEXT_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PROCESS_TEXT_CACHE['lemmatizer']


def _get_stop_words():
    """Return the NLTK English stop-word set, read from the corpus only once."""
    if 'stop_words' not in _PROCESS_TEXT_CACHE:
        _PROCESS_TEXT_CACHE['stop_words'] = frozenset(stopwords.words('english'))
    return _PROCESS_TEXT_CACHE['stop_words']


def process_text(text, remove_stopwords=False):
    """Normalize raw article text into a space-separated string of unique tokens.

    Steps, in order: collapse whitespace, replace non-word characters with
    spaces, drop isolated single letters, strip everything that is not an
    ASCII letter or whitespace, lowercase, tokenize, lemmatize, optionally
    remove English stop words, drop tokens of length <= 2, and finally keep
    only the first occurrence of each token (original order preserved).

    Parameters
    ----------
    text : str
        Raw text to clean.
    remove_stopwords : bool, default False
        When True, NLTK English stop words are removed. The default keeps
        them, so tokens such as "and", "the" and "that" survive as long as
        they are longer than two characters.
        NOTE(review): confirm whether stop words should be removed for
        training; if so, pass True both when training and at inference.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing survives).
    """
    text = re.sub(r'\s+', ' ', text)
    # remove special characters other than A-Z, a-z, 0-9, or "_"
    text = re.sub(r'\W', ' ', text)

    # remove single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    # remove non-alphabetical characters (digits, underscores, non-ASCII letters)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # convert to lower case
    text = text.lower()

    # tokenize, then reduce each token to its dictionary (lemma) form
    lemmatizer = _get_lemmatizer()
    words = [lemmatizer.lemmatize(w) for w in word_tokenize(text)]

    if remove_stopwords:
        stop = _get_stop_words()
        words = [word for word in words if word not in stop]

    # very short tokens carry little signal
    words = [word for word in words if len(word) > 2]

    # removing duplicate words; dict keys keep first-occurrence order
    return ' '.join(dict.fromkeys(words))

In [None]:
# Download the NLTK resources the preprocessing function relies on:
# tokenizer models, the stop-word list and WordNet for lemmatization.
# 'punkt_tab' is the tokenizer data that newer NLTK releases load in
# word_tokenize; 'punkt' is kept for older releases.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Clean every article body in place with the preprocessing function
df['text'] = df['text'].map(process_text)

In [None]:
# Pull the model inputs (cleaned text) and targets (0/1 label) out of the frame
X, y = df['text'].values, df['label'].values

In [None]:
# Display the array of cleaned article texts
X

array(['donald trump just couldn wish all american happy new year and leave that instead had give shout out his enemy hater the very dishonest fake news medium former reality show star one job our country rapidly grows stronger smarter want friend supporter even healthy president angry pant tweeted will great for america realdonaldtrump december tweet went down about welll you expect what kind sends greeting like this despicable petty infantile gibberish only lack decency won allow him rise above gutter long enough citizen bishop talbert swan talbertswan calvin calvinstowell your impeachment would make but also accept regaining control congress miranda yaver mirandayaver hear yourself talk when have include many people hate wonder why they alan sandoval alansandoval who word marlene can say koren pollitt korencarpenter here eve from including those fought lost badly don know love nothing been doing directed message easter thanksgiving anniversary pic twitter com fpaekypa daniel dale dd

In [None]:
# Split the cleaned texts first, then convert them to TF-IDF vectors.
# The vectorizer is fitted on the training texts only, so its vocabulary and
# IDF weights contain no information from the held-out test articles.
from sklearn.feature_extraction.text import TfidfVectorizer

X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42
)

vector = TfidfVectorizer()
X_train = vector.fit_transform(X_train_text)
# reuse the training vocabulary; unseen test words are ignored
X_test = vector.transform(X_test_text)

In [None]:
# Print the sparse training matrix as (row, column) -> TF-IDF weight entries
print(X_train)

  (0, 104650)	0.04453306144968803
  (0, 104612)	0.029430067089133816
  (0, 103004)	0.023121097190929193
  (0, 102533)	0.050315505905676836
  (0, 102512)	0.01749744676239836
  (0, 101932)	0.021068320416162717
  (0, 101787)	0.0316794189994909
  (0, 101781)	0.024491187750852725
  (0, 101759)	0.034913448890700235
  (0, 101753)	0.026373289511212816
  (0, 101706)	0.02713953378562029
  (0, 101494)	0.025894839522530726
  (0, 101412)	0.03602739802345902
  (0, 101400)	0.07015526361287588
  (0, 101259)	0.032473089924278345
  (0, 101151)	0.06798488444399092
  (0, 100925)	0.05955816248148324
  (0, 100681)	0.0346451472438252
  (0, 100493)	0.0871508415262736
  (0, 100491)	0.06568408642420352
  (0, 99764)	0.07396286917913668
  (0, 99763)	0.07167490279294388
  (0, 99760)	0.05303187542147304
  (0, 99510)	0.07876396999916274
  (0, 99453)	0.04334365998050893
  :	:
  (31283, 5942)	0.04092404199540158
  (31283, 5696)	0.05922403344299041
  (31283, 4910)	0.03529845690671547
  (31283, 4670)	0.04192441522452822

In [None]:
# Logistic Regression model: a binary classifier (0 = fake, 1 = real)
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Accuracy on the training set (compare with the test score to spot overfitting)
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_train)
# accuracy_score expects (y_true, y_pred); keep that order so the call stays
# correct if it is ever swapped for an asymmetric metric
print(accuracy_score(y_train, y_pred))

0.9942782252908835


In [None]:
# Accuracy on the held-out test set
y_pred_test = model.predict(X_test)
# accuracy_score expects (y_true, y_pred)
print(accuracy_score(y_test, y_pred_test))

0.98989898989899


In [None]:
def inference(text):
  """Classify a raw news snippet with the fitted vectorizer and model.

  The text goes through the same cleaning function used on the training
  data, is turned into a TF-IDF vector, and is then passed to the model.

  Parameters
  ----------
  text : str
      Raw article text.

  Returns
  -------
  str
      "Fake News" when the model predicts label 0, "Real News" otherwise.
      If the cleaned text shares no term with the fitted vocabulary, the
      feature vector is all zeros and any prediction would come from the
      model's bias alone; an explanatory message is returned instead.
  """
  features = vector.transform([process_text(text)])

  # nnz == 0 means no known word survived cleaning (e.g. gibberish input)
  if features.nnz == 0:
    return "Unable to classify: the text contains no words the model recognizes"

  # predict returns a one-element array; take the scalar label
  label = model.predict(features)[0]
  if label == 0:
    return "Fake News"
  else:
    return "Real News"

In [None]:
# Prompt for an article snippet and print the classifier's verdict
text = input("Enter text from a 2016 election news article: ")
print(inference(text))

Enter text from a 2016 election news article: mn,n,mn
Fake News
