In [107]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 


In [108]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ishik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [109]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [110]:
#Data Pre-Processing 
#loading the dataset to a pandas dataframe

fake_news_dataset = pd.read_csv("Data/fake.csv")
true_news_dataset = pd.read_csv("Data/true.csv")


In [111]:
fake_news_dataset
true_news_dataset

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017"
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017"
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017"
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017"


In [112]:
fake_news_dataset['label']= 0
true_news_dataset['label']= 1

In [113]:
news_dataset = pd.concat([true_news_dataset , fake_news_dataset] , ignore_index = True)
news_dataset

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1
...,...,...,...,...,...
44893,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",0
44894,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",0
44895,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",0
44896,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",0


In [114]:
news_dataset = news_dataset.sample(frac=1).reset_index(drop = True)
news_dataset.head()


Unnamed: 0,title,text,subject,date,label
0,Dumbass Alaska Lawmaker Claims Women Have Abo...,Generations upon generations of incest have ap...,News,"May 5, 2017",0
1,"North Korea deepening economic, diplomatic iso...",MEXICO CITY (Reuters) - U.S. Defense Secretary...,worldnews,"September 15, 2017",1
2,"Turkey, Iraq agree on opening second border ga...",ISTANBUL (Reuters) - Turkey and Iraq have agre...,worldnews,"October 26, 2017",1
3,Kenya vote in balance as crisis deepens after ...,NAIROBI (Reuters) - Kenya lurched deeper into ...,worldnews,"October 11, 2017",1
4,WATCH: Female Cop Halts Sex Offender’s Violen...,"Earlier in January, 31-year-old Michael Cox wa...",News,"February 1, 2016",0


In [115]:
news_dataset.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [116]:
news_dataset['content'] = news_dataset.title + " " + news_dataset.text

In [117]:
news_dataset = news_dataset[['content' , 'label']]
news_dataset.head()


Unnamed: 0,content,label
0,Dumbass Alaska Lawmaker Claims Women Have Abo...,0
1,"North Korea deepening economic, diplomatic iso...",1
2,"Turkey, Iraq agree on opening second border ga...",1
3,Kenya vote in balance as crisis deepens after ...,1
4,WATCH: Female Cop Halts Sex Offender’s Violen...,0


In [118]:
#Stemming

port_stem = PorterStemmer()

In [119]:

port_stem = PorterStemmer()
stop_words = set(stopwords.words('english'))

def stemming(content):
    # Remove non-alphabet characters and lowercase
    content = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    
    # Remove stopwords and stem
    content = [port_stem.stem(word) for word in content if word not in stop_words]
    
    return ' '.join(content)


In [120]:
from tqdm import tqdm
tqdm.pandas()

news_dataset['content'] = news_dataset['content'].progress_apply(stemming)


100%|██████████| 44898/44898 [09:46<00:00, 76.61it/s] 


In [123]:
news_dataset.to_csv('Data/stemmed_dataset.csv', index=False)


In [124]:
news_dataset = pd.read_csv('Data/stemmed_dataset.csv')


In [125]:
news_dataset

Unnamed: 0,content,label
0,dumbass alaska lawmak claim women abort free t...,0
1,north korea deepen econom diplomat isol matti ...,1
2,turkey iraq agre open second border gate turki...,1
3,kenya vote balanc crisi deepen odinga quit nai...,1
4,watch femal cop halt sex offend violent rampag...,0
...,...,...
44893,defi trump iran say boost missil capabl london...,1
44894,state file lawsuit challeng trump decis dreame...,1
44895,trump brag number peopl come inaugur realiz pr...,0
44896,trump consid democrat senat manchin energi sec...,1


In [127]:
#Seperating the data and label

X = news_dataset['content'].values
Y = news_dataset['label'].values
 

In [None]:
print(X)

['trump meet senat ernst amid vice presidenti specul trump meet senat ernst amid vice presidenti specul washington reuter presumpt republican presidenti nomine donald trump met u senat joni ernst iowa monday feed specul could short list consider vice presidenti run mate fox news report ernst told network new york real estat mogul good convers continu share insight donald need strengthen economi keep nation safe ensur america alway strong stabil forc around globe said trump tweet would meet ernst new jersey done great job senat iowa said trump met indiana governor mike penc also mention potenti run mate wife saturday republican sourc told reuter former hous repres speaker newt gingrich new jersey governor chri christi top trump vice presidenti short list name watch includ u senat bob corker tennesse jeff session alabama governor mari fallin oklahoma sourc said'
 'testi sheila jackson lee challeng trump budget director doctor director video testi sheila jackson lee challeng trump budget 

In [None]:
print(Y)

[1 0 1 ... 1 1 0]


In [None]:
X.shape

(44898,)

In [128]:
#Converting the textual data to numerical data

vectorizer =  TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)

In [129]:
print(X)

  (0, 298)	0.034651144309092476
  (0, 334)	0.34113191854024966
  (0, 430)	0.08458216449198613
  (0, 660)	0.027080118767662347
  (0, 903)	0.04825731766235165
  (0, 1758)	0.1790091459178884
  (0, 2575)	0.02228586254325063
  (0, 2859)	0.0880302838301916
  (0, 3186)	0.02695617060694764
  (0, 3254)	0.03192350154408127
  (0, 3540)	0.03777104569839314
  (0, 4301)	0.024613859310058016
  (0, 4381)	0.04197793727644731
  (0, 4643)	0.027014233293551473
  (0, 5023)	0.04508562557559718
  (0, 5313)	0.050160427202307116
  (0, 5357)	0.022436971367573923
  (0, 5438)	0.08207392085591153
  (0, 7991)	0.03432985003769001
  (0, 9309)	0.05256655501612637
  (0, 9861)	0.044858219178672554
  (0, 10249)	0.033665881311533676
  (0, 11631)	0.03565892897030223
  (0, 11930)	0.03351197887808072
  (0, 12217)	0.05548421676897582
  :	:
  (44897, 9454)	0.14531419211008662
  (44897, 11723)	0.12283760595947181
  (44897, 12642)	0.20184298362727632
  (44897, 15406)	0.18363641024842925
  (44897, 16113)	0.13668339618620948
  (44

SPLITTING THE DATASET INTO TRAINING AND TESTING DATA

In [130]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

Training the Model : Logistic Regression 

In [None]:
model = LogisticRegression()

In [None]:
model.fit(X_train, Y_train)

Evaluation

In [None]:
#Accuracy Score On the training data

X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [None]:
print("The Accuracy Score of the training data:" , training_data_accuracy)

The Accuracy Score of the training data: 0.9918703714015257


In [None]:
#Accuracy Score On the testing data

X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [None]:
print("The Accuracy Score of the test data:" , test_data_accuracy)

The Accuracy Score of the test data: 0.9880846325167038


Prediction

In [None]:
X_new = X_test[0]
prediction = model.predict(X_new)

if (prediction[0] == 0):
  print("The News is Fake.")
else:
  print("The news is Real.")

The news is Real.


In [None]:
import pickle

# Save the trained model
with open('model/fake_news_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Save the fitted TF-IDF vectorizer
with open('model/tfidf_vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)
