In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
import re
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize,regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from textblob import TextBlob
import pickle

In [2]:
df = pd.read_csv('fake_news_datasets.csv')
df.head()

Unnamed: 0,text,label
0,No comment is expected from Barack Obama Membe...,1
1,Did they post their votes for Hillary already?,1
2,"Now, most of the demonstrators gathered last ...",1
3,A dozen politically active pastors came here f...,0
4,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [3]:
df.shape

(20000, 2)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    19989 non-null  object
 1   label   20000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 312.6+ KB


In [5]:
df.isna().sum()

text     11
label     0
dtype: int64

In [6]:
df = df.fillna('')

In [7]:
df.isna().sum()

text     0
label    0
dtype: int64

In [8]:
port_stem = PorterStemmer()
stop = stopwords.words('english')

In [9]:
def stemming(content):
    con = re.sub('[^\w\s]','',content)
    con = con.lower()
    con = con.split()
    con = [port_stem.stem(word) for word in con if not word in stop]
    return ' '.join(con)

In [10]:
stemming('Hi this is chando')

'hi chando'

In [11]:
df['text'] = df['text'].apply(stemming)

In [12]:
df['text']

0        comment expect barack obama member fyf911 fuky...
1                                post vote hillari alreadi
2        demonstr gather last night exercis constitut p...
3        dozen polit activ pastor came privat dinner fr...
4        rs28 sarmat missil dub satan 2 replac ss18 fli...
                               ...                        
19995    frankfurt reuter german public prosecutor char...
19996    wacki conserv new insan attempt invent obama s...
19997    scott walker 2016 begin today speech freedomsu...
19998    washington reuter us immigr offici plan monthl...
19999    governor rick snyder emerg manag team liter po...
Name: text, Length: 20000, dtype: object

In [13]:
X = df['text']
y = df['label']

In [14]:
y.shape

(20000,)

In [15]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [16]:
vect = TfidfVectorizer()

In [17]:
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [18]:
X_train.shape,X_test.shape 

((16000, 137303), (4000, 137303))

In [19]:
dt = DecisionTreeClassifier()

In [20]:
dt.fit(X_train,y_train)

In [21]:
y_pred = dt.predict(X_test)
accuracy_score(y_test,y_pred)

0.904

In [22]:
confusion_matrix(y_test,y_pred)

array([[1735,  189],
       [ 195, 1881]])

In [23]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)

In [24]:
y_pred_rf = rf.predict(X_test)
accuracy_score(y_test,y_pred_rf)

0.9065

In [25]:
y_pred_rf = rf.predict(X_test)
confusion_matrix(y_test,y_pred_rf)

array([[1742,  182],
       [ 192, 1884]])

In [26]:
xgb = XGBClassifier()
xgb.fit(X_train,y_train)

In [27]:
y_pred_rf = xgb.predict(X_test)
accuracy_score(y_test,y_pred_rf)

0.954

In [28]:
y_pred_rf = xgb.predict(X_test)
confusion_matrix(y_test,y_pred_rf)

array([[1797,  127],
       [  57, 2019]])

In [32]:
pickle.dump(vect,open("vector.pkl","wb"))

In [33]:
pickle.dump(xgb,open("model.pkl","wb"))

In [34]:
vector = pickle.load(open("vector.pkl","rb"))

In [35]:
model = pickle.load(open("model.pkl","rb"))

In [36]:
def fake_news(news):
    news = stemming(news)
    input_data = [news]
    vector_form = vector.transform(input_data)
    prediction = model.predict(vector_form)
    return prediction

In [37]:
fake_news("""comment expect barack obama member fyf911 fukyoflag blacklivesmatt movement call lynch hang white peopl cop encourag other radio show tuesday night turn tide kill white peopl cop send messag kill black peopl americaon fyoflag organ call sunshin radio blog show host texa call sunshin fing opinion radio show snapshot fyf911 lolatwhitefear twitter page 953 pm show urg support call fyf911 tonight continu dismantl illus white snapshot twitter radio call invit fyf911th radio show air 1000 pm eastern standard timedur show caller clearli call lynch kill white peoplea 239 minut clip radio show heard provid breitbart texa someon would like refer hannib alreadi receiv death threat result interrupt fyf911 confer callsan unidentifi black man said mother fker start fing like us bunch nier takin one us roll said caus alreadi roll gang anyway six seven black mother fcker see white person lynch ass let turn tabl conspir cop start lose peopl state emerg specul one two thing would happen bigass r war nier go start backin alreadi get kill fk got lose sunshin could heard say yep true fking true said need turn tabl kid get shot somebodi need becom sacrific sideh said everybodi st whatev like say everybodi differ posit war continu give fk anyway said might well util st turn tabl ner said way start lookin like havin mani casualti causal side instead kill black peopl black live matter mother fker got make matter find mother fker alon snap ass fin hang damn tree take pictur send mother fker need one exampl peopl start watchin turn tabl st said said start trickledown effect said one white person hung flathang start trickledown effect continu black peopl good start trend said get upperhand anoth black man spoke say need kill cop kill us first black male said best method right breitbart texa previous report sunshin upset racist white peopl infiltr disrupt one confer call subsequ releas phone number one infiltr veteran immedi start receiv threaten callson fyoflag movement support allegedli told veteran infiltr publicli post confer call go rape gut pregnant wife fing piec sht unborn creatur hung tree breitbart texa previous encount sunshin sandra bland protest waller counti jail texa said white peopl kill told journalist photograph see nappyass hair head mean one milit negro said protest redneck motherfk murder sandra bland nappi hair like fyf911 black radic say hold imperi power actual respons terrorist attack septemb 11th account day report breitbart texa sever websit twitter handl movement palmetto star describ one head organ said youtub video support burn symbol illus superior fals white supremaci like american flag british flag polic uniform ku klux klan hoodssierra mcgrone nocturnu libertu post help young afrikan clean rag oppress post two photo one appear photo black man wipe nake butt american flagfor entir stori breitbart news""")

array([0])

In [63]:
result = fake_news("""share twitter twelfth even better thirteenth time youv carv pumpkin jackolantern becom real drag fear trickshoot kirsten joy weiss herewith way brighten halloween daywhil make bit smokier well need 22 rifl pumpkin ammunit safe space away peopl especi liber worst favorit holiday gourd oh handi tip guid your look carv hideou grin pumpkin use bullet entri rather your go exit point get ghoulish appear that sure make impress trickortreat yeah bet theyll pick treat expert guid also recommend make game like halloween trickortr alway practic safeti first happi halloween""")

In [64]:
result

array([1])

In [65]:
if result == [0]:
    print('reliable')
else:
    print('unreliable')

unreliable
