In [1]:
#https://www.kaggle.com/c/fake-news/data?select=train.csv
#Build a system to identify unreliable news articles


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import nltk
%matplotlib inline

In [2]:
df=pd.read_csv('fake_news.csv')

In [3]:
df.count()

id        20800
title     20242
author    18843
text      20761
label     20800
dtype: int64

In [4]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [5]:
df=df.dropna().reset_index()
df.count()

index     18285
id        18285
title     18285
author    18285
text      18285
label     18285
dtype: int64

In [6]:
from keras.models import Sequential
from keras.layers import *
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [7]:
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

In [8]:
corpus=[]
for i in range(0,len(df)):
    review = re.sub('[^a-zA-Z]',' ',df['title'][i])
    review = review.lower()
    review = review.split()
    review = [token for token in review if token not in stopword_list]
    review = ' '.join(review)
    corpus.append(review)

In [9]:
vocsize=5000
onehot_rep = [one_hot(words,vocsize)  for words in corpus]

In [10]:
onehot_rep[:2]

[[378, 3309, 991, 1805, 2861, 4607, 1740, 679, 1638, 3088],
 [4357, 4541, 4966, 864, 2503, 4939, 221]]

In [11]:
#Embedding representation

In [12]:
max_sent_len=20
embeded_seq = pad_sequences(onehot_rep,padding='pre',maxlen=max_sent_len)

In [13]:
embeded_seq.shape

(18285, 20)

In [15]:
vocsize,embeded_seq,max_sent_len

(5000, array([[   0,    0,    0, ...,  679, 1638, 3088],
        [   0,    0,    0, ..., 2503, 4939,  221],
        [   0,    0,    0, ..., 4424, 4660,  106],
        ...,
        [   0,    0,    0, ..., 2474, 2596, 1227],
        [   0,    0,    0, ...,  846, 3830, 1759],
        [   0,    0,    0, ...,  380, 2238, 4629]], dtype=int32), 20)

In [16]:
#creating Model
embeding_vec_fetures=40
model = Sequential()
model.add(Embedding(vocsize,embeding_vec_fetures,input_length=max_sent_len))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [17]:
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 40)            200000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               56400     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
import numpy as np
X_final = np.array(embeded_seq)
Y_final = np.array(df['label'])

In [19]:
X_final.shape , Y_final.shape

((18285, 20), (18285,))

In [20]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X_final,Y_final,test_size=.33 , random_state=42)

In [21]:
X_train.shape,Y_train.shape

((12250, 20), (12250,))

In [22]:
X_test.shape,Y_test.shape

((6035, 20), (6035,))

In [23]:
len(Y_train)

12250

In [24]:
#Model Training
model.fit(X_train,Y_train,validation_data=(X_test,Y_test),epochs=3,batch_size=64)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 12250 samples, validate on 6035 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x7ff6bada8bd0>

In [25]:
y_pred= model.predict_classes(X_test)

In [26]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [27]:
confusion_matrix(Y_test,y_pred)

array([[3104,  315],
       [ 211, 2405]])

In [28]:
accuracy_score(Y_test,y_pred)

0.9128417564208782