In [2]:
#importing libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/my-data/data.csv


In [48]:
#libraries used 

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional

In [5]:
# raw data is the data which is output from previous notebook (gathering fake news dataset)
data = pd.read_csv('../input/my-data/data.csv',index_col = 0)
print(data.shape)
data = data.reset_index(drop = True)
data.head()

(34324, 2)


Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Fake
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",True
2,Why the Truth Might Get You Fired,Fake
3,15 Civilians Killed In Single US Airstrike Hav...,Fake
4,Iranian woman jailed for fictional unpublished...,Fake


In [6]:
#encoding the label col
data['label'] = np.where(data['label'] == 'Fake',0,1)
# 0 - fake , 1 - true
data.head()

Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,0
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",1
2,Why the Truth Might Get You Fired,0
3,15 Civilians Killed In Single US Airstrike Hav...,0
4,Iranian woman jailed for fictional unpublished...,0


In [7]:
# lets see the value counts of the classes 
data['label'].value_counts() / len(data)

1    0.588014
0    0.411986
Name: label, dtype: float64

# Data Preprocessing

In [8]:
## delete it later
data['text'][0]

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

In [21]:
text = list(data['text'])
stop_words = set(stopwords.words('english'))

ps=PorterStemmer()
corpus=[]

from tqdm import tqdm 

for t in tqdm(text):
    result = re.sub('[^a-zA-Z]',' ',t)
    result = result.lower()
    result = result.split()
    result = [ps.stem(word) for word in result if not word in stop_words]
    result = ' '.join(result)
    corpus.append(result)

100%|██████████| 34324/34324 [00:10<00:00, 3139.95it/s]


In [29]:
vocab_size = 10000
onehot_rep = [one_hot(words,vocab_size) for words in corpus]
onehot_rep[:2] #observing first two elements

[[466, 7705, 369, 8395, 5265, 4342, 2209, 5858, 665, 7493],
 [6828, 963, 8019, 7935, 8620, 8123, 6647]]

In [41]:
#set a maximum length for sentences
smax_length= 20
#embedded representation
embedd = pad_sequences(onehot_rep,padding='pre',maxlen=smax_length)

In [52]:
#create a model
from tensorflow.keras.layers import Dropout
dims=40
bi_model=Sequential()
bi_model.add(Embedding(vocab_size,dims,input_length=smax_length))
bi_model.add(Dropout(0.3))
bi_model.add(Bidirectional(LSTM(100))) #lstm with 100 neurons
bi_model.add(Dropout(0.3))
bi_model.add(Dense(1,activation='sigmoid'))
bi_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(bi_model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 20, 40)            400000    
_________________________________________________________________
dropout (Dropout)            (None, 20, 40)            0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 200)               112800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 201       
Total params: 513,001
Trainable params: 513,001
Non-trainable params: 0
_________________________________________________________________
None


In [53]:
#creating x and y 
y = np.array(data['label'])
X = np.array(embedd)

In [54]:
print(y.shape)
print(X.shape)

(34324,)
(34324, 20)


In [55]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

In [56]:
bi_model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=20,batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f934016b450>

In [64]:
test_preds = bi_model.predict_classes(X_test)

In [66]:
from sklearn.metrics import accuracy_score,confusion_matrix

print('Accuracy is ',accuracy_score(y_test,test_preds))
print('')
print('Confusion matrix is ')
confusion_matrix(y_test,test_preds)

Accuracy is  0.8961934356185667

Confusion matrix is 


array([[3774,  505],
       [ 564, 5455]])

In [None]:
bi_model.save('my_model.h5')