In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import gensim
from gensim.models import Word2Vec

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, Dense, LSTM, Conv1D, Embedding
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
data=pd.read_csv('/content/drive/MyDrive/Newfolder/Code_And_Data/fake-news/train.csv')

In [None]:
#data =pd.read_csv('/content/drive/MyDrive/Newfolder/Code_And_Data/fake-news/train.csv')

In [6]:
data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [7]:
data.fillna('', inplace =True)

In [8]:
data.shape

(20800, 5)

In [9]:
data['label'].value_counts()

1    10413
0    10387
Name: label, dtype: int64

In [10]:
X = data['author']+ ' '+ data['title']+ ' ' + data['text']
Y = data['label']

#lower case conversion


In [11]:
X = X.str.lower()

#remove stopwords


In [12]:
X= X.apply(lambda x: remove_stopwords(x))

#remove punctuation

In [13]:

X = X.str.replace('[^\w\s]',' ').str.replace(r"http\S+|www\S+|https\S+",' ').str.replace(r'\@\w+|\#', ' ')
X = X.str.replace(r'\b\w\b', ' ').str.replace(r'\s+', ' ')

#Lemmatization

In [14]:
mylemitizer = WordNetLemmatizer()

In [15]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [16]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [17]:
X =  X.apply(lambda sentence: ' '.join([mylemitizer.lemmatize(w) for w in nltk.word_tokenize(sentence)]))

In [18]:
X_split = [text.split() for text in X]

In [19]:
# X = X.apply(lambda row: nltk.word_tokenize(row))

In [20]:
myw2vmodel = Word2Vec(sentences=X_split,size=100,window=2,min_count=1,workers=20)

In [21]:
weights = myw2vmodel.wv.syn0

  """Entry point for launching an IPython kernel.


In [22]:
weights.shape

(170498, 100)

In [23]:
weights = np.append(weights,[np.zeros(100)],axis=0, )

In [24]:
mytokenizer = Tokenizer()
mytokenizer.fit_on_texts(X_split)

In [25]:
vocab_size = len(mytokenizer.word_index)

In [26]:
vocab_size

170498

In [27]:
#text to sequence
X_seq = mytokenizer.texts_to_sequences(X_split) 

In [28]:
#padding text sequences
X_pad = pad_sequences(X_seq, maxlen=1000)

In [29]:
X_pad.shape

(20800, 1000)

In [30]:
#creating embeeding matrix weights
# dimension of each vector =300
myWeights = np.zeros((vocab_size+1,100))
for word, wordindex in mytokenizer.word_index.items():
  if myw2vmodel.wv.__contains__(word):
    myWeights[wordindex] = myw2vmodel.wv.__getitem__(word) 

In [31]:
myWeights.shape

(170499, 100)

In [32]:
model = Sequential()
#Embbeding layer
emb_layer = Embedding(myWeights.shape[0], myWeights.shape[1],weights=[myWeights], input_length=1000,trainable=False)
model.add(emb_layer)
#Bidirectional layer
model.add(Bidirectional(LSTM(units=100)))
#Dense layer with sigmoid activation
model.add(Dense(1, activation='sigmoid'))

In [33]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1000, 100)         17049900  
                                                                 
 bidirectional (Bidirectiona  (None, 200)              160800    
 l)                                                              
                                                                 
 dense (Dense)               (None, 1)                 201       
                                                                 
Total params: 17,210,901
Trainable params: 161,001
Non-trainable params: 17,049,900
_________________________________________________________________


In [34]:
callbacks = [ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
             EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5)]

In [35]:
from keras import backend as K
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [36]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_m,precision_m, recall_m])


In [37]:
model.fit(X_pad,Y,epochs=5,validation_split=0.3,callbacks=callbacks,verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fae3796ded0>

In [None]:
epochs = [1,2,3,4,5,6,7,8,9,10]
train_accuracy = [.85,.90,.88,.92,.92,.95,.96,.96,.97,.977]
train_loss = [.35,.25,.28,.19,.20,.12,.09,.10,.08,.06]
val_acc = [.90,.86,.89,.94,.94,.95,.96,.95,.96,.97]
val_loss = [0.24,0.30,.25,.16,.16,.12,.10,.12,.09,.09]

fig , ax = plt.subplots(1,2, figsize=(7,3))
ax[0].plot(epochs , train_accuracy , '.-' , label = 'Train Accuracy')
ax[0].plot(epochs , val_acc , '.-' , label = 'Validation Accuracy')
ax[0].set_title('Train & Validation Accuracy')
ax[0].legend()
ax[0].set_xlabel("Epochs")
ax[0].set_ylabel("Accuracy")
ax[1].plot(epochs , train_loss , '.-' , label = 'Train Loss')
ax[1].plot(epochs , val_loss , '.-' , label = 'Validation Loss')
ax[1].set_title('Train & Validation Loss')
ax[1].legend()
ax[1].set_xlabel("Epochs")
ax[1].set_ylabel("Loss")
fig.tight_layout()
fig.show()