In [21]:
import pandas as pd
import re
from nltk.corpus import stopwords 
from keras.models import Sequential
from keras.layers import LSTM,Embedding,Dropout,Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils import np_utils
from sklearn.metrics import classification_report

In [23]:
fake_news = pd.read_csv("train.csv")
fake_news.head(5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [24]:
fake_news.shape

(20800, 5)

In [25]:
fake_news.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [26]:
fake_news[['title','author']] = fake_news[['title','author']].fillna(value="No Data")
fake_news = fake_news.dropna()

In [27]:
fake_news.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [28]:
fake_news.shape

(20761, 5)

In [29]:
predictor = "text"
target = "label"

In [30]:
length = []
[length.append(len(str(text))) for text in fake_news[predictor]]
fake_news['length'] = length
fake_news.head()

Unnamed: 0,id,title,author,text,label,length
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,4930
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,4160
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,7692
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,3237
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,938


In [31]:
min(fake_news['length']), max(fake_news['length']), round(sum(fake_news['length'])/len(fake_news['length']))

(1, 142961, 4553)

In [32]:
fake_news[predictor][fake_news['length'] < 50]

82                                                   
169                                                  
173                                   Guest   Guest  
196            They got the heater turned up on high.
295                                                  
                             ...                     
20350                         I hope nobody got hurt!
20418                                 Guest   Guest  
20431    \nOctober 28, 2016 The Mothers by stclair by
20513                                                
20636                              Trump all the way!
Name: text, Length: 207, dtype: object

In [33]:
fake_news = fake_news[fake_news['length'] > 50]

In [34]:
fake_news.shape

(20551, 6)

In [35]:
min(fake_news['length']), max(fake_news['length']), round(sum(fake_news['length'])/len(fake_news['length']))

(51, 142961, 4599)

# LSTM Model

In [36]:
MAX_FEATURES = 4550

In [37]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):

    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

In [38]:
fake_news[predictor] = fake_news[predictor].apply(lambda x : clean_text(x))

In [39]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts = fake_news[predictor])
X = tokenizer.texts_to_sequences(texts = fake_news[predictor])

In [40]:
len(X)

20551

In [41]:
X = pad_sequences(sequences = X, maxlen = MAX_FEATURES, padding = 'pre')
X.shape

(20551, 4550)

In [42]:
y = fake_news[target].values
print(y.shape)
Y_modified = np_utils.to_categorical(y)
Y_modified.shape

(20551,)


(20551, 2)

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(16440, 4550)
(4111, 4550)
(16440,)
(4111,)


In [23]:
X_train

array([[    0,     0,     0, ...,  1833,  3718,  3477],
       [    0,     0,     0, ...,  1591,   775, 17679],
       [    0,     0,     0, ...,   633,    20,   132],
       ...,
       [    0,     0,     0, ..., 10804,   297,  1181],
       [    0,     0,     0, ...,     1, 22035, 13528],
       [    0,     0,     0, ...,   164,   948,   406]])

In [36]:
model = Sequential()
model.add(Embedding(input_dim =len(tokenizer.word_counts) +1, output_dim = 100))
model.add(LSTM(100, activation='tanh',return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100,activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(len(set(y)), activation='sigmoid'))
#opt_adam = optimizers.Adadelta()  #adam(learning_rate=0.01)

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [37]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 100)         19606900  
_________________________________________________________________
lstm_9 (LSTM)                (None, None, 100)         80400     
_________________________________________________________________
dropout_9 (Dropout)          (None, None, 100)         0         
_________________________________________________________________
lstm_10 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dropout_10 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 202       
Total params: 19,767,902
Trainable params: 19,767,902
Non-trainable params: 0
__________________________________________

In [38]:
model.fit(X_train, y_train, epochs = 1,batch_size=50,validation_data =(X_test,y_test))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 16440 samples, validate on 4111 samples
Epoch 1/1


<keras.callbacks.callbacks.History at 0x188d7998f28>

In [10]:
import pickle

model = pickle.load(open("fake_news.sav","rb"))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [44]:
X_prediction = model.predict_classes(X_test)

In [46]:
print(classification_report(y_test,X_prediction ,digits=3))

              precision    recall  f1-score   support

           0      0.983     0.741     0.845      2082
           1      0.788     0.987     0.876      2029

    accuracy                          0.863      4111
   macro avg      0.886     0.864     0.861      4111
weighted avg      0.887     0.863     0.861      4111



# Generating test predictions

In [11]:
test_data = pd.read_csv("test.csv")

In [12]:
test_data.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [13]:
test_data[['title','author']] = test_data[['title','author']].fillna(value="No Data")
test_data = test_data.dropna()
test_data[predictor] = test_data[predictor].apply(lambda x : clean_text(x))

In [14]:
t_tokenizer = Tokenizer()
t_tokenizer.fit_on_texts(texts = test_data[predictor])
X_test = t_tokenizer.texts_to_sequences(texts = test_data[predictor])

In [15]:
X_test = pad_sequences(sequences = X_test, maxlen = MAX_FEATURES, padding = 'pre')
X_test.shape

(5193, 4550)

In [16]:
prediction = model.predict_classes(X_test)

In [17]:
submission = pd.DataFrame({'id':test_data["id"], 'label':prediction})
submission.shape

(5193, 2)

In [18]:
submission.to_csv('submit.csv', index = False)

In [19]:
submission

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,1
3,20803,1
4,20804,1
...,...,...
5195,25995,1
5196,25996,0
5197,25997,1
5198,25998,1
