In [22]:
import numpy as np
import pandas as pd
import json
from utils import *
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, GRU, Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from sklearn import preprocessing

In [42]:
# Load the clickbait headlines (positive class, truthClass == 1).
# Every line of the file becomes one row; the text is stored exactly as read
# (including its trailing newline) so later cleaning sees unchanged input.
# The encoding is explicit so the read does not depend on the platform default.
with open('data/clickbait_data.txt', encoding='utf-8') as data:
    # Collect the records first and build the frame once: appending row by row
    # copies the whole frame on every iteration, and DataFrame.append no longer
    # exists in pandas >= 2.0.
    positive_cases = [{'postText': sentence, 'truthClass': 1} for sentence in data]
df = pd.DataFrame(positive_cases, columns=['postText', 'truthClass'])
df.head()

Unnamed: 0,postText,truthClass
0,Should I Get Bings\n,1
1,Which TV Female Friend Group Do You Belong In\n,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1


In [43]:
# Load the non-clickbait headlines (negative class, truthClass == 0).
# NOTE: this cell extends the `df` built by the previous cell, so re-running it
# on its own adds the negative rows a second time.
with open('data/non_clickbait_data.txt', encoding='utf-8') as data:
    negative_cases = [{'postText': sentence, 'truthClass': 0} for sentence in data]
# Concatenate once instead of appending row by row (quadratic, and
# DataFrame.append no longer exists in pandas >= 2.0).
df = pd.concat(
    [df, pd.DataFrame(negative_cases, columns=['postText', 'truthClass'])],
    ignore_index=True,
)
# Shuffle the rows so the two classes are interleaved; the fixed seed keeps the
# row order identical across kernel restarts.
df = df.sample(frac=1, random_state=42)
df.head()

Unnamed: 0,postText,truthClass
1335,21 Wicked Ways To Trick People Into Eating The...,1
28976,Brandon Jennings Sends Home a Warning From Eur...,0
17387,Computer Experts Unite to Hunt Worm\n,0
11800,Can You Survive 30 Seconds Looking At These Re...,1
24424,Former Emir of Kuwait dies at age 78\n,0


In [44]:
# Class balance check: print the (rows, columns) shape of each label's subset,
# label 0 (non-clickbait) first, then label 1 (clickbait).
for label in (0, 1):
    label_mask = df['truthClass'] == label
    print(df[label_mask].shape)

(16001, 2)
(15999, 2)


In [45]:
# Drop rows whose headline is missing, then normalise the remaining headlines.
# The drop has to happen on the frame itself: dropna(inplace=True) on the
# selected column only changes that temporary Series and leaves every row of
# `df` in place.
df = (
    df.dropna(subset=['postText'])
      # cleanText is the project's text-normalisation helper (from utils).
      .assign(postText=lambda frame: frame['postText'].apply(cleanText))
)
df.head()

Unnamed: 0,postText,truthClass
1335,21 wicked ways to trick people into eating the...,1
28976,brandon jennings sends home a warning from eur...,0
17387,computer experts unite to hunt worm\n,0
11800,can you survive 30 seconds looking at these re...,1
24424,former emir of kuwait dies at age 78\n,0


In [46]:
# Longest headline length as reported by the utils helper maxLengthInPostText.
# The value is reused below as the fixed sequence length (max_len) for the
# sentence-to-index conversion and as the model's input shape.
# NOTE(review): presumably measured in words per headline — confirm in utils.
maxLen = maxLengthInPostText(df)
maxLen

26

In [47]:
# Load the pretrained GloVe embeddings through the utils helper, which returns
# three lookups: word -> index, index -> word, and word -> vector.
# The path is relative to the notebook's working directory.
# NOTE(review): the file name suggests 50-dimensional GloVe 6B vectors — confirm.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('glove.6B/glove.6B.50d.txt')

In [48]:
# Hold out 10% of the rows for testing. The fixed random_state makes the split,
# and therefore every metric computed from it, reproducible across kernel
# restarts.
# NOTE: the name `test` is rebound later in the notebook by the
# `def test(headline)` helper, so only X_test / Y_test should be used downstream.
train, test = train_test_split(df, test_size=0.1, random_state=42)

# Headlines as arrays of strings, labels as arrays of ints.
X_train, Y_train = np.array(train["postText"].tolist()), np.array(train["truthClass"].tolist())
X_test, Y_test = np.array(test["postText"].tolist()), np.array(test["truthClass"].tolist())
print(X_train.shape)
print(X_test.shape)

(28800,)
(3200,)


In [49]:
# Sanity check: turn the training headlines into rows of word indices with a
# fixed length of maxLen and print the shape of the resulting array.
Indices = sentences_to_indices(X_train, word_to_index, max_len=maxLen)
indices_shape = Indices.shape
print("X_Train_indices =\n", indices_shape)

X_Train_indices =
 (28800, 26)


In [50]:
def ClickBait_BiLSTM(input_shape, word_to_vec_map, word_to_index,
                     lstm_units=128, dropout_rate=0.5, num_classes=2):
    """Build the (uncompiled) stacked bidirectional-LSTM headline classifier.

    Architecture: pretrained embedding -> three Bidirectional(LSTM) layers,
    each followed by Dropout -> Dense -> softmax. The first two recurrent
    layers return the full sequence; the last one returns a single vector.

    Arguments:
    input_shape -- shape of one input sample, e.g. ``(max_len,)``; inputs are
                   int32 word indices
    word_to_vec_map -- mapping from word to its embedding vector, passed
                       through to ``pretrained_embedding_layer``
    word_to_index -- mapping from word to its integer index, passed through to
                     ``pretrained_embedding_layer``
    lstm_units -- hidden size of each LSTM direction (default 128, so each
                  bidirectional layer outputs 2 * 128 features)
    dropout_rate -- dropout rate applied after every recurrent layer
                    (default 0.5)
    num_classes -- number of softmax outputs (default 2)

    Returns:
    model -- a Keras ``Model`` mapping sentence indices to class probabilities;
             the caller is responsible for compiling it
    """
    # Integer word indices, one row per sentence.
    sentence_indices = Input(shape=input_shape, dtype='int32')

    # Embedding layer built by the utils helper from the pretrained vectors.
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    X = embedding_layer(sentence_indices)

    # Two bidirectional LSTM layers that keep the whole sequence so the next
    # recurrent layer can consume it, each followed by dropout.
    for _ in range(2):
        X = Bidirectional(LSTM(lstm_units, return_sequences=True))(X)
        X = Dropout(dropout_rate)(X)

    # Final bidirectional LSTM layer: returns a single vector per sentence
    # rather than a sequence.
    X = Bidirectional(LSTM(lstm_units, return_sequences=False))(X)
    X = Dropout(dropout_rate)(X)

    # Project to one score per class and normalise with softmax.
    X = Dense(num_classes)(X)
    X = Activation('softmax')(X)

    # Model instance converting sentence_indices into class probabilities.
    model = Model(inputs=sentence_indices, outputs=X)
    return model

In [51]:
# Instantiate the network for index sequences of length maxLen and print its
# layer-by-layer summary.
model = ClickBait_BiLSTM(
    input_shape=(maxLen,),
    word_to_vec_map=word_to_vec_map,
    word_to_index=word_to_index,
)
model.summary()

Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 26)]              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 26, 50)            20000050  
_________________________________________________________________
bidirectional_6 (Bidirection (None, 26, 256)           183296    
_________________________________________________________________
dropout_6 (Dropout)          (None, 26, 256)           0         
_________________________________________________________________
bidirectional_7 (Bidirection (None, 26, 256)           394240    
_________________________________________________________________
dropout_7 (Dropout)          (None, 26, 256)           0         
_________________________________________________________________
bidirectional_8 (Bidirection (None, 256)              

In [52]:
# Configure training: Adam optimiser, categorical cross-entropy on the one-hot
# labels, and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [53]:
# Model inputs: training headlines as fixed-length rows of word indices.
X_train_indices = sentences_to_indices(X_train, word_to_index, max_len=maxLen)
# Model targets: training labels one-hot encoded over the two classes.
Y_train_oh = convert_to_one_hot(Y_train, C=2)
X_train_indices.shape

(28800, 26)

In [54]:
# Train for 10 epochs in mini-batches of 64, reshuffling the samples each epoch.
model.fit(
    x=X_train_indices,
    y=Y_train_oh,
    batch_size=64,
    epochs=10,
    shuffle=True,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x20b3001ac70>

In [55]:
def onehot_to_binary(data):
    """Collapse two-column class scores into a list of 0/1 labels.

    For every row, the label is 1 when the second entry is strictly greater
    than the first entry, and 0 otherwise (ties therefore map to 0).

    Arguments:
    data -- indexable sequence of rows, each with at least two entries

    Returns:
    A Python list of ints (0 or 1), one per row of ``data``.
    """
    return [
        1 if data[row][1] > data[row][0] else 0
        for row in range(len(data))
    ]

In [56]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    mean_squared_error,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Predicted class scores on the training set, collapsed to 0/1 labels.
y_train_pred_oh = model.predict(X_train_indices)
y_train_pred_binary = onehot_to_binary(y_train_pred_oh)
# Ground-truth labels recovered from their one-hot encoding.
Y_train_binary = onehot_to_binary(Y_train_oh)

# (output template, metric function) pairs, reported in this order.
train_metrics = (
    ('Accuracy %s', accuracy_score),
    ('Precision %s', precision_score),
    ('Recall %s', recall_score),
    ('F1 score: %s', f1_score),
)

print("Training Error")
for template, metric in train_metrics:
    print(template % metric(Y_train_binary, y_train_pred_binary))

Training Error
Accuracy 0.9928819444444444
Precision 0.9886473097564332
Recall 0.9972239572489416
F1 score: 0.9929171129461356


In [57]:
# Index-encode the held-out headlines, predict, and collapse the class scores
# to 0/1 labels.
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len = maxLen)
y_pred_onehot = model.predict(X_test_indices)
y_pred_binary = onehot_to_binary(y_pred_onehot)

# (output template, metric function) pairs, reported in this order.
test_metrics = (
    ('Accuracy %s', accuracy_score),
    ('Precision %s', precision_score),
    ('Recall %s', recall_score),
    ('F1 score: %s', f1_score),
)

print("Testing Error")
for template, metric in test_metrics:
    print(template % metric(Y_test, y_pred_binary))

Testing Error
Accuracy 0.97
Precision 0.959409594095941
Recall 0.9811320754716981
F1 score: 0.9701492537313432


In [58]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 and support on the held-out test set.
test_report = classification_report(Y_test, y_pred_binary)
print(test_report)

              precision    recall  f1-score   support

           0       0.98      0.96      0.97      1610
           1       0.96      0.98      0.97      1590

    accuracy                           0.97      3200
   macro avg       0.97      0.97      0.97      3200
weighted avg       0.97      0.97      0.97      3200



In [59]:
# Use the trained model to classify real-world headlines
def test(headline):
    """Classify a single raw headline with the trained model.

    The headline is normalised with ``cleanText``, wrapped in a one-element
    array, converted to word indices with ``max_len = maxLen``, and scored by
    the notebook-level ``model``.

    Arguments:
    headline -- the raw headline text

    Returns:
    True when the predicted label is 1 (the label given to the clickbait
    data), i.e. the class-1 score is strictly greater than the class-0 score;
    False otherwise.
    """
    cleaned = cleanText(headline)
    batch = np.array([cleaned])
    indices = sentences_to_indices(batch, word_to_index, max_len = maxLen)
    scores = model.predict(indices)
    # A single-headline batch yields a one-element label list.
    return onehot_to_binary(scores) == [1]

# Try the classifier on a sample headline; the cell displays the boolean result
# (True means the model labels it as class 1, clickbait).
headline = "Here are 10 things you may not know"
test(headline)

True