In [31]:
# NOTE: the original import order is kept on purpose -- `from utils import *`
# can shadow or be shadowed by neighbouring imports, so reordering is not safe.
import numpy as np
import pandas as pd
import json
from utils import *
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, GRU, Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score,roc_auc_score, mean_squared_error, classification_report

In [32]:
# Load the ground-truth labels: one JSON object per line of data/truth.jsonl.
# Rows are collected in a list and the DataFrame is built once, because growing a
# frame with DataFrame.append inside a loop is quadratic and the method was
# removed in pandas 2.0.
truth_columns = ['id', 'truthMedian', 'truthClass', 'truthMean']
truth_rows = []
with open('data/truth.jsonl', encoding='utf-8') as data:
    for labelobj in data:
        if not labelobj.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        truth = json.loads(labelobj)
        truth_rows.append({column: truth[column] for column in truth_columns})
truth_df = pd.DataFrame(truth_rows, columns=truth_columns)
truth_df.head()

Unnamed: 0,id,truthMedian,truthClass,truthMean
0,858464162594172928,1.0,clickbait,1.0
1,858462320779026433,0.0,no-clickbait,0.133333
2,858460992073863168,0.333333,no-clickbait,0.4
3,858459539296980995,0.333333,no-clickbait,0.266667
4,858455355948384257,0.0,no-clickbait,0.0


In [33]:
# Load the posts: one JSON object per line of data/instances.jsonl; only the
# id and the postText field are kept.
# Rows are collected in a list and the DataFrame is built once, because growing a
# frame with DataFrame.append inside a loop is quadratic and the method was
# removed in pandas 2.0.
instance_columns = ['id', 'postText']
instance_rows = []
with open('data/instances.jsonl', encoding='utf-8') as data:
    for instanceobj in data:
        if not instanceobj.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        instance = json.loads(instanceobj)
        instance_rows.append({column: instance[column] for column in instance_columns})
instances_df = pd.DataFrame(instance_rows, columns=instance_columns)
instances_df.head()

Unnamed: 0,id,postText
0,858462320779026433,[UK’s response to modern slavery leaving victi...
1,858421020331560960,[this is good]
2,858368123753435136,"[The ""forgotten"" Trump roast: Relive his bruta..."
3,858323428260139008,[Meet the happiest #dog in the world!]
4,858283602626347008,[Tokyo's subway is shut down amid fears over a...


In [34]:
# Attach the truth labels to every instance via the shared id, then drop the
# join key, which is not needed afterwards.
dataset = instances_df.join(truth_df.set_index('id'), on='id')
dataset = dataset.drop(labels='id',axis=1)
# postText holds a list per row; keep only its first element. `.str[0]` indexes
# each element and yields NaN for an empty or missing list instead of raising,
# unlike the previous per-row `.values[i][0]` loop.
dataset['postText'] = dataset['postText'].str[0]
# Drop rows without text. The previous `dataset['postText'].dropna(inplace=True)`
# acted on a temporary Series and never removed anything from `dataset`.
# The index is reset so positional and label-based access stay aligned.
dataset = dataset.dropna(subset=['postText']).reset_index(drop=True)
dataset.head()

Unnamed: 0,postText,truthMedian,truthClass,truthMean
0,UK’s response to modern slavery leaving victim...,0.0,no-clickbait,0.133333
1,this is good,1.0,clickbait,1.0
2,"The ""forgotten"" Trump roast: Relive his brutal...",0.333333,no-clickbait,0.466667
3,Meet the happiest #dog in the world!,1.0,clickbait,0.933333
4,Tokyo's subway is shut down amid fears over an...,0.0,no-clickbait,0.0


In [35]:
# Column-wise clean-up, applied in order with helpers from utils:
#   truthClass  -> binary indicator ('no-clickbait' / 'clickbait')
#   truthMedian -> integer level
#   postText    -> cleaned text with punctuation removed
for column, converter in (
    ('truthClass', classToBinary),
    ('truthMedian', medianToInteger),
    ('postText', cleanText),
):
    dataset[column] = dataset[column].apply(converter)
dataset.head()

Unnamed: 0,postText,truthMedian,truthClass,truthMean
0,uks response to modern slavery leaving victims...,0,0,0.133333
1,this is good,3,1,1.0
2,the forgotten trump roast relive his brutal 20...,1,0,0.466667
3,meet the happiest dog in the world,3,1,0.933333
4,tokyos subway is shut down amid fears over an ...,0,0,0.0


In [36]:
# Load the pre-trained GloVe word vectors (6B corpus, 100-dimensional file).
# read_glove_vecs comes from utils; judging by the target names it returns
# word -> index, index -> word and word -> vector lookups (TODO confirm).
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('glove.6B/glove.6B.100d.txt')

In [37]:
# Sequence length used as the model input size and passed to sentences_to_indices.
# maxLengthInPostText is a utils helper -- presumably it returns the word count
# of the longest post text in the dataset (TODO confirm).
maxLen = maxLengthInPostText(dataset)
maxLen

25

In [38]:
# Split the dataset into training (90%) and testing (10%) sets.
# random_state pins the shuffle so a fresh Restart & Run All reproduces the
# same split and therefore comparable metrics.
train, test = train_test_split(dataset, test_size=0.1, random_state=42)

# Training targets are the integer truthMedian levels (one-hot encoded with C=4
# before fitting). Test targets are the binary truthClass, which is compared
# with the binarised predictions. truthMean is kept for the MSE figures.
X_train = np.array(train["postText"].tolist())
Y_train = np.array(train["truthMedian"].tolist())
Y_train_mean = np.array(train["truthMean"].tolist())

X_test = np.array(test["postText"].tolist())
Y_test = np.array(test["truthClass"].tolist())
Y_test_mean = np.array(test["truthMean"].tolist())

print(Y_train.shape)
print(Y_test.shape)

(17584,)
(1954,)


In [39]:
# Sanity check: convert the training sentences to index sequences and confirm
# the resulting shape is (number of samples, maxLen).
Indices = sentences_to_indices(X_train, word_to_index, max_len=maxLen)
print("X_Train_indices =\n", Indices.shape)

X_Train_indices =
 (17584, 25)


In [40]:
def ClickBait_BiGRU(input_shape, word_to_vec_map, word_to_index,
                    n_classes=4, gru_units=128, dropout_rate=0.5):
    """Build a three-layer bidirectional GRU classifier over word indices.

    Architecture: Input -> embedding layer -> [Bidirectional GRU -> Dropout] x 3
    -> Dense(n_classes) -> softmax. The first two recurrent layers return the
    full sequence; the third returns only its final state.

    Arguments:
    input_shape -- shape of one input sample, e.g. (maxLen,)
    word_to_vec_map -- passed to pretrained_embedding_layer (from utils)
    word_to_index -- passed to pretrained_embedding_layer (from utils)
    n_classes -- number of output units of the final Dense layer (default 4)
    gru_units -- units per direction in every GRU layer (default 128)
    dropout_rate -- rate of each Dropout layer (default 0.5)

    Returns:
    model -- an uncompiled Keras Model mapping index sequences to class
             probabilities
    """
    sentence_indices = Input(input_shape, dtype='int32')

    # Embedding layer built by the utils helper; presumably initialised from the
    # GloVe vectors in word_to_vec_map (TODO confirm in utils).
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    embeddings = embedding_layer(sentence_indices)

    # Two stacked bidirectional GRU layers that keep the whole sequence so the
    # next recurrent layer can consume it; each is followed by dropout.
    X = Bidirectional(GRU(gru_units, return_sequences=True))(embeddings)
    X = Dropout(dropout_rate)(X)
    X = Bidirectional(GRU(gru_units, return_sequences=True))(X)
    X = Dropout(dropout_rate)(X)

    # The last bidirectional GRU layer returns a single vector per sample
    # (return_sequences=False) instead of one per time step.
    X = Bidirectional(GRU(gru_units, return_sequences=False))(X)
    X = Dropout(dropout_rate)(X)

    # Project to n_classes logits and turn them into probabilities.
    X = Dense(n_classes)(X)
    X = Activation('softmax')(X)

    # Model instance that converts sentence_indices into class probabilities.
    model = Model(sentence_indices, X)
    return model

In [41]:
# Instantiate the network for inputs of maxLen word indices and print its
# layer-by-layer summary.
BiGRU_model = ClickBait_BiGRU((maxLen,), word_to_vec_map, word_to_index)
BiGRU_model.summary()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 25)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 25, 100)           40000100  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 25, 128)           63744     
_________________________________________________________________
dropout_3 (Dropout)          (None, 25, 128)           0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 25, 128)           74496     
_________________________________________________________________
dropout_4 (Dropout)          (None, 25, 128)           0         
_________________________________________________________________
bidirectional_5 (Bidirection (None, 128)              

In [42]:
# Categorical cross-entropy pairs with the softmax output and the one-hot
# training targets; optimise with Adam and track accuracy.
BiGRU_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [43]:
# Prepare the training tensors: one-hot targets over the 4 truthMedian levels
# and the index-encoded sentences of length maxLen.
Y_train_oh = convert_to_one_hot(Y_train, C=4)
X_train_indices = sentences_to_indices(X_train, word_to_index, max_len=maxLen)
X_train_indices.shape

(17584, 25)

In [44]:
# Train for 10 epochs in mini-batches of 32, reshuffling the training data.
# No validation data is passed, so only training metrics are reported.
BiGRU_model.fit(X_train_indices, Y_train_oh, epochs = 10, batch_size = 32, shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1aa07712220>

In [45]:
# Training-set evaluation. The sklearn.metrics functions used here are imported
# in the notebook's import cell rather than mid-notebook.
# Both the softmax predictions and the one-hot targets are collapsed to binary
# labels with onehot_to_binary (utils) before scoring.
y_train_pred_oh = BiGRU_model.predict(X_train_indices)
y_train_pred_binary = onehot_to_binary(y_train_pred_oh)
Y_train_binary = onehot_to_binary(Y_train_oh)

print("GRU Training Accuracy")
print('Accuracy %s' % accuracy_score(Y_train_binary, y_train_pred_binary))
print('Precision %s' % precision_score(Y_train_binary, y_train_pred_binary))
print('Recall %s' % recall_score(Y_train_binary, y_train_pred_binary))
print('F1 score: %s' % f1_score(Y_train_binary, y_train_pred_binary))
# MSE is measured between the binary predictions and the truthMean scores.
print('MSE %s' % mean_squared_error(Y_train_mean, y_train_pred_binary))

GRU Training Accuracy
Accuracy 0.8924021838034577
Precision 0.8616957306073362
Recall 0.6668217775709633
F1 score: 0.7518363064008395
MSE 0.09921469011645805


In [46]:
# Test-set evaluation: encode the held-out sentences, predict, and collapse the
# softmax output to binary labels before comparing with the binary Y_test.
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len=maxLen)
y_pred_onehot = BiGRU_model.predict(X_test_indices)
y_pred_binary = onehot_to_binary(y_pred_onehot)

print("GRU Testing Accuracy")
for report_format, metric_fn in (
    ('Accuracy %s', accuracy_score),
    ('Precision %s', precision_score),
    ('Recall %s', recall_score),
    ('F1 score: %s', f1_score),
):
    print(report_format % metric_fn(Y_test, y_pred_binary))
# MSE is measured between the binary predictions and the truthMean scores.
print('MSE %s' % mean_squared_error(Y_test_mean, y_pred_binary))

GRU Testing Accuracy
Accuracy 0.827021494370522
Precision 0.691131498470948
Recall 0.48812095032397407
F1 score: 0.5721518987341773
MSE 0.12208347548512362


In [47]:
# Reference value: MSE between truthMean and the true binary class, i.e. the
# MSE a classifier would get if its binary predictions matched Y_test exactly.
print('Minimum MSE %s' % mean_squared_error(Y_test_mean, Y_test))

Minimum MSE 0.07841237347732423


In [48]:
# Per-class precision / recall / F1 and support on the test set.
print("GRU Testing report")
print(classification_report(Y_test, y_pred_binary))

GRU Testing report
              precision    recall  f1-score   support

           0       0.85      0.93      0.89      1491
           1       0.69      0.49      0.57       463

    accuracy                           0.83      1954
   macro avg       0.77      0.71      0.73      1954
weighted avg       0.82      0.83      0.82      1954



In [49]:
# Save the trained model to BiGRU_model.h5 in the current working directory.
BiGRU_model.save("BiGRU_model.h5")

In [50]:
# Error analysis: among the first (up to) 10 test examples, print the ones the
# model got wrong together with the raw softmax output.
# The count is capped at the test-set size so a small test set cannot raise
# IndexError.
n_inspect = min(10, len(Y_test))
for i in range(n_inspect):
    # Compare the labels directly rather than testing their difference against 0.
    if Y_test[i] != y_pred_binary[i]:
        print(X_test[i])
        print("Actual Label",Y_test[i])
        print("Prediction Label",y_pred_binary[i])
        print("Prediction",y_pred_onehot[i])
        print("-------------")

dont miss out on the bollywood drama read on d
Actual Label 1
Prediction Lable 0
Prediction [0.5113311  0.41993183 0.06403711 0.00469991]
-------------


In [51]:
# Utilize our model and test the real-world headlines
def test(headline):
    """Classify one raw headline with the trained BiGRU model.

    The headline is cleaned with cleanText, turned into an index sequence with
    sentences_to_indices (max_len = maxLen), scored by BiGRU_model and collapsed
    to a binary label with onehot_to_binary.

    NOTE: defining this function rebinds the module-level name `test`, which
    the train/test split cell also assigns to the held-out DataFrame.

    Returns:
    True when the binary label equals 1 (presumably "clickbait"), else False.
    """
    cleaned = cleanText(headline)
    batch = np.array([cleaned])
    batch_indices = sentences_to_indices(batch, word_to_index, max_len = maxLen)
    class_probs = BiGRU_model.predict(batch_indices)
    label = onehot_to_binary(class_probs)
    return bool(label == [1])

In [52]:
# Try the classifier on a hand-written headline; the cell output is the
# boolean returned by test().
headline = "Here are 10 things you may not know"
test(headline)

True