In [68]:
import numpy as np
import pandas as pd
import json
from utils import *
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from sklearn import preprocessing
%matplotlib inline

In [69]:
# Load the ground-truth labels: one JSON object per line of data/truth.jsonl.
# Rows are collected in a list and the DataFrame is built once, because
# DataFrame.append copies the whole frame on every call (quadratic) and was
# removed in pandas 2.0.
truth_columns = ['id', 'truthMedian', 'truthClass', 'truthMean']
truth_rows = []
with open('data/truth.jsonl', encoding='utf-8') as data:
    for labelobj in data:
        if not labelobj.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at EOF)
        truth = json.loads(labelobj)
        truth_rows.append({column: truth[column] for column in truth_columns})
truth_df = pd.DataFrame(truth_rows, columns=truth_columns)
truth_df.head()

Unnamed: 0,id,truthMedian,truthClass,truthMean
0,858464162594172928,1.0,clickbait,1.0
1,858462320779026433,0.0,no-clickbait,0.133333
2,858460992073863168,0.333333,no-clickbait,0.4
3,858459539296980995,0.333333,no-clickbait,0.266667
4,858455355948384257,0.0,no-clickbait,0.0


In [70]:
# Load the posts: one JSON object per line of data/instances.jsonl.
# Only the id and the postText field are kept. Rows are accumulated in a list
# and the DataFrame is built once, because DataFrame.append is quadratic in a
# loop and was removed in pandas 2.0.
instance_rows = []
with open('data/instances.jsonl', encoding='utf-8') as data:
    for instanceobj in data:
        if not instanceobj.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at EOF)
        instance = json.loads(instanceobj)
        instance_rows.append({'id': instance['id'], 'postText': instance['postText']})
instances_df = pd.DataFrame(instance_rows, columns=['id', 'postText'])
instances_df.head()

Unnamed: 0,id,postText
0,858462320779026433,[UK’s response to modern slavery leaving victi...
1,858421020331560960,[this is good]
2,858368123753435136,"[The ""forgotten"" Trump roast: Relive his bruta..."
3,858323428260139008,[Meet the happiest #dog in the world!]
4,858283602626347008,[Tokyo's subway is shut down amid fears over a...


In [103]:
# Attach the truth labels to every post via its id; the id itself is not a
# feature, so it is dropped afterwards.
dataset = instances_df.join(truth_df.set_index('id'), on='id')
dataset = dataset.drop(labels='id', axis=1)
# postText holds a list per post (see the instances preview); keep its first
# entry. `.str[0]` yields NaN for an empty list instead of raising IndexError.
dataset['postText'] = dataset['postText'].str[0]
# Drop rows without text from the frame itself. Calling dropna(inplace=True)
# on the selected column never removed any rows from `dataset`.
# The index is reset so that later positional loops (range(len(dataset)))
# keep lining up with the index labels.
dataset = dataset.dropna(subset=['postText']).reset_index(drop=True)
dataset.head()

Unnamed: 0,postText,truthMedian,truthClass,truthMean
0,UK’s response to modern slavery leaving victim...,0.0,no-clickbait,0.133333
1,this is good,1.0,clickbait,1.0
2,"The ""forgotten"" Trump roast: Relive his brutal...",0.333333,no-clickbait,0.466667
3,Meet the happiest #dog in the world!,1.0,clickbait,0.933333
4,Tokyo's subway is shut down amid fears over an...,0.0,no-clickbait,0.0


In [104]:
def toBinary(truthClass):
    """Encode a class label as an integer.

    Returns 0 when the label is exactly 'no-clickbait' and 1 for every
    other value.
    """
    return 0 if truthClass == 'no-clickbait' else 1
# Replace the string labels with 0/1 in place.
# NOTE: not idempotent - toBinary maps anything other than 'no-clickbait' to 1,
# so re-running this cell on the already-encoded column turns every row into 1.
dataset['truthClass'] = dataset['truthClass'].apply(toBinary)

def toInteger(truthMedian):
    """Scale a median score by 3 and round it to the nearest integer.

    For medians in [0, 1] this gives a class index between 0 and 3.
    """
    return round(3 * truthMedian)
# Convert the median scores to integer class indices in place.
# NOTE: not idempotent - re-running this cell multiplies the already-scaled
# values by 3 again.
dataset['truthMedian'] = dataset['truthMedian'].apply(toInteger)

dataset.head()

Unnamed: 0,postText,truthMedian,truthClass,truthMean
0,UK’s response to modern slavery leaving victim...,0,0,0.133333
1,this is good,3,1,1.0
2,"The ""forgotten"" Trump roast: Relive his brutal...",1,0,0.466667
3,Meet the happiest #dog in the world!,3,1,0.933333
4,Tokyo's subway is shut down amid fears over an...,0,0,0.0


In [105]:
import string
import re
def cleanText(text):
    """Lower-case `text` and strip punctuation from it.

    Two passes are applied: the ASCII punctuation in string.punctuation
    (which includes the underscore) is deleted first, then every remaining
    character that is neither a word character nor whitespace (e.g.
    typographic quotes) is removed. Whitespace is left untouched.
    """
    ascii_punctuation_table = str.maketrans('', '', string.punctuation)
    lowered = text.lower()
    without_ascii_punctuation = lowered.translate(ascii_punctuation_table)
    return re.sub(r'[^\w\s]', '', without_ascii_punctuation)
# Normalise every title in place (lower-case, punctuation removed); the
# original casing and punctuation are not kept in any other column.
dataset['postText'] = dataset['postText'].apply(cleanText)
dataset.head()

Unnamed: 0,postText,truthMedian,truthClass,truthMean
0,uks response to modern slavery leaving victims...,0,0,0.133333
1,this is good,3,1,1.0
2,the forgotten trump roast relive his brutal 20...,1,0,0.466667
3,meet the happiest dog in the world,3,1,0.933333
4,tokyos subway is shut down amid fears over an ...,0,0,0.0


In [106]:
# def count_punc(postText):
#     return len(postText)
# dataset['numOfPunctuation'] =  dataset['postText'].apply(count_punc) - dataset['cleanPostText'].apply(count_punc)
# dataset.drop(dataset[dataset['numOfPunctuation']>15].index , inplace = True)
# dataset = dataset.reset_index()
# numOfPunctuation = dataset[['numOfPunctuation']].values
# min_max_scaler = preprocessing.MinMaxScaler()
# dataset['numOfPunctuationNorm'] = min_max_scaler.fit_transform(numOfPunctuation)
# dataset.head()

In [107]:
# dataset = dataset.drop(['postText'],axis=1)
# dataset = dataset.rename(columns = {'cleanPostText': 'postText'}, inplace = False)
# dataset.head()

In [108]:
# non_clickbait_len = len(dataset[(dataset['truthClass']==1)])
# non_clickbait_without_cb_words_len = len(dataset[(dataset['truthClass']==1) & (dataset['numOfPunctuation']>0)])
# non_clickbait_without_cb_words_len/non_clickbait_len

In [109]:
# from nltk import download
# download('punkt')
# from nltk.stem import PorterStemmer
# from nltk.tokenize import word_tokenize
# def stemming(postText):
#     ps = PorterStemmer()
#     sentence = word_tokenize(postText)
#     newsentence = []
#     for word in sentence:
#         newsentence.append(ps.stem(word))
#     return ' '.join(newsentence)
# dataset['stemmingPostText'] = dataset['postText'].apply(stemming)
# #dataset['postText'] = dataset['postText'].apply(stemming)
# dataset.head()

In [110]:
# from collections import Counter
# dataset_cb = dataset[dataset['truthClass'] == 1]
# dataset_ncb = dataset[dataset['truthClass'] == 0]
# cb_words_tuple = Counter(" ".join(dataset_cb["stemmingPostText"]).split()).most_common(300)
# cb_words = [words for (words, count) in cb_words_tuple] 
# non_cb_words_tuple = Counter(" ".join(dataset_ncb["stemmingPostText"]).split()).most_common(350)
# non_cb_words = [words for (words, count) in non_cb_words_tuple] 
# true_cb_words = []
# for i in range(len(cb_words)):
#     word = cb_words[i]
#     if word not in non_cb_words[:50+i] and not word.isnumeric():
#         true_cb_words.append(word)
# #print(len(true_cb_words))
# print(true_cb_words[:20])

In [111]:
# countlist = []
# for index, row in dataset.iterrows(): 
#     words = row["postText"].split()
#     count = 0
#     for word in words:
#         if word in true_cb_words:
#             count += 1 
#     countlist.append(count)
# dataset['clickbaitWords'] = countlist
# numOfCbWords = dataset[['clickbaitWords']].values
# dataset['clickbaitWordsNorm'] = min_max_scaler.fit_transform(numOfCbWords)
# dataset.head()

In [112]:
# The percentage of [#clickbait titles with clickbait words] over [#clickbait title]
# clickbait_len = len(dataset[(dataset['truthClass']==1)])
# clickbait_with_cb_words_len = len(dataset[(dataset['truthClass']==1) & (dataset['clickbaitWords']>0)])
# clickbait_with_cb_words_len/clickbait_len

In [113]:
# The percentage of [#non-clickbait titles without clickbait words] over [#non-clickbait titles]
# non_clickbait_len = len(dataset[(dataset['truthClass']==0)])
# non_clickbait_without_cb_words_len = len(dataset[(dataset['truthClass']==0) & (dataset['clickbaitWords']==0)])
# non_clickbait_without_cb_words_len/non_clickbait_len

In [114]:
# numberCountlist = []
# for index, row in dataset.iterrows(): 
#     words = row["postText"].split()
#     count = 0
#     for word in words:
#         if word.isnumeric():
#             count += 1 
#     numberCountlist.append(count)
# dataset['numOfNumerics'] = numberCountlist
# numOfNumerics = dataset[['numOfNumerics']].values
# dataset['numOfNumericsNorm'] = min_max_scaler.fit_transform(numOfNumerics)
# dataset.head()

In [115]:
# The percentage of [#clickbait titles with numOfnumerics words] over [#clickbait title]
# clickbait_len = len(dataset[(dataset['truthClass']==1)])
# clickbait_with_numerics_len = len(dataset[(dataset['truthClass']==1) & (dataset['numOfNumerics']>0)])
# clickbait_with_numerics_len/clickbait_len

In [116]:
# The percentage of [#non-clickbait titles without numOfnumerics words] over [#non-clickbait titles]
# non_clickbait_len = len(dataset[(dataset['truthClass']==0)])
# non_clickbait_with_numerics_len = len(dataset[(dataset['truthClass']==0) & (dataset['numOfNumerics']>0)])
# non_clickbait_with_numerics_len/non_clickbait_len

In [117]:
# from subject_verb_object_extract import findSVOs, nlp
# def extract(postText):
#     tokens1 = nlp(postText)
#     svos1 = findSVOs(tokens1)
#     return svos1

# dataset['SVO'] = dataset['postText'].apply(extract)
# dataset.head()
# president trump slams reporters use of anonymous sources despite using them himself

In [86]:
# Load the pre-trained 100-dimensional GloVe vectors from disk.
# read_glove_vecs is not defined in this notebook - presumably it comes from
# the `from utils import *` star import (TODO confirm). It returns three
# objects, bound here as a word->index map, an index->word map and a
# word->vector map.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('glove.6B/glove.6B.100d.txt')

In [118]:
# Find the largest number of whitespace-separated words in any title
# (maxLen) and remember that title (maxstr).
# The column values are iterated directly instead of being looked up by label
# with dataset["postText"][i], which raises KeyError as soon as the index is
# not exactly 0..n-1 (e.g. after rows were dropped).
maxLen = 0
maxstr = ""  # defined up front so it exists even for an empty dataset
for sentence in dataset["postText"]:
    word_count = len(sentence.split())
    if word_count > maxLen:
        maxLen = word_count
        maxstr = sentence
maxLen

25

In [119]:
# Split the dataset into training (80%) and testing (20%) sets.
# A fixed random_state makes the split - and therefore every metric reported
# below - reproducible across kernel restarts.
SPLIT_SEED = 42
train, test = train_test_split(dataset, test_size=0.2, random_state=SPLIT_SEED)

# Training targets are the 4-level truthMedian classes; truthMean is kept
# alongside for the MSE comparison.
X_train = np.array(train["postText"].tolist())
Y_train = np.array(train["truthMedian"].tolist())
Y_train_mean = np.array(train["truthMean"].tolist())

# Optional experiment: evaluate on a class-balanced test set instead.
# positive_test = test[test["truthClass"] == 1].sample(n=900)
# negative_test = test[test["truthClass"] == 0].sample(n=900)
# test = pd.concat([negative_test, positive_test]).sample(frac=1)

# Test targets are the binary truthClass labels (not truthMedian): predictions
# are collapsed to binary before they are scored.
X_test = np.array(test["postText"].tolist())
Y_test = np.array(test["truthClass"].tolist())
Y_test_mean = np.array(test["truthMean"].tolist())
print(Y_train.shape)
print(Y_test.shape)

(15630,)
(3908,)


In [120]:
def sentences_to_indices(X, word_to_index, max_len):
    """Convert sentences into a zero-padded matrix of vocabulary indices.

    Arguments:
    X -- sequence (e.g. 1-D numpy array or list) of sentence strings
    word_to_index -- dict mapping a lower-case word to its integer index
    max_len -- number of columns of the output; longer sentences are truncated

    Returns:
    X_indices -- float numpy array of shape (len(X), max_len). Row i holds the
                 indices of the in-vocabulary words of sentence i, in order and
                 packed to the left; unused positions stay 0.

    Words missing from word_to_index are skipped and do not consume a column.
    """
    m = len(X)  # number of sentences
    X_indices = np.zeros((m, max_len))
    for i in range(m):
        j = 0  # next free column in row i
        for w in X[i].lower().split():
            if j >= max_len:
                # Row is full: truncate instead of writing past the last
                # column, which previously raised IndexError for sentences
                # with more than max_len known words.
                break
            if w in word_to_index:
                X_indices[i, j] = word_to_index[w]
                j += 1
    return X_indices

In [121]:
# Sanity check: index the training sentences and show the resulting shape,
# (number of training sentences, maxLen). Note that the printed label is
# followed by the array's shape, not by the indices themselves.
Indices = sentences_to_indices(X_train,word_to_index, maxLen)
print("X_Train_indices =\n", Indices.shape)

X_Train_indices =
 (15630, 25)


In [122]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """Build a frozen Keras Embedding layer initialised with pre-trained vectors.

    Arguments:
    word_to_vec_map -- mapping from a word to its embedding vector (1-D array)
    word_to_index -- dict mapping each vocabulary word to its integer index

    Returns:
    embedding_layer -- non-trainable Embedding layer whose row `idx` is the
                       vector of the word with index `idx`

    Raises:
    ValueError -- if word_to_index is empty
    """
    if not word_to_index:
        raise ValueError("word_to_index is empty; cannot size the embedding matrix")

    # One extra row beyond the vocabulary size; rows never assigned below stay
    # all-zero. Presumably word indices start at 1 so that row 0 acts as the
    # padding row - TODO confirm against read_glove_vecs.
    vocab_len = len(word_to_index) + 1

    # Take the embedding dimensionality from an arbitrary vocabulary word
    # rather than relying on one specific word being present.
    any_word = next(iter(word_to_index))
    emb_dim = word_to_vec_map[any_word].shape[0]

    # Row `idx` of the matrix is the vector of the word whose index is `idx`.
    emb_matrix = np.zeros((vocab_len, emb_dim))
    for word, idx in word_to_index.items():
        emb_matrix[idx, :] = word_to_vec_map[word]

    # Non-trainable layer: the pre-trained vectors are not updated in training.
    embedding_layer = Embedding(vocab_len, emb_dim, trainable = False)
    # The layer must be built before its weights can be set.
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])
    return embedding_layer

In [123]:
def ClickBait_LSTM(input_shape, word_to_vec_map, word_to_index,
                   num_classes=4, lstm_units=128, dropout_rate=0.5):
    """Build the stacked-LSTM classifier over pre-trained word embeddings.

    Architecture: frozen embedding -> 3 x (LSTM + Dropout) -> Dense -> softmax.
    The first two LSTM layers return full sequences; the third returns only
    its final hidden state.

    Arguments:
    input_shape -- shape of one input example, e.g. (max_len,)
    word_to_vec_map -- mapping from a word to its embedding vector
    word_to_index -- dict mapping each vocabulary word to its integer index
    num_classes -- number of output classes (default 4)
    lstm_units -- hidden-state size of every LSTM layer (default 128)
    dropout_rate -- dropout rate applied after every LSTM layer (default 0.5)

    Returns:
    model -- uncompiled Keras Model mapping index sequences to class
             probabilities of shape (batch, num_classes)
    """
    sentence_indices = Input(input_shape, dtype='int32')
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    # Look up the embedding vector of every word index.
    X = embedding_layer(sentence_indices)

    # Two LSTM layers that pass the whole sequence on to the next layer.
    for _ in range(2):
        X = LSTM(lstm_units, return_sequences=True)(X)
        X = Dropout(dropout_rate)(X)

    # Last LSTM layer: a single hidden state per example, not a sequence.
    X = LSTM(lstm_units, return_sequences=False)(X)
    X = Dropout(dropout_rate)(X)

    # One logit per class, normalised to probabilities by the softmax.
    X = Dense(num_classes)(X)
    X = Activation('softmax')(X)

    model = Model(sentence_indices, X)
    return model

In [124]:
# Instantiate the network for inputs of maxLen word indices per title and
# print its layer-by-layer summary.
model = ClickBait_LSTM((maxLen,), word_to_vec_map, word_to_index)
model.summary()

Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 25)]              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 25, 100)           40000100  
_________________________________________________________________
lstm_6 (LSTM)                (None, 25, 128)           117248    
_________________________________________________________________
dropout_6 (Dropout)          (None, 25, 128)           0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 25, 128)           131584    
_________________________________________________________________
dropout_7 (Dropout)          (None, 25, 128)           0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 128)              

In [125]:
# categorical_crossentropy is paired with the one-hot encoded training
# labels (Y_train_oh) that are passed to model.fit.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [126]:
# Model inputs: padded word-index matrix of the training titles.
X_train_indices = sentences_to_indices(X_train, word_to_index, maxLen)
# Model targets: truthMedian classes encoded with C = 4 categories.
# convert_to_one_hot is not defined in this notebook - presumably provided by
# the `from utils import *` star import (TODO confirm).
Y_train_oh = convert_to_one_hot(Y_train, C = 4)
X_train_indices.shape

(15630, 25)

In [134]:
# Train for 20 epochs in shuffled mini-batches of 32. No validation data is
# passed here, so generalisation is only measured later on the held-out test set.
model.fit(X_train_indices, Y_train_oh, epochs = 20, batch_size = 32, shuffle=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1d24e2d5280>

In [135]:
def onehot_to_binary(data):
    """Collapse 4-class score rows into binary labels.

    Each row holds four scores for classes 0..3. A row is labelled 1 when the
    weighted score of the upper classes (2/3 * class 3 + 1/3 * class 2) is
    strictly greater than that of the lower classes
    (2/3 * class 0 + 1/3 * class 1); otherwise it is labelled 0.

    Returns a list of ints with one entry per row of `data`.
    """
    # Unweighted alternative that was tried: row[3] + row[2] > row[0] + row[1]
    return [
        1 if 2/3*row[3] + 1/3*row[2] > 2/3*row[0] + 1/3*row[1] else 0
        for row in data
    ]

In [136]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score,roc_auc_score, mean_squared_error 
# Predict class probabilities on the training set and collapse both the
# predictions and the one-hot training labels to binary with the same rule,
# so the reference labels here are derived from truthMedian.
y_train_pred_oh = model.predict(X_train_indices)
y_train_pred_binary = onehot_to_binary(y_train_pred_oh)
Y_train_binary = onehot_to_binary(Y_train_oh)

print("Training Error")
print('Accuracy %s' % accuracy_score(Y_train_binary, y_train_pred_binary))
print('Precision %s' % precision_score(Y_train_binary, y_train_pred_binary))
print('Recall %s' % recall_score(Y_train_binary, y_train_pred_binary))
print('F1 score: %s' % f1_score(Y_train_binary, y_train_pred_binary))
# MSE compares the continuous truthMean values against the 0/1 predictions.
print('MSE %s' % mean_squared_error(Y_train_mean, y_train_pred_binary))

Training Error
Accuracy 0.9861804222648752
Precision 0.9683328971473436
Recall 0.9749670619235836
F1 score: 0.9716386554621849
MSE 0.08040577237322648


In [137]:
# Index the test titles with the same vocabulary and maxLen as in training,
# predict class probabilities and collapse them to binary labels.
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len = maxLen)
y_pred_onehot = model.predict(X_test_indices)
y_pred_binary = onehot_to_binary(y_pred_onehot)

# Unlike the training metrics, the reference here is Y_test, i.e. the binary
# truthClass labels taken directly from the dataset.
print("Testing Error")
print('Accuracy %s' % accuracy_score(Y_test, y_pred_binary))
print('Precision %s' % precision_score(Y_test, y_pred_binary))
print('Recall %s' % recall_score(Y_test, y_pred_binary))
print('F1 score: %s' % f1_score(Y_test, y_pred_binary))
# MSE compares the continuous truthMean values against the 0/1 predictions.
print('MSE %s' % mean_squared_error(Y_test_mean, y_pred_binary))


Testing Error
Accuracy 0.8236949846468782
Precision 0.661610268378063
Recall 0.5869565217391305
F1 score: 0.6220515633571037
MSE 0.13014329579868986


In [131]:
# MSE between truthMean and the true binary labels: the value the test MSE
# above would take if every binary prediction matched Y_test exactly.
print('Minimum MSE %s' % mean_squared_error(Y_test_mean, Y_test))

Minimum MSE 0.07533265097030707


In [132]:
# Error Analysis: print the misclassified titles among the first 1000 test
# examples, with the true label, the predicted label and the class scores.
# The range is capped at the test-set size so that a smaller test set does not
# raise IndexError.
for i in range(min(1000, len(Y_test))):
    if Y_test[i] != y_pred_binary[i]:
        print(X_test[i])
        print("Actual Label",Y_test[i])
        print("Prediction Lable",y_pred_binary[i])
        print("Prediction",y_pred_onehot[i])
        print("-------------")

diagnostic medical sonographer is the least stressful job of 2017 see more
Actual Label 0
Prediction Lable 1
Prediction [0.03048823 0.40089285 0.5306556  0.03796333]
-------------
the difference between donald trump and justin trudeau in two pictures
Actual Label 1
Prediction Lable 0
Prediction [0.27626538 0.62381214 0.09103101 0.00889147]
-------------
the perfect way to cook rice so that its perfectly fluffy and never sticks to the pan
Actual Label 1
Prediction Lable 0
Prediction [0.37360004 0.5559347  0.06305259 0.00741267]
-------------
for instance lowpotency antibiotic drugs were found to be diluted with paracetamol
thisisit
Actual Label 1
Prediction Lable 0
Prediction [7.3322910e-01 2.6232275e-01 4.1640382e-03 2.8406092e-04]
-------------
135 films got a certificate from censor board
Actual Label 1
Prediction Lable 0
Prediction [0.09521299 0.7942034  0.10587152 0.0047121 ]
-------------
the young women leading change in asia  forbesu30asia
Actual Label 0
Prediction Lable 1
Predi

In [133]:
def test(test_string):
    """Classify a single raw title with the trained model.

    The title is cleaned with cleanText, converted to word indices padded to
    maxLen, scored by the global `model`, and collapsed to a binary label with
    onehot_to_binary.

    Returns True when the binary label is 1, otherwise False.
    """
    cleaned = cleanText(test_string)
    batch = np.array([cleaned])  # batch containing only this title
    batch_indices = sentences_to_indices(batch, word_to_index, max_len = maxLen)
    class_scores = model.predict(batch_indices)
    return onehot_to_binary(class_scores) == [1]
# Try the classifier on one hand-written headline; the cell output is the
# boolean returned by test().
test_string = "US election 2020: What is the presidential transition"
test(test_string)

False