In [5]:
#https://github.com/ankeshanand/deep-clickbait-detection/blob/master/src/combined_model.py
import pandas as pd
import numpy as np
import h5py
import tensorflow as tf
import sys
from keras.models import Model, Sequential
from keras.layers import Dense, Activation, Flatten, Input, Dropout, MaxPooling1D, Convolution1D
from keras.layers import LSTM, Lambda, merge, Masking, SimpleRNN, GRU
from keras.layers import Embedding, TimeDistributed
from keras import backend as K
import keras.callbacks

from utils import clean_tweet, tokenize_tweet
from keras.layers import concatenate
np.random.seed(42)


In [6]:
data_clickbait=pd.read_csv('clickbait_data',sep='\n',header=None)
data_noclickbait=pd.read_csv('non_clickbait_data',sep='\n',header=None)

data_clickbait.insert(1,"class",np.ones(15999))
data_noclickbait.insert(1,"class",np.zeros(16001))

data_Final=pd.concat((data_clickbait,data_noclickbait),ignore_index=True)

In [7]:
data_Final = data_Final.sample(frac=1).reset_index(drop=True)
#data_Final=data_Final.sample(frac=1)
data_Final.head()

Unnamed: 0,0,class
0,Filipino activist arrested for disrupting Mani...,0.0
1,"International Board fixes soccer field size, h...",0.0
2,24 Rules For Women On A First Date With A Man,1.0
3,Political fallout from the sacking of Professo...,0.0
4,"Which ""Clueless"" Character Are You Based On Yo...",1.0


In [8]:

data_Final['cleaned_tweet'] = data_Final[0].apply(clean_tweet)
data_Final['tokenized_tweet'] =data_Final[0].apply(tokenize_tweet)

all_txt = ''
for tweet in data_Final['cleaned_tweet'].values:
    all_txt += tweet

chars = set(all_txt)
print(chars)
print('total chars:', len(chars))

{'v', 't', 'á', 'è', 'é', 'r', 'ó', 'ü', 'd', 'h', 'º', 'î', 'ć', '9', 'ś', 'ñ', 'à', '2', 'j', 'w', 'æ', 'č', 'm', 'n', 'í', '3', 'å', 'ū', 'ł', 'ú', '6', 'o', 'ī', 'ä', 'x', 'a', 'l', '4', 'q', 'p', 'ø', ' ', 'i', 'ö', 'k', 'f', '0', 'ß', 'š', '8', 'ę', 'ž', 'ç', 'b', 'c', 's', '1', 'g', '5', 'ã', 'ń', 'e', 'ș', 'y', 'z', '7', 'u'}
total chars: 67


In [9]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
print(char_indices)
print(indices_char)
maxlen = 250
print(len(data_Final))
split_index = int((2 * len(data_Final)) / 3)
print(split_index)
df_train , df_test = data_Final[:split_index], data_Final[split_index:]

{'v': 0, 't': 1, 'á': 2, 'è': 3, 'é': 4, 'r': 5, 'ó': 6, 'ü': 7, 'd': 8, 'h': 9, 'º': 10, 'î': 11, 'ć': 12, '9': 13, 'ś': 14, 'ñ': 15, 'à': 16, '2': 17, 'j': 18, 'w': 19, 'æ': 20, 'č': 21, 'm': 22, 'n': 23, 'í': 24, '3': 25, 'å': 26, 'ū': 27, 'ł': 28, 'ú': 29, '6': 30, 'o': 31, 'ī': 32, 'ä': 33, 'x': 34, 'a': 35, 'l': 36, '4': 37, 'q': 38, 'p': 39, 'ø': 40, ' ': 41, 'i': 42, 'ö': 43, 'k': 44, 'f': 45, '0': 46, 'ß': 47, 'š': 48, '8': 49, 'ę': 50, 'ž': 51, 'ç': 52, 'b': 53, 'c': 54, 's': 55, '1': 56, 'g': 57, '5': 58, 'ã': 59, 'ń': 60, 'e': 61, 'ș': 62, 'y': 63, 'z': 64, '7': 65, 'u': 66}
{0: 'v', 1: 't', 2: 'á', 3: 'è', 4: 'é', 5: 'r', 6: 'ó', 7: 'ü', 8: 'd', 9: 'h', 10: 'º', 11: 'î', 12: 'ć', 13: '9', 14: 'ś', 15: 'ñ', 16: 'à', 17: '2', 18: 'j', 19: 'w', 20: 'æ', 21: 'č', 22: 'm', 23: 'n', 24: 'í', 25: '3', 26: 'å', 27: 'ū', 28: 'ł', 29: 'ú', 30: '6', 31: 'o', 32: 'ī', 33: 'ä', 34: 'x', 35: 'a', 36: 'l', 37: '4', 38: 'q', 39: 'p', 40: 'ø', 41: ' ', 42: 'i', 43: 'ö', 44: 'k', 45: 'f', 4

In [10]:
df_test.head()

Unnamed: 0,0,class,cleaned_tweet,tokenized_tweet
21333,Chinese chemical plant explosion threatens wat...,0.0,chinese chemical plant explosion threatens wat...,"[chinese, chemical, plant, explosion, threaten..."
21334,A Ham Radio Weekend for Talking to the Moon,0.0,a ham radio weekend for talking to the moon,"[a, ham, radio, weekend, for, talking, to, the..."
21335,Six-year-old Egyptian boy contracts bird flu,0.0,six year old egyptian boy contracts bird flu,"[six, year, old, egyptian, boy, contracts, bir..."
21336,People Are Not Happy With Justin Bieber After ...,1.0,people are not happy with justin bieber after ...,"[people, are, not, happy, with, justin, bieber..."
21337,21 Things You Need For Your Baby If You Were I...,1.0,21 things you need for your baby if you were i...,"[21, things, you, need, for, your, baby, if, y..."


In [11]:
df_train

Unnamed: 0,0,class,cleaned_tweet,tokenized_tweet
0,Filipino activist arrested for disrupting Mani...,0.0,filipino activist arrested for disrupting mani...,"[filipino, activist, arrested, for, disrupting..."
1,"International Board fixes soccer field size, h...",0.0,international board fixes soccer field size ha...,"[international, board, fixes, soccer, field, s..."
2,24 Rules For Women On A First Date With A Man,1.0,24 rules for women on a first date with a man,"[24, rules, for, women, on, a, first, date, wi..."
3,Political fallout from the sacking of Professo...,0.0,political fallout from the sacking of professo...,"[political, fallout, from, the, sacking, of, p..."
4,"Which ""Clueless"" Character Are You Based On Yo...",1.0,which clueless character are you based on your...,"[which, clueless, character, are, you, based, ..."
...,...,...,...,...
21328,This Apple Picking Test Will Determine Your Tr...,1.0,this apple picking test will determine your tr...,"[this, apple, picking, test, will, determine, ..."
21329,Contestant seriously injured during live Germa...,0.0,contestant seriously injured during live germa...,"[contestant, seriously, injured, during, live,..."
21330,16 Honest Confessions From People Wearing Puri...,1.0,16 honest confessions from people wearing puri...,"[16, honest, confessions, from, people, wearin..."
21331,Christmas Vs. Jewish Christmas: Which Will Win,1.0,christmas vs jewish christmas which will win,"[christmas, vs, jewish, christmas, which, will..."


In [12]:
# Oversample training data with the clickbait class
#df_train_clickbait, df_train_no_clickbait = df_train[df_train['clickbait'] == 1], df_train[df_train['clickbait'] == 0]
#oversampled_df_train_clickbait = df_train_clickbait.sample(len(df_train_no_clickbait), replace=True, random_state=42)
#df_train = pd.concat([oversampled_df_train_clickbait, df_train_no_clickbait])

def binarize(x, sz=37):
    return tf.to_float(tf.one_hot(x, sz, on_value=1, off_value=0, axis=-1))

def create_feature_matrix(docs):
    #print("docs= ",docs)
    X = np.ones((len(docs), maxlen), dtype=np.int64) * -1
   # print("X= ",X)
    for i, doc in enumerate(docs):
       # print(i, doc)
        for t, char in enumerate(doc):
          #  print(t, char)
            X[i, t] = char_indices[char]
           # print("X[i,t]= ",X[i,t])
        
    return X

In [13]:
X_train= create_feature_matrix(df_train['cleaned_tweet'])
print(X_train.shape)

(21333, 250)


In [14]:
X_train, X_test = create_feature_matrix(df_train['cleaned_tweet']), create_feature_matrix(df_test['cleaned_tweet'])
y_train, y_test = np.array(df_train['class']), np.array(df_test['class'])
print(y_train[:20])
print(y_train)
print(np.unique(y_test))

[0. 0. 1. 0. 1. 1. 0. 1. 1. 0. 1. 0. 1. 0. 0. 1. 0. 1. 1. 1.]
[0. 0. 1. ... 1. 1. 0.]
[0. 1.]


In [15]:
ids = np.arange(len(X_train))
np.random.shuffle(ids)
X_train = X_train[ids]
y_train = y_train[ids]
#print(X_train)
def binarize_outshape(in_shape):
    return in_shape[0], in_shape[1], 37

filter_length = [5, 3, 3]
nb_filter = [196, 196, 300]
pool_length = 2


    


In [17]:
in_sentence = Input(shape=(maxlen,), dtype='int64')
# binarize function creates a onehot encoding of each character index
embedded = Lambda(binarize, output_shape=binarize_outshape)(in_sentence)
# embedded: encodes sentence
for i in range(len(nb_filter)):
    embedded = Convolution1D(nb_filter=nb_filter[i],
                            filter_length=filter_length[i],
                            border_mode='valid',
                            activation='relu',
                            init='glorot_normal',
                            subsample_length=1)(embedded)

    embedded = Dropout(0.1)(embedded)
    embedded = MaxPooling1D(pool_length=pool_length)(embedded)

forward_sent = GRU(64, return_sequences=False, dropout_W=0.3, dropout_U=0.3, consume_less='gpu')(embedded)
backward_sent = GRU(64, return_sequences=False, dropout_W=0.3, dropout_U=0.3, consume_less='gpu', go_backwards=True)(embedded)



  # This is added back by InteractiveShellApp.init_path()
  
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  app.launch_new_instance()


In [19]:
#from keras.engine import merge
#concatenate([x1, x2], axis=-1)
sent_encode = concatenate([forward_sent, backward_sent], axis=-1)
sent_encode = Dropout(0.3)(sent_encode)
output = Dense(1, activation='sigmoid')(sent_encode)

model = Model(input=in_sentence, output=output)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
earlystop_cb = keras.callbacks.EarlyStopping(monitor='val_loss', patience=7, verbose=1, mode='auto')




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


  import sys


In [20]:
batch_size = 64
model.fit(X_train, y_train, batch_size=batch_size, epochs=20, validation_split=0.1, callbacks=[earlystop_cb])
score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)

print('Test score:', score)
print('Test accuracy:', acc)


Train on 19199 samples, validate on 2134 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 00018: early stopping
Test score: 0.23806049367771764
Test accuracy: 0.9245336055755615


In [21]:
def predict_classes(model, X_test):
    proba = model.predict(X_test)
    print(proba)
    if proba.shape[-1] > 1:
        return proba.argmax(axis=-1)
    else:
        return (proba > 0.5).astype('int32')

y_pred = predict_classes(model, X_test)
y_scores = model.predict(X_test)
#print(y_pred)
#print(y_scores)

[[7.1048737e-05]
 [4.2374343e-02]
 [3.5193563e-04]
 ...
 [9.9982435e-01]
 [9.9999988e-01]
 [6.5911445e-04]]


In [22]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
roc = roc_auc_score(y_test, y_scores)
print('ROC score:', roc)



ROC score: 0.9766658229361119


In [23]:
metrics = classification_report(y_test, y_pred, digits=4)
print('Classification Report \n')
print (metrics)



Classification Report 

              precision    recall  f1-score   support

         0.0     0.8992    0.9573    0.9274      5367
         1.0     0.9538    0.8913    0.9215      5300

    accuracy                         0.9245     10667
   macro avg     0.9265    0.9243    0.9244     10667
weighted avg     0.9263    0.9245    0.9244     10667



In [24]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix \n')
print (cm)

Confusion Matrix 

[[5138  229]
 [ 576 4724]]
