In [82]:
# coding: utf-8
import pandas as pd
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
from keras.layers.advanced_activations import PReLU
from keras.preprocessing import sequence, text
import jsonlines
import pandas as pd
import nltk
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from unidecode import unidecode
import h5py
import tensorflow as tf
from keras import backend as K

In [83]:
# Build a TensorFlow session limited (via device_count) to one CPU device and
# zero GPU devices, using `num_cores` threads for both intra-op and inter-op
# parallelism, and hand that session to the Keras backend.
num_cores = 16
config = tf.ConfigProto(
    intra_op_parallelism_threads=num_cores,
    inter_op_parallelism_threads=num_cores,
    allow_soft_placement=True,
    device_count={'CPU': 1, 'GPU': 0},
)
session = tf.Session(config=config)
K.set_session(session)

In [84]:
def readData(instance_path, truth_path):
    """Load one instances/truth pair of JSON-lines files into DataFrames.

    Each instance record has its ``postText`` value replaced by the first
    element of that value before the frames are built.

    Args:
        instance_path: path of the JSON-lines file holding the instances.
        truth_path: path of the JSON-lines file holding the truth records.

    Returns:
        A ``(data, instance, label)`` tuple: the instances frame merged with
        the truths frame on ``id``, the instances frame, and the truths frame.
    """
    with jsonlines.open(instance_path) as instance_reader:
        instance_records = list(instance_reader)
    # Unwrap postText: keep only its first element.
    for record in instance_records:
        record['postText'] = record['postText'][0]

    with jsonlines.open(truth_path) as truth_reader:
        truth_records = list(truth_reader)

    instance = pd.DataFrame.from_dict(instance_records)
    label = pd.DataFrame.from_dict(truth_records)

    return pd.merge(instance, label, on='id'), instance, label

In [85]:
# Read the train and validation releases, then stack them row-wise so that the
# notebook can draw its own train/test/validation split from the pooled data.
trainData, trainInstance, trainLabel = readData(
    instance_path='../data/clickbait17-train-170331/instances.jsonl',
    truth_path='../data/clickbait17-train-170331/truth.jsonl',
)
validData, validInstance, validLabel = readData(
    instance_path='../data/clickbait17-validation-170630/instances.jsonl',
    truth_path='../data/clickbait17-validation-170630/truth.jsonl',
)

data_df = pd.concat([trainData, validData], axis=0)
instance_df = pd.concat([trainInstance, validInstance], axis=0)
label_df = pd.concat([trainLabel, validLabel], axis=0)

In [86]:
# Take BOTH the texts and the labels from the merged frame: row i of `data`
# and row i of `label` are then guaranteed to describe the same post, instead
# of relying on the instances and truth files sharing the same order and the
# same set of ids.
# `.values` replaces `get_values()`, which was removed in pandas 1.0.
data = data_df['postText'].values

# Map the class names to integers without mutating any DataFrame in place
# (the previous in-place assignment wrote through to the array backing
# label_df and left an object-dtype label array).
# NOTE(review): assumes truthClass only takes the two values below; any other
# value becomes NaN here -- confirm against the dataset.
label = data_df['truthClass'].map({'clickbait': 1, 'no-clickbait': 0}).values

In [90]:
# Hold out 20% of all posts as the test set, stratified on the label.
# n_splits=1, so the single split is taken directly with next().
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
train_index, test_index = next(sss.split(data, label))
print("TRAIN:", train_index, "TEST:", test_index)
X_train, X_test = data[train_index], data[test_index]
y_train, y_test = label[train_index], label[test_index]

# Carve 10% of the remaining training posts off as the validation set.
# split() yields positions relative to the arrays passed to it, so these
# indices must be applied to X_train / y_train. Indexing the full `label`
# array here (as before) paired the texts with labels of unrelated posts.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
train_index, valid_index = next(sss.split(X_train, y_train))
print("TRAIN:", train_index, "TEST:", valid_index)
X_train, X_valid = X_train[train_index], X_train[valid_index]
y_train, y_valid = y_train[train_index], y_train[valid_index]

TRAIN: [ 6875 11559 20539 ...,  1168 13390 10493] TEST: [ 9722  9291  9327 ..., 13364   728 15967]
TRAIN: [  635  9588 10713 ...,  7055 16559  9482] TEST: [10472  1637  8258 ..., 16429  9610  3003]


In [91]:
# Sanity check: sizes of the train, test and validation text arrays.
tuple(split.shape for split in (X_train, X_test, X_valid))

((15837,), (4400,), (1760,))

In [92]:
# Word-level Keras tokenizer; num_words caps how many distinct words are kept
# (by frequency, per the Keras Tokenizer docs) when texts are turned into
# index sequences.
vocab_cap = 200000
tk = text.Tokenizer(num_words=vocab_cap)

In [93]:
# Build the vocabulary from the training texts only. Fitting on the test and
# validation texts as well leaks held-out data into preprocessing and adds
# embedding rows for words that never occur during training (so those rows
# keep their random initial values). Words unseen in training are simply
# dropped by texts_to_sequences, since no oov_token is configured.
tk.fit_on_texts(list(X_train))

In [94]:
max_len = 80


def _encode_titles(texts):
    """Turn raw post texts into index sequences padded/truncated to max_len."""
    return sequence.pad_sequences(tk.texts_to_sequences(texts), maxlen=max_len)


# The same encoding is applied to every split.
X_train_title = _encode_titles(X_train)
X_test_title = _encode_titles(X_test)
X_valid_title = _encode_titles(X_valid)

In [95]:
# Vocabulary learned by the tokenizer (word -> integer index); its size
# determines the embedding table size.
word_index = tk.word_index

# One-hot encode the labels of each split for the softmax output layer.
ytrain_enc, ytest_enc, yvalid_enc = (
    np_utils.to_categorical(split_labels)
    for split_labels in (y_train, y_test, y_valid)
)

In [96]:
model = Sequential()

# Trainable embedding layer: maps every word index to a 300-dimensional dense
# vector. The table has len(word_index) + 1 rows because Keras Tokenizer
# indices start at 1 (per the Keras docs), leaving index 0 for padding.
# input_length is tied to max_len so it always matches the padded sequences
# instead of repeating the literal 80.
model.add(Embedding(len(word_index) + 1, 300, input_length=max_len))
model.add(SpatialDropout1D(0.2))

# Single LSTM layer that reduces the embedded sequence to one 300-d vector.
model.add(LSTM(300, dropout=0.2, recurrent_dropout=0.2))

# Two identical fully connected blocks: Dense -> PReLU -> Dropout -> BatchNorm.
model.add(Dense(200))
model.add(PReLU())
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(200))
model.add(PReLU())
model.add(Dropout(0.2))
model.add(BatchNormalization())

# Two-way softmax over the one-hot encoded classes.
model.add(Dense(2))
model.add(Activation('softmax'))

# categorical_crossentropy is the loss that matches a softmax output with
# one-hot targets. binary_crossentropy happens to give the same value with
# exactly two classes, but it makes Keras resolve 'accuracy' to the binary
# metric and silently breaks if the number of classes ever grows.
# The optimizer is Adam with Keras' default settings.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the freshly built model for the full 10 epochs, validating on the
# held-out validation split after each one.
# initial_epoch is left at its default (0): it exists for resuming an earlier
# run, and since `epochs` is the index of the final epoch, initial_epoch=1
# made a new model skip the first epoch and train for only 9.
hist = model.fit(
    X_train_title,
    y=ytrain_enc,
    batch_size=32,
    epochs=10,
    verbose=1,
    validation_data=(X_valid_title, yvalid_enc),
    shuffle=True,
)

Train on 15837 samples, validate on 1760 samples
Epoch 1/10
Epoch 2/10

In [None]:
def evaluateModel(model, X, y):
    """Print a classification report and the accuracy of `model` on (X, y).

    Args:
        model: fitted model whose ``predict`` returns one row of class scores
            per sample.
        X: model inputs.
        y: one-hot encoded labels; column 1 is read as the true class id,
            which is only valid for the two-class encoding used here.

    Returns:
        None. Both metrics are printed.
    """
    # predict() returns the same scores as predict_proba() for this model,
    # and predict_proba() no longer exists on newer Keras Sequential models.
    scores = model.predict(X)
    y_pred = np.argmax(scores, axis=1)  # computed once, reused for both metrics
    y_true = y[:, 1]

    print(classification_report(y_true, y_pred))
    # scikit-learn's documented argument order is (y_true, y_pred).
    print(accuracy_score(y_true, y_pred))

In [None]:
# Metrics on the training split (shows how well the model fits the data it saw).
evaluateModel(model=model, X=X_train_title, y=ytrain_enc)

In [None]:
# Metrics on the held-out test split.
evaluateModel(model=model, X=X_test_title, y=ytest_enc)