In [128]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.callbacks import ModelCheckpoint,EarlyStopping

from gensim.models import KeyedVectors
from gensim.models import FastText as ft

from keras import layers, models, optimizers
from keras.regularizers import l2
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Conv1D,MaxPooling1D,Dropout,GlobalMaxPool1D,SpatialDropout1D
from small_dataset import get_small_dataset

In [119]:
cwd = os.getcwd()

In [21]:
#to convert glove data to word2vec data
#from gensim.scripts.glove2word2vec import glove2word2vec
#glove_input_file = 'glove.840B.300d.txt'
#word2vec_output_file = 'glove.840B.300d.txt.word2vec'
#glove2word2vec(glove_input_file, word2vec_output_file)

In [55]:
df = get_small_dataset()

In [4]:
X = df["content"]
y = df['label']

In [None]:
#This model has lots of missing vocab
#But this model is taken from news articles, which probably fits the clickbait titles well.
# filename = 'GoogleNews-vectors-negative300.bin'
# embed_model = KeyedVectors.load_word2vec_format(filename, binary=True) 

#This is now depreciated, we will use fasttext

In [90]:
embed_model2 = ft.load_fasttext_format("cc.en.300.bin")

  """Entry point for launching an IPython kernel.


In [96]:
vec_dim = len(embed_model2["cat"])

  """Entry point for launching an IPython kernel.


In [97]:
#pre-process the input matrix X
t = Tokenizer()
t.fit_on_texts(X)
vocab_size = len(t.word_index) + 1
encoded_X = t.texts_to_sequences(X)

max_length = 0
for i in encoded_X:
    max_length = max(len(i), max_length)
    
input_length = max_length + 1

padded_X = pad_sequences(encoded_X, maxlen = input_length,
                         padding = "post")

In [99]:
#Create the embedding matrix
#If the word is not in the vocab, then let it take a vector of all zeros
#Not the best idea, given that so many words are out of the vocab...

embedding_matrix = np.zeros((vocab_size, vec_dim))
for word,i in t.word_index.items():
    embedding_vector = embed_model2[word]
    embedding_matrix[i] = embedding_vector


  import sys


In [100]:
"""
Here we split the training data into training and validation, and print some general statistics about the training data
"""
# percentage of validation data
# percentage = 0.2

# (random_state): we use a fixed random seed so we get the same results every time.
# X_train, X_val, y_train, y_val = train_test_split(padded_X, y, test_size = percentage, random_state=51)

## First model: Just logistic regression

In [113]:
#Define model

model1 = Sequential()
e = Embedding(vocab_size, vec_dim, weights = [embedding_matrix],
              input_length = input_length, trainable = False)
model1.add(e)
model1.add(Flatten())
model1.add(Dense(1, activation = "sigmoid"))

model1.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

### summarize the model
print(model1.summary())

### fit the model
model1.fit(padded_X, y, epochs=10, verbose=1, validation_split = 0.2)

Model: "sequential_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_19 (Embedding)     (None, 5847, 300)         16605900  
_________________________________________________________________
flatten_17 (Flatten)         (None, 1754100)           0         
_________________________________________________________________
dense_28 (Dense)             (None, 1)                 1754101   
Total params: 18,360,001
Trainable params: 1,754,101
Non-trainable params: 16,605,900
_________________________________________________________________
None
Train on 1967 samples, validate on 492 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1a073861f28>

## Second model: Neural network with 3 hidden layers, with little regularisation but with dropout

In [142]:
#Hyper-Params
epochs = 50
batch_size = 32 #for stochastic gradient descent
n_dense = 100
dropout = 0.5


#Define model

model2 = Sequential()
e = Embedding(vocab_size, vec_dim, weights = [embedding_matrix],
              input_length = input_length, trainable = False)
model2.add(e)
model2.add(Flatten())
model2.add(Dropout(dropout))
model2.add(Dense(n_dense, kernel_regularizer=l2(0.0005), activation = "relu"))
model2.add(Dense(n_dense, kernel_regularizer=l2(0.0005), activation = "relu"))
model2.add(Dropout(dropout/2))
model2.add(Dense(n_dense, kernel_regularizer=l2(0.0005), activation = "relu"))
model2.add(Dense(1, activation = "sigmoid"))
model2.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

### summarize the model
print(model2.summary())

### Create model checkpoint
output_dir = cwd
modelcheckpoint = ModelCheckpoint(filepath = output_dir + "/model2weights.{epoch:02d}-{val_loss:.2f}.hdf5",
                                 monitor='val_loss', verbose=1, save_best_only=True)
callbacks_list = [modelcheckpoint, EarlyStopping(monitor = "val_loss", patience = 5)]

### fit the model
model2.fit(padded_X, y, batch_size = batch_size, epochs=epochs, verbose=1, validation_split = 0.2, callbacks = callbacks_list)

Model: "sequential_42"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_42 (Embedding)     (None, 5847, 300)         16605900  
_________________________________________________________________
flatten_28 (Flatten)         (None, 1754100)           0         
_________________________________________________________________
dropout_37 (Dropout)         (None, 1754100)           0         
_________________________________________________________________
dense_83 (Dense)             (None, 100)               175410100 
_________________________________________________________________
dense_84 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_38 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_85 (Dense)             (None, 100)             

<keras.callbacks.callbacks.History at 0x19f8405cf98>

## Third model: 1-D convnet

In [138]:
#Hyper-Params
epochs = 250
batch_size = 32 #for stochastic gradient descent
drop_embed = 0.1

n_dense = 256
dropout = 0.4

n_conv_layer1 = 100
n_conv_layer2 = 160
k_conv = 5

#Define model

model3 = Sequential()
e = Embedding(vocab_size, vec_dim, weights = [embedding_matrix],
              input_length = input_length, trainable = False)
model3.add(e)
model3.add(SpatialDropout1D(drop_embed))
model3.add(Conv1D(filters = n_conv_layer1, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(Conv1D(filters = n_conv_layer1, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(MaxPooling1D(3))
model3.add(Conv1D(filters = n_conv_layer1, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(Conv1D(filters = n_conv_layer1, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(MaxPooling1D(3))
model3.add(Conv1D(filters = n_conv_layer2, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(Conv1D(filters = n_conv_layer2, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(GlobalMaxPool1D())
#model3.add(Dense(n_dense, activation = "relu"))
model3.add(Dropout(dropout))
model3.add(Dense(1, activation = "sigmoid"))
model3.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

### summarize the model
print(model3.summary())

### Create model checkpoint
output_dir = cwd
modelcheckpoint = ModelCheckpoint(filepath = output_dir + "/model3weights.{epoch:02d}-{val_loss:.2f}.hdf5",
                                 monitor='val_loss', verbose=1, save_best_only=True)
callbacks_list = [modelcheckpoint, EarlyStopping(monitor = "val_loss", patience = 5)]

### fit the model
model3.fit(padded_X, y, batch_size = batch_size, epochs=epochs, verbose=1, validation_split = 0.2, callbacks = callbacks_list)

Model: "sequential_38"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_38 (Embedding)     (None, 5847, 300)         16605900  
_________________________________________________________________
spatial_dropout1d_13 (Spatia (None, 5847, 300)         0         
_________________________________________________________________
conv1d_42 (Conv1D)           (None, 5847, 100)         150100    
_________________________________________________________________
conv1d_43 (Conv1D)           (None, 5847, 100)         50100     
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 1949, 100)         0         
_________________________________________________________________
conv1d_44 (Conv1D)           (None, 1949, 100)         50100     
_________________________________________________________________
conv1d_45 (Conv1D)           (None, 1949, 100)       

<keras.callbacks.callbacks.History at 0x1a0ee9e4fd0>