In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

from load_dataset_title import get_dataset

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.callbacks import ModelCheckpoint,EarlyStopping

from gensim.models import KeyedVectors
from gensim.models import FastText as ft

from keras import layers, models, optimizers
from keras.regularizers import l2
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Conv1D,MaxPooling1D,Dropout,GlobalMaxPool1D,SpatialDropout1D,AveragePooling1D,GlobalAveragePooling1D


cwd = os.getcwd()

Using TensorFlow backend.


In [7]:
embed_model2 = ft.load_fasttext_format("cc.en.300.bin")

vec_dim = len(embed_model2["cat"])

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


In [2]:
df = get_dataset(size = "large") #use small for small dataset, large for large dataset. Will print the number of articles loaded.

X = df["title"]
y = df['label']

percentage = 0.2

# (random_state): we use a fixed random seed so we get the same results every time.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = percentage, random_state=51)

loading large document
19539
hi


In [4]:
##Finding length of longest article

def get_max_length():
    t = Tokenizer()
    t.fit_on_texts(X)
    encoded_X = t.texts_to_sequences(X)
    max_length = 0
    for i in encoded_X:
        max_length = max(len(i), max_length)
    
    return max_length + 1

input_length = get_max_length()
print(input_length)

237


In [5]:
def pre_process_input(array):
    t = Tokenizer()
    t.fit_on_texts(array)
    vocab_size = len(t.word_index) + 1
    encoded_X = t.texts_to_sequences(array)

    ##input_length is a global variable
    padded_X = pad_sequences(encoded_X, maxlen = input_length,
                         padding = "post")
    
    return (padded_X, t)
    

In [8]:
#Create the embedding matrix using the training data, as well as the padded X_train data
padded_X_train, t_train = pre_process_input(X_train)
vocab_size = len(t_train.word_index) + 1
input_length = len(padded_X_train[0])

embedding_matrix = np.zeros((vocab_size, vec_dim))
for word,i in t_train.word_index.items():
    embedding_vector = embed_model2[word]
    embedding_matrix[i] = embedding_vector
    


  


In [9]:
#Create the padded X_test data
padded_X_test = pre_process_input(X_test)[0]

## First model: Just logistic regression

In [10]:
#Hyper-Params
epochs = 50
batch_size = 32
patience = 3

#Define model
model1 = Sequential()
e = Embedding(vocab_size, vec_dim, weights = [embedding_matrix],
              input_length = input_length, trainable = False)
model1.add(e)
model1.add(Flatten())
model1.add(Dense(1, activation = "sigmoid"))

model1.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

### summarize the model
print(model1.summary())

### Create model checkpoint
output_dir = cwd
modelcheckpoint = ModelCheckpoint(filepath = output_dir + "/model1_titleweights.{epoch:02d}-{val_loss:.3f}.hdf5",
                                 monitor='val_loss', verbose=1, save_best_only=True)
callbacks_list = [modelcheckpoint, EarlyStopping(monitor = "val_loss", patience = patience)]

### fit the model
model1.fit(padded_X_train, y_train, batch_size = batch_size, epochs=epochs, verbose=1, validation_split = 0.2, callbacks = callbacks_list)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 237, 300)          7441500   
_________________________________________________________________
flatten_1 (Flatten)          (None, 71100)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 71101     
Total params: 7,512,601
Trainable params: 71,101
Non-trainable params: 7,441,500
_________________________________________________________________
None
Train on 12504 samples, validate on 3126 samples
Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.48226, saving model to C:\Users\Admin\Desktop\CS3244Project\CNN-kh-n-joyce/model1_titleweights.01-0.482.hdf5
Epoch 2/50

Epoch 00002: val_loss improved from 0.48226 to 0.47403, saving model to C:\Users\Admin\Desktop\CS3244Project\CNN-kh-n-joyce/model1_titlew

<keras.callbacks.callbacks.History at 0x25a46650c18>

In [11]:
### Evaluate the model
print(model1.metrics_names)
model1.evaluate(x=padded_X_test, y=y_test, verbose = 1)

['loss', 'accuracy']


[0.5595763912586497, 0.7558853626251221]

## Second model: Coming soon!

## Third model: 1-D convnet

In [13]:
#Hyper-Params
epochs = 20
batch_size = 32 #for stochastic gradient descent
drop_embed = 0.5

n_dense = 256
dropout = 0.5

n_conv_layer1 = 100
n_conv_layer2 = 150
n_conv_layer3 = 150
n_conv_layer4 = 150
#n_conv_layer5 = 100
k_conv = 3

#Define model

model3 = Sequential()
e = Embedding(vocab_size, vec_dim, weights = [embedding_matrix],
              input_length = input_length, trainable = False)
model3.add(e)
model3.add(SpatialDropout1D(drop_embed))
model3.add(Conv1D(filters = n_conv_layer1, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(Conv1D(filters = n_conv_layer1, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(MaxPooling1D(2))
model3.add(Conv1D(filters = n_conv_layer2, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(Conv1D(filters = n_conv_layer2, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(MaxPooling1D(2))
model3.add(Conv1D(filters = n_conv_layer3, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(Conv1D(filters = n_conv_layer3, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(MaxPooling1D(2))
model3.add(Conv1D(filters = n_conv_layer4, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(Conv1D(filters = n_conv_layer4, kernel_size = k_conv, activation = "relu", padding = "same"))
#model3.add(MaxPooling1D(2))
#model3.add(Conv1D(filters = n_conv_layer5, kernel_size = k_conv, activation = "relu", padding = "same"))
#model3.add(Conv1D(filters = n_conv_layer5, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(GlobalMaxPool1D())
model3.add(Dropout(dropout))
#model3.add(Dense(n_dense, activation = "relu"))
#model3.add(Dropout(dropout))
model3.add(Dense(1, activation = "sigmoid"))
model3.compile(optimizer='adam',
              loss='binary_crossentropy',
#              loss = "mean_squared_error",
              metrics=['accuracy'])

### summarize the model
print(model3.summary())

### Create model checkpoint
output_dir = cwd
modelcheckpoint = ModelCheckpoint(filepath = output_dir + "/model3_titleweights.{epoch:02d}-{val_loss:.3f}.hdf5",
                                 monitor='val_loss', verbose=1, save_best_only=True)
callbacks_list = [modelcheckpoint, EarlyStopping(monitor = "val_loss", patience = 5)]

#callbacks_list_no_chkpt = [EarlyStopping(monitor = "val_loss", patience = 2)]

### fit the model
model3.fit(padded_X_train, y_train, batch_size = batch_size, epochs=epochs, verbose=1, validation_split = 0.2, callbacks = callbacks_list)

#fit with no checkpointing
#model3.fit(padded_X_train, y_train, batch_size = batch_size, epochs=epochs, verbose=1, validation_split = 0.1, callbacks = callbacks_list_no_chkpt)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 237, 300)          7441500   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 237, 300)          0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 237, 100)          90100     
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 237, 100)          30100     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 118, 100)          0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 118, 150)          45150     
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 118, 150)         

<keras.callbacks.callbacks.History at 0x25a50923550>

In [14]:
### Evaluate the best model
model3_loaded = load_model("model3_titleweights.06-0.459.hdf5")
print(model3_loaded.metrics_names)
model3_loaded.evaluate(x=padded_X_test, y=y_test, verbose = 1)

['loss', 'accuracy']


[0.5529692149516138, 0.7589560151100159]