In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

from load_dataset_post_text import get_dataset

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.callbacks import ModelCheckpoint,EarlyStopping

from gensim.models import KeyedVectors
from gensim.models import FastText as ft

from keras import layers, models, optimizers
from keras.regularizers import l2
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Conv1D,MaxPooling1D,Dropout,GlobalMaxPool1D,SpatialDropout1D,AveragePooling1D,GlobalAveragePooling1D


cwd = os.getcwd()

Using TensorFlow backend.


In [17]:
embed_model2 = ft.load_fasttext_format("cc.en.300.bin")

vec_dim = len(embed_model2["cat"])

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
df = get_dataset(size = "large") #use small for small dataset, large for large dataset. Will print the number of articles loaded.

X = df["postText"].astype(str)
y = df['label']

percentage = 0.2

# (random_state): we use a fixed random seed so we get the same results every time.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = percentage, random_state=51)

loading large document
19485


In [20]:
##Finding length of longest article

def get_max_length():
    t = Tokenizer()
    t.fit_on_texts(X)
    encoded_X = t.texts_to_sequences(X)
    max_length = 0
    for i in encoded_X:
        max_length = max(len(i), max_length)
    
    return max_length + 1

input_length = get_max_length()
print(input_length)

29


In [21]:
def pre_process_input(array):
    t = Tokenizer()
    t.fit_on_texts(array)
    vocab_size = len(t.word_index) + 1
    encoded_X = t.texts_to_sequences(array)

    ##input_length is a global variable
    padded_X = pad_sequences(encoded_X, maxlen = input_length,
                         padding = "post")
    
    return (padded_X, t)
    

In [22]:
#Create the padded X_train data and the padded X_test data
padded_X_train, t_train = pre_process_input(X_train)
vocab_size = len(t_train.word_index) + 1
input_length = len(padded_X_train[0])

padded_X_test = pre_process_input(X_test)[0]

In [23]:
#Create the embedding matrix using the training data
embedding_matrix = np.zeros((vocab_size, vec_dim))
for word,i in t_train.word_index.items():
    embedding_vector = embed_model2[word]
    embedding_matrix[i] = embedding_vector
    

  after removing the cwd from sys.path.


## First model: Just logistic regression

In [28]:
#Hyper-Params
epochs = 50
batch_size = 32
patience = 3

#Define model
model1 = Sequential()
e = Embedding(vocab_size, vec_dim, weights = [embedding_matrix],
              input_length = input_length, trainable = False)
model1.add(e)
model1.add(Flatten())
model1.add(Dense(1, activation = "sigmoid"))

model1.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

### summarize the model
print(model1.summary())

### Create model checkpoint
output_dir = cwd
modelcheckpoint = ModelCheckpoint(filepath = output_dir + "/model1_postTextweights.{epoch:02d}-{val_loss:.3f}.hdf5",
                                 monitor='val_loss', verbose=1, save_best_only=True)
callbacks_list = [modelcheckpoint, EarlyStopping(monitor = "val_loss", patience = patience)]

### fit the model
model1.fit(padded_X_train, y_train, batch_size = batch_size, epochs=epochs, verbose=1, validation_split = 0.1, callbacks = callbacks_list)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 29, 300)           6977400   
_________________________________________________________________
flatten_2 (Flatten)          (None, 8700)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 8701      
Total params: 6,986,101
Trainable params: 8,701
Non-trainable params: 6,977,400
_________________________________________________________________
None
Train on 14028 samples, validate on 1559 samples
Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.42632, saving model to C:\Users\Admin\Desktop\CS3244Project\CNN-kh-n-joyce/model1_postTextweights.01-0.426.hdf5
Epoch 2/50

Epoch 00002: val_loss improved from 0.42632 to 0.41391, saving model to C:\Users\Admin\Desktop\CS3244Project\CNN-kh-n-joyce/model1_post

<keras.callbacks.callbacks.History at 0x29f8dbf0f98>

In [29]:
### Evaluate the model
print(model1.metrics_names)
model1.evaluate(x=padded_X_test, y=y_test, verbose = 1)

['loss', 'accuracy']


[0.5478087533642948, 0.7677700519561768]

## Second model: Coming soon!

## Third model: 1-D convnet

In [32]:
#Hyper-Params
epochs = 20
batch_size = 32 #for stochastic gradient descent
drop_embed = 0.5

n_dense = 256
dropout = 0.5

n_conv_layer1 = 100
n_conv_layer2 = 150
n_conv_layer3 = 150
n_conv_layer4 = 150
#n_conv_layer5 = 100
k_conv = 3

#Define model

model3 = Sequential()
e = Embedding(vocab_size, vec_dim, weights = [embedding_matrix],
              input_length = input_length, trainable = False)
model3.add(e)
model3.add(SpatialDropout1D(drop_embed))
model3.add(Conv1D(filters = n_conv_layer1, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(Conv1D(filters = n_conv_layer1, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(MaxPooling1D(2))
model3.add(Conv1D(filters = n_conv_layer2, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(Conv1D(filters = n_conv_layer2, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(MaxPooling1D(2))
model3.add(Conv1D(filters = n_conv_layer3, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(Conv1D(filters = n_conv_layer3, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(MaxPooling1D(2))
model3.add(Conv1D(filters = n_conv_layer4, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(Conv1D(filters = n_conv_layer4, kernel_size = k_conv, activation = "relu", padding = "same"))
#model3.add(MaxPooling1D(2))
#model3.add(Conv1D(filters = n_conv_layer5, kernel_size = k_conv, activation = "relu", padding = "same"))
#model3.add(Conv1D(filters = n_conv_layer5, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(GlobalMaxPool1D())
model3.add(Dropout(dropout))
#model3.add(Dense(n_dense, activation = "relu"))
#model3.add(Dropout(dropout))
model3.add(Dense(1, activation = "sigmoid"))
model3.compile(optimizer='adam',
              loss='binary_crossentropy',
#              loss = "mean_squared_error",
              metrics=['accuracy'])

### summarize the model
print(model3.summary())

### Create model checkpoint
output_dir = cwd
modelcheckpoint = ModelCheckpoint(filepath = output_dir + "/model3_postTextweights.{epoch:02d}-{val_loss:.3f}.hdf5",
                                 monitor='val_loss', verbose=1, save_best_only=True)
callbacks_list = [modelcheckpoint, EarlyStopping(monitor = "val_loss", patience = 5)]

#callbacks_list_no_chkpt = [EarlyStopping(monitor = "val_loss", patience = 2)]

### fit the model
model3.fit(padded_X_train, y_train, batch_size = batch_size, epochs=epochs, verbose=1, validation_split = 0.1, callbacks = callbacks_list)

#fit with no checkpointing
#model3.fit(padded_X_train, y_train, batch_size = batch_size, epochs=epochs, verbose=1, validation_split = 0.1, callbacks = callbacks_list_no_chkpt)

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 29, 300)           6977400   
_________________________________________________________________
spatial_dropout1d_5 (Spatial (None, 29, 300)           0         
_________________________________________________________________
conv1d_33 (Conv1D)           (None, 29, 100)           90100     
_________________________________________________________________
conv1d_34 (Conv1D)           (None, 29, 100)           30100     
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 14, 100)           0         
_________________________________________________________________
conv1d_35 (Conv1D)           (None, 14, 150)           45150     
_________________________________________________________________
conv1d_36 (Conv1D)           (None, 14, 150)          

<keras.callbacks.callbacks.History at 0x29fc616d630>

In [37]:
### Evaluate the best model
model3_loaded = load_model("model3_postTextweights.01-0.359.hdf5")
print(model3_loaded.metrics_names)
model3_loaded.evaluate(x=padded_X_test, y=y_test, verbose = 1)

['loss', 'accuracy']


[0.5324315230816427, 0.7836797833442688]