In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

from load_dataset import get_dataset

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.callbacks import ModelCheckpoint,EarlyStopping

from gensim.models import KeyedVectors
from gensim.models import FastText as ft

from keras import layers, models, optimizers
from keras.regularizers import l2
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Conv1D,MaxPooling1D,Dropout,GlobalMaxPool1D,SpatialDropout1D


cwd = os.getcwd()

Using TensorFlow backend.


In [13]:
embed_model2 = ft.load_fasttext_format("cc.en.300.bin")

vec_dim = len(embed_model2["cat"])

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
df = get_dataset(size = "small") #use small for small dataset, large for large dataset. Will print the number of articles loaded.

X = df["content"]
y = df['label']

In [3]:
percentage = 0.2

# (random_state): we use a fixed random seed so we get the same results every time.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = percentage, random_state=51)

##Finding length of longest article
##Yes this is like "cheating" because I'm considering the test data too but I have no choice here...

def get_max_length():
    t = Tokenizer()
    t.fit_on_texts(X)
    encoded_X = t.texts_to_sequences(X)
    max_length = 0
    for i in encoded_X:
        max_length = max(len(i), max_length)
    
    return max_length + 1

input_length = get_max_length()

In [31]:
def pre_process_input(array):
    t = Tokenizer()
    t.fit_on_texts(array)
    vocab_size = len(t.word_index) + 1
    encoded_X = t.texts_to_sequences(array)

    ##input_length is a global variable
    padded_X = pad_sequences(encoded_X, maxlen = input_length,
                         padding = "post")
    
    return (padded_X, t)
    

In [32]:
#Create the embedding matrix using the training data, as well as the padded X_train data
padded_X_train, t_train = pre_process_input(X_train)
vocab_size = len(t_train.word_index) + 1
input_length = len(padded_X_train[0])

embedding_matrix = np.zeros((vocab_size, vec_dim))
for word,i in t_train.word_index.items():
    embedding_vector = embed_model2[word]
    embedding_matrix[i] = embedding_vector
    


  


In [33]:
#Create the padded X_test data
padded_X_test = pre_process_input(X_test)[0]

## First model: Just logistic regression

In [39]:
#Hyper-Params
epochs = 10
batch_size = 32
patience = 3

#Define model
model1 = Sequential()
e = Embedding(vocab_size, vec_dim, weights = [embedding_matrix],
              input_length = input_length, trainable = False)
model1.add(e)
model1.add(Flatten())
model1.add(Dense(1, activation = "sigmoid"))

model1.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

### summarize the model
print(model1.summary())

### Create model checkpoint
output_dir = cwd
modelcheckpoint = ModelCheckpoint(filepath = output_dir + "/model1weights.{epoch:02d}-{val_loss:.2f}.hdf5",
                                 monitor='val_loss', verbose=1, save_best_only=True)
callbacks_list = [modelcheckpoint, EarlyStopping(monitor = "val_loss", patience = patience)]

### fit the model
model1.fit(padded_X_train, y_train, batch_size = batch_size, epochs=epochs, verbose=1, validation_split = 0.2, callbacks = callbacks_list)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 5847, 300)         14930100  
_________________________________________________________________
flatten_3 (Flatten)          (None, 1754100)           0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 1754101   
Total params: 16,684,201
Trainable params: 1,754,101
Non-trainable params: 14,930,100
_________________________________________________________________
None
Train on 1573 samples, validate on 394 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.71731, saving model to C:\Users\Admin\Desktop\CS3244Project\CNN-kh-n-joyce/model1weights.01-0.72.hdf5
Epoch 2/10

Epoch 00002: val_loss did not improve from 0.71731
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.71731
Epoch 4/10

Epoch 00004: va

<keras.callbacks.callbacks.History at 0x296aee84748>

In [36]:
### Evaluate the model
print(model1.metrics_names)
model1.evaluate(x=padded_X_test, y=y_test, verbose = 1)

['loss', 'accuracy']


[0.8104413229275526, 0.6463414430618286]

## Second model: Coming soon!

## Third model: 1-D convnet

In [40]:
#Hyper-Params
epochs = 10
batch_size = 32 #for stochastic gradient descent
drop_embed = 0.1

n_dense = 256
dropout = 0.4

n_conv_layer1 = 100
n_conv_layer2 = 160
k_conv = 5

#Define model

model3 = Sequential()
e = Embedding(vocab_size, vec_dim, weights = [embedding_matrix],
              input_length = input_length, trainable = False)
model3.add(e)
model3.add(SpatialDropout1D(drop_embed))
model3.add(Conv1D(filters = n_conv_layer1, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(Conv1D(filters = n_conv_layer1, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(MaxPooling1D(3))
model3.add(Conv1D(filters = n_conv_layer1, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(Conv1D(filters = n_conv_layer1, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(MaxPooling1D(3))
model3.add(Conv1D(filters = n_conv_layer2, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(Conv1D(filters = n_conv_layer2, kernel_size = k_conv, activation = "relu", padding = "same"))
model3.add(GlobalMaxPool1D())
#model3.add(Dense(n_dense, activation = "relu"))
model3.add(Dropout(dropout))
model3.add(Dense(1, activation = "sigmoid"))
model3.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

### summarize the model
print(model3.summary())

### Create model checkpoint
output_dir = cwd
modelcheckpoint = ModelCheckpoint(filepath = output_dir + "/model3weights.{epoch:02d}-{val_loss:.2f}.hdf5",
                                 monitor='val_loss', verbose=1, save_best_only=True)
callbacks_list = [modelcheckpoint, EarlyStopping(monitor = "val_loss", patience = 5)]

### fit the model
#model3.fit(padded_X_train, y_train, batch_size = batch_size, epochs=epochs, verbose=1, validation_split = 0.2, callbacks = callbacks_list)

#fit with no checkpointing
model3.fit(padded_X_train, y_train, batch_size = batch_size, epochs=epochs, verbose=1, validation_split = 0.2)

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 5847, 300)         14930100  
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 5847, 300)         0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 5847, 100)         150100    
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 5847, 100)         50100     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 1949, 100)         0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 1949, 100)         50100     
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 1949, 100)        

<keras.callbacks.callbacks.History at 0x296b5fb5128>

In [41]:
### Evaluate the model
print(model3.metrics_names)
model3.evaluate(x=padded_X_test, y=y_test, verbose = 1)

['loss', 'accuracy']


[0.7192409256609474, 0.565040647983551]