Bag of words MLP model with Keras

In [5]:
import os
import numpy as np
import pandas as pd

## Import data

In [6]:
# load doc into memory
def load_doc(filename):
    # open the file as read only 
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file file.close()
    return text

In [7]:
dir_path = os.path.realpath('..')

In [8]:
path = 'data/interim/train.csv'

full_path = os.path.join(dir_path, path)
df_train = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_train.shape))

Dataset has 95851 rows, 7 columns.


In [10]:
path = 'data/interim/test.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)
df_test = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_test.shape))

Dataset has 226998 rows, 1 columns.


In [11]:
path = 'data/interim/vocab.txt'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)
vocab = load_doc(full_path)

## Encoding

In [12]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [13]:
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [14]:
%%time
tokenizer = create_tokenizer(df_train.comment_text)

CPU times: user 4.48 s, sys: 30.5 ms, total: 4.51 s
Wall time: 4.53 s


In [15]:
%%time
# encode data
Xtrain = tokenizer.texts_to_matrix(df_train.comment_text, mode='freq')
Xtest = tokenizer.texts_to_matrix(df_test.comment_text, mode='freq')

CPU times: user 25.8 s, sys: 3.25 s, total: 29 s
Wall time: 30.1 s


In [16]:
print(Xtrain.shape, Xtest.shape)

(95851, 46666) (226998, 46666)


## Train model

In [17]:
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense

In [24]:
# define the model
def define_model(n_words):
    # define network
    model = Sequential()
    model.add(Dense(8, input_shape=(n_words,), activation='relu')) 
    model.add(Dense(1, activation='sigmoid'))
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
    # summarize defined model
    model.summary()
    # plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [25]:
# define the model
n_words = Xtest.shape[1]
model = define_model(n_words)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 8)                 373336    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 9         
Total params: 373,345
Trainable params: 373,345
Non-trainable params: 0
_________________________________________________________________


# evaluate
loss, acc = model.evaluate(Xtest, ytest, verbose=0) 
print('Test Accuracy: %f' % (acc*100))

In [26]:
def save_model(model, model_name):
    # serialize model to JSON
    model_json = model.to_json()
    with open(model_name + ".json", "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights(model_name + ".h5")
    print("Saved model to disk")

In [27]:
%%time
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for label in target:
    print('... Processing {}'.format(label))
    ytrain = df_train[label]
    model_name = 'model_' + label
    
    # train the model
    model.fit(Xtrain, ytrain, epochs=1, verbose=2)
    
    # save the model
    save_model(model, model_name)

... Processing toxic
Epoch 1/1
 - 96s - loss: 0.2876 - acc: 0.9084
Saved model to disk
... Processing severe_toxic
Epoch 1/1
 - 96s - loss: 0.0407 - acc: 0.9888
Saved model to disk
... Processing obscene
Epoch 1/1
 - 105s - loss: 0.1354 - acc: 0.9533
Saved model to disk
... Processing threat
Epoch 1/1
 - 100s - loss: 0.0208 - acc: 0.9951
Saved model to disk
... Processing insult
Epoch 1/1
 - 98s - loss: 0.1217 - acc: 0.9567
Saved model to disk
... Processing identity_hate
Epoch 1/1
 - 96s - loss: 0.0395 - acc: 0.9891
Saved model to disk
CPU times: user 9min 50s, sys: 4min 31s, total: 14min 22s
Wall time: 9min 50s
