Bag of words MLP model with Keras

In [1]:
import os
import numpy as np
import pandas as pd

## Import data

In [2]:
# load doc into memory
def load_doc(filename):
    """Read a text file and return its contents as a list of lines.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read. It is opened in text mode ('r')
        with the platform's default encoding.

    Returns
    -------
    list of str
        The lines of the file in order, without their line terminators.
        An empty file yields an empty list.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    # the context manager closes the handle even if reading raises
    with open(filename, 'r') as file:
        text = file.read()
    # split on line boundaries, dropping the terminators
    return text.splitlines()

In [3]:
# absolute, symlink-resolved path of the parent of the current working directory;
# the data paths in the cells below are joined onto it
dir_path = os.path.realpath(os.pardir)

In [4]:
# processed training set, located relative to dir_path
path = 'data/processed/train.csv'
full_path = os.path.join(dir_path, path)

# row 0 supplies the column names; the first column becomes the index
df_train = pd.read_csv(full_path, header=0, index_col=0)

# report the frame's dimensions as (rows, columns)
print("Dataset has {} rows, {} columns.".format(df_train.shape[0], df_train.shape[1]))

Dataset has 95851 rows, 7 columns.


In [5]:
# processed test set, located relative to the parent of the working directory
path = 'data/processed/test.csv'
dir_path = os.path.realpath(os.pardir)
full_path = os.path.join(dir_path, path)

# row 0 supplies the column names; the first column becomes the index
df_test = pd.read_csv(full_path, header=0, index_col=0)

# report the frame's dimensions as (rows, columns)
print("Dataset has {} rows, {} columns.".format(df_test.shape[0], df_test.shape[1]))

Dataset has 226998 rows, 1 columns.


In [6]:
# vocabulary file, located relative to the parent of the working directory
path = 'data/processed/vocab.txt'
dir_path = os.path.realpath(os.pardir)
full_path = os.path.join(dir_path, path)

# vocab is a list holding one string per line of the file
vocab = load_doc(full_path)

## Encoding

In [7]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [8]:
# fit a tokenizer
def create_tokenizer(lines):
    """Create a ``Tokenizer`` with default settings and fit it on ``lines``.

    Parameters
    ----------
    lines : iterable of str
        Texts passed to ``Tokenizer.fit_on_texts``.

    Returns
    -------
    Tokenizer
        The fitted tokenizer instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [9]:
%%time
# The tokenizer is fitted on the lines read from vocab.txt, not on the comment
# text itself, so the word index is built from that vocabulary file.
tokenizer = create_tokenizer(vocab)

CPU times: user 293 ms, sys: 9.68 ms, total: 303 ms
Wall time: 314 ms


In [10]:
%%time
# encode data
Xtrain = tokenizer.texts_to_matrix(df_train.comment_text, mode='freq')
Xtest = tokenizer.texts_to_matrix(df_test.comment_text, mode='freq')

CPU times: user 10.2 s, sys: 115 ms, total: 10.4 s
Wall time: 10.5 s


In [11]:
# sanity check: both encoded matrices should have the same number of columns
print(*(matrix.shape for matrix in (Xtrain, Xtest)))

(95851, 24280) (226998, 24280)


## Train model

In [12]:
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense

In [13]:
# define the model
def define_model(n_words):
    """Build, compile and summarise a one-hidden-layer binary classifier.

    Parameters
    ----------
    n_words : int
        Width of the input vectors (used as ``input_shape=(n_words,)``).

    Returns
    -------
    Sequential
        A compiled model: Dense(8, relu) followed by Dense(1, sigmoid),
        with binary cross-entropy loss, the Adam optimizer and accuracy
        as the reported metric. The model summary is printed as a side effect.
    """
    # layers are created in network order: hidden first, then the output unit
    hidden = Dense(8, input_shape=(n_words,), activation='relu')
    output = Dense(1, activation='sigmoid')

    network = Sequential()
    for layer in (hidden, output):
        network.add(layer)

    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    network.summary()
    # optional diagram: plot_model(network, to_file='model.png', show_shapes=True)
    return network

In [14]:
# define the model
# the input width is the number of columns of the encoded matrix
# (Xtest.shape[1]; the shapes printed above show Xtrain has the same width)
n_words = Xtest.shape[1]
model = define_model(n_words)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 8)                 194248    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 9         
Total params: 194,257
Trainable params: 194,257
Non-trainable params: 0
_________________________________________________________________


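### Evaluate (not run)

The snippet below is kept for reference only. No `ytest` labels are defined in this notebook (the test set loaded above has a single column), so it cannot be executed as-is.

```python
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))
```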

In [15]:
def save_model(model, model_name):
    """Write a model's architecture and weights to two files.

    Parameters
    ----------
    model : object
        Must provide ``to_json()`` returning a string and
        ``save_weights(path)``.
    model_name : str
        Path prefix; ``<model_name>.json`` receives the architecture and
        ``<model_name>.h5`` is passed to ``save_weights``.

    Returns
    -------
    None
        Prints a confirmation message once both files have been written.
    """
    architecture_path = model_name + ".json"
    weights_path = model_name + ".h5"

    # serialise first, so a failure here does not leave an empty JSON file behind
    architecture = model.to_json()
    with open(architecture_path, "w") as handle:
        handle.write(architecture)

    # the weights are stored separately from the architecture
    model.save_weights(weights_path)
    print("Saved model to disk")

In [16]:
%%time
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for label in target:
    print('... Processing {}'.format(label))
    ytrain = df_train[label]
    model_name = 'model_' + label
    
    # train the model
    model.fit(Xtrain, ytrain, epochs=1, verbose=2)
    
    # save the model
    save_model(model, model_name)

... Processing toxic
Epoch 1/1
 - 48s - loss: 0.3355 - acc: 0.9036
Saved model to disk
... Processing severe_toxic
Epoch 1/1
 - 56s - loss: 0.0584 - acc: 0.9899
Saved model to disk
... Processing obscene
Epoch 1/1
 - 57s - loss: 0.2087 - acc: 0.9467
Saved model to disk
... Processing threat
Epoch 1/1
 - 54s - loss: 0.0233 - acc: 0.9968
Saved model to disk
... Processing insult
Epoch 1/1
 - 61s - loss: 0.1988 - acc: 0.9503
Saved model to disk
... Processing identity_hate
Epoch 1/1
 - 56s - loss: 0.0497 - acc: 0.9915
Saved model to disk
CPU times: user 4min 59s, sys: 2min 19s, total: 7min 19s
Wall time: 5min 32s
