Using a single LSTM model to test various cleaning steps and their impact on score.

Controls:
- LSTM single model
- maxlen: 65
- glove.840B.300d
- epochs: 2
- cv: 3
- max features 20000

In [1]:
import os
import logging

In [2]:
# Absolute path of the parent of the current working directory; every data and
# model path below is joined onto it.
dir_path = os.path.realpath(os.pardir)

In [3]:
# Import custom transformers from <dir_path>/src/features.
# NOTE: TextCleaner and KerasProcesser are defined again in later cells of this
# notebook; those definitions shadow the names imported here.
import sys

path = 'src/features'
full_path = os.path.join(dir_path, path)
sys.path.append(full_path)  # make the directory importable

from transformers import TextCleaner, KerasProcesser

Using TensorFlow backend.


## Import data

In [4]:
import numpy as np
import pandas as pd

In [5]:
# Read the training CSV: first row is the header, first column is the index.
path = 'data/raw/train.csv'
full_path = os.path.join(dir_path, path)

df_train = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(df_train.shape[0], df_train.shape[1]))

Dataset has 95851 rows, 7 columns.


In [6]:
# Read the test CSV: first row is the header, first column is the index.
path = 'data/raw/test.csv'
full_path = os.path.join(dir_path, path)

df_test = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(df_test.shape[0], df_test.shape[1]))

Dataset has 226998 rows, 1 columns.


In [7]:
# Replace every NaN with the string "unknown", modifying both frames in place.
for frame in (df_train, df_test):
    frame.fillna('unknown', inplace=True)

## Pre-processing

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold-out split configuration.
seed = 42
test_size = 0.2
corpus = 'comment_text'
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Seed NumPy's global generator; the same seed is passed to the split below.
np.random.seed(seed)

# Features are the text column, labels are the six target columns.
X, y = df_train[corpus], df_train[target]

Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

## Output test set for stacking

In [20]:
# Attach the held-out comment text to its labels for the stacking export.
# `assign` returns a new frame (rows aligned on the index), so nothing is
# written into the slice produced by train_test_split; the previous in-place
# column assignment triggered pandas' SettingWithCopyWarning.
ytest = ytest.assign(comment_text=Xtest)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [21]:
# Write the hold-out frame, including header row and index column, to
# data/processed/stacking.csv under the project directory.
path = 'data/processed/stacking.csv'
full_path = os.path.join(dir_path, path)
ytest.to_csv(path_or_buf=full_path, header=True, index=True)

## Model fit

In [10]:
from sklearn.model_selection import ParameterGrid
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.pipeline import Pipeline
from keras.callbacks import CSVLogger
from sklearn.metrics import log_loss

In [11]:
def save_model(model, model_path):
    """Write a model's architecture and weights next to each other on disk.

    The string returned by ``model.to_json()`` is written to
    ``model_path + ".json"`` and ``model.save_weights`` is called with
    ``model_path + ".h5"``. A confirmation line is printed afterwards.

    Parameters
    ----------
    model : object exposing ``to_json()`` and ``save_weights(path)``
    model_path : str
        Destination path without extension.
    """
    # architecture -> JSON
    architecture = model.to_json()
    with open(model_path + ".json", "w") as handle:
        handle.write(architecture)

    # weights -> HDF5
    weights_file = model_path + ".h5"
    model.save_weights(weights_file)

    print("Saved model to disk")

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
import re

class TextCleaner(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that tokenizes and cleans text rows.

    Parameters
    ----------
    regex : str
        Pattern handed to ``RegexpTokenizer`` to split each row into tokens.
    remove_digits : bool
        Drop tokens for which ``str.isdigit()`` is true.
    english_only : bool
        Keep only tokens present in ``nltk.corpus.words``.
    stop_words : collection of str or None
        Tokens to drop; ``None`` disables stop-word removal.
    lower : bool
        Lower-case every token.
    filters : str or None
        Regular expression whose matches are deleted from each token;
        ``None`` disables filtering.
    """

    def __init__(self, regex=r'\S+', remove_digits=False, english_only=False, stop_words=None, lower=True, filters=None):
        self.regex = regex
        self.remove_digits = remove_digits
        self.english_only = english_only
        self.stop_words = stop_words
        self.lower = lower
        self.filters = filters

    def transform(self, X, *args):
        """Clean every row of ``X`` and return a list of space-joined strings.

        Steps are applied per row in this order: tokenize, filter, lower-case,
        drop digit tokens, keep English words, drop stop words. A row that ends
        up with no tokens is replaced by the placeholder ``'cleaned'``.
        """
        tokenizer = RegexpTokenizer(self.regex)
        # Loop-invariant setup: compile the filter once and load the English
        # vocabulary once instead of rebuilding the set for every row.
        filter_pattern = re.compile(self.filters) if self.filters is not None else None
        english_words = set(nltk.corpus.words.words()) if self.english_only else None

        result = []
        for row in X:
            tokens = tokenizer.tokenize(row)
            if filter_pattern is not None:
                tokens = [filter_pattern.sub('', t) for t in tokens]
                # Drop tokens the filter emptied; otherwise the joined row is
                # whitespace-only and escapes the placeholder check below.
                tokens = [t for t in tokens if t]
            if self.lower:
                tokens = [t.lower() for t in tokens]
            if self.remove_digits:
                tokens = [t for t in tokens if not t.isdigit()]
            if english_words is not None:
                tokens = [t for t in tokens if t in english_words]
            if self.stop_words is not None:
                tokens = [t for t in tokens if t not in self.stop_words]
            tokens = ' '.join(tokens)
            if tokens == '':
                # Never emit an empty document.
                tokens = 'cleaned'
            result.append(tokens)
        return result

    def fit(self, *args):
        """No-op: the cleaner is stateless. Returns ``self``."""
        return self

In [13]:
from sklearn.base import BaseEstimator, TransformerMixin
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

class KerasProcesser(BaseEstimator, TransformerMixin):
    """Turn text rows into fixed-length integer sequences with a Keras Tokenizer.

    Parameters
    ----------
    num_words : int
        Passed to ``Tokenizer(num_words=...)``.
    maxlen : int
        Passed to ``pad_sequences(maxlen=...)``.

    Notes
    -----
    ``fit`` does nothing. Each call to ``transform`` creates a new tokenizer and
    fits it on the data it is given, so callers that need to encode other data
    with the same vocabulary must reuse the tokenizer returned by ``transform``.
    """

    def __init__(self, num_words, maxlen):
        self.maxlen = maxlen
        self.num_words = num_words

    def fit(self, *args):
        """No-op; returns ``self``."""
        return self

    def transform(self, X, *args):
        """Fit a tokenizer on ``X`` and encode ``X`` with it.

        Returns
        -------
        tuple
            ``(padded_sequences, tokenizer, maxlen)``; sequences are padded at
            the end (``padding='post'``).
        """
        text_tokenizer = Tokenizer(num_words=self.num_words)
        text_tokenizer.fit_on_texts(X)
        padded = pad_sequences(
            text_tokenizer.texts_to_sequences(X),
            maxlen=self.maxlen,
            padding='post',
        )
        return padded, text_tokenizer, self.maxlen

In [14]:
from keras.models import Sequential
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation

# Function to create model, required for KerasClassifier
def create_model():
    """Build, summarize and compile the bidirectional-LSTM classifier.

    Takes no arguments so it can be handed to ``KerasClassifier`` as
    ``build_fn``. It reads the module-level names ``vocab_size``,
    ``embedding_matrix`` and ``max_length``, which must exist before the call.

    Returns
    -------
    The compiled ``Sequential`` model (binary cross-entropy loss, Adam
    optimizer, accuracy metric).
    """
    stack = (
        Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_length),
        Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),
        GlobalMaxPool1D(),
        Dense(50, activation="relu"),
        Dropout(0.1),
        Dense(6, activation='sigmoid'),  # multi-label (k-hot encoding)
    )

    model = Sequential()
    for layer in stack:
        model.add(layer)

    # print the architecture, then compile the network
    model.summary()
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [20]:
# Load the whole embedding file into memory as {word: float32 vector}.
# The location can be overridden with the GLOVE_PATH environment variable; the
# default is the original machine-specific absolute path.
glove_path = os.environ.get('GLOVE_PATH', '/home/ec2-user/glove.840B.300d.txt')

embeddings_index = {}
# `with` closes the file even if a line fails to parse.
with open(glove_path, mode='rt', encoding='utf-8') as f:
    for line in f:
        # First space-separated field is the token, the rest are the coefficients.
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))



Loaded 2196016 word vectors.


In [21]:
# Preprocessing pipeline: clean the raw text, then tokenize and pad it.
p = Pipeline(steps=[
    ('cleaner', TextCleaner(remove_digits=False, english_only=False, stop_words=None)),
    ('keraser', KerasProcesser(num_words=20000, maxlen=65)),
])

# Single-point grid; keys address parameters of the 'keraser' step.
param_grid = dict(
    keraser__num_words=[20000],
    keraser__maxlen=[65],
)

In [22]:
%%time

i = 1

for g in ParameterGrid(param_grid):
    model_name = 'Glove840B300D' + str(i)
    logging.basicConfig(filename=model_name+'.log',level=logging.DEBUG)
    csv_logger = CSVLogger(model_name+'.csv', append=True, separator=';')
    print('{}. {}'.format(i, g))
    logging.info('{}. {}'.format(i, g))
    
    p.set_params(**g)
    padded_train, t, max_length = p.transform(Xtrain)
    vocab_size = len(t.word_index) + 1
    encoded_test = t.texts_to_sequences(Xtest)
    padded_test = pad_sequences(encoded_test, maxlen=max_length, padding='post')
    
    # create a weight matrix for words in training docs
    embedding_matrix = np.zeros((vocab_size, 300))
    for word, i in t.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    
    # fit model
    model = KerasClassifier(build_fn=create_model, epochs=2, verbose=1)
    print(vocab_size, max_length)
    model.fit(padded_train, ytrain, verbose=1, validation_split=0.1, callbacks=[csv_logger])
    
    # evaluate model on test dataset
    y_pred = model.model.predict(padded_test, verbose=1)
    hold_out_preds = pd.DataFrame(y_pred, index=ytest.index, columns=target)
    losses = []

    for label in target:
        loss = log_loss(ytest[label], hold_out_preds[label])
        losses.append(loss)
        print("{} log loss is {} .".format(label, loss))

    print("Combined log loss: {} .".format(np.mean(losses)))
    logging.info("Combined log loss: {} .".format(np.mean(losses)))
    
    # save the model
    model_path = os.path.join(dir_path, 'models', model_name)
    save_model(model.model, model_path)
    
    # submissions
#     encoded_submission = t.texts_to_sequences(df_test[corpus])
#     padded_submission = pad_sequences(encoded_submission, maxlen=max_length, padding='post')
#     y_submission = model.model.predict(padded_submission, verbose=1)
#     submission = pd.DataFrame(y_submission, index=df_test.index, columns=target)
#     path = 'data/submissions/' + model_name + '.csv'
#     full_path = os.path.join(dir_path, path)
#     submission.to_csv(full_path, header=True, index=True)
    
    i = i + 1

1. {'keraser__maxlen': 65, 'keraser__num_words': 20000}
133352 65
Train on 69012 samples, validate on 7668 samples
Epoch 1/2
Epoch 2/2
toxic log loss is 0.10156495118758965 .
severe_toxic log loss is 0.022527017843871566 .
obscene log loss is 0.052658542926488826 .
threat log loss is 0.011262463734633856 .
insult log loss is 0.06571059407757505 .
identity_hate log loss is 0.020881387189131666 .
Combined log loss: 0.04576749282654844 .
Saved model to disk
CPU times: user 2h 50min 15s, sys: 1h 4min 55s, total: 3h 55min 11s
Wall time: 44min 41s


In [23]:
# submissions
# Encode the competition test comments with the tokenizer from the training
# loop, pad them to the same length, predict, and write one column per label.
encoded_submission = t.texts_to_sequences(df_test[corpus])
padded_submission = pad_sequences(
    encoded_submission,
    maxlen=max_length,
    padding='post',
)
y_submission = model.model.predict(padded_submission, verbose=1)
submission = pd.DataFrame(data=y_submission, index=df_test.index, columns=target)

path = 'data/submissions/{}.csv'.format(model_name)
full_path = os.path.join(dir_path, path)
submission.to_csv(path_or_buf=full_path, header=True, index=True)



In [26]:
# Print the layer-by-layer summary of the Keras model wrapped by the classifier.
model.model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 65, 300)           40005600  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 65, 100)           140400    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 306       
Total params: 40,151,356
Trainable params: 40,151,356
Non-trainable params: 0
________________________________________________________________