Using a LSTM single model to text various cleaning steps and impact on score.

Controls:
- CNN single model
- maxlen: 65
- min occurance vocab: 5
- glove.6B.100D
- epochs: 2
- cv: 3
- max features 20000

In [1]:
import os
import logging

In [2]:
dir_path = os.path.realpath('..')

In [3]:
# Import custom transformers

path = 'src/features'
full_path = os.path.join(dir_path, path)
import sys
sys.path.append(full_path)
from transformers import TextCleaner, KerasProcesser

Using TensorFlow backend.


## Import data

In [4]:
import numpy as np
import pandas as pd

In [5]:
path = 'data/raw/train.csv'

full_path = os.path.join(dir_path, path)
df_train = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_train.shape))

Dataset has 95851 rows, 7 columns.


In [6]:
path = 'data/raw/test.csv'

full_path = os.path.join(dir_path, path)
df_test = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_test.shape))

Dataset has 226998 rows, 1 columns.


In [7]:
# fill NaN with string "unknown"
df_train.fillna('unknown',inplace=True)
df_test.fillna('unknown',inplace=True)

## Pre-processing

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
seed = 42
np.random.seed(seed)
test_size = 0.2
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
corpus = 'comment_text'

X = df_train[corpus]
y = df_train[target]


Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=test_size, random_state=seed)

In [10]:
max_features=20000
max_length=65

## Model fit

In [11]:
from sklearn.model_selection import ParameterGrid
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.pipeline import Pipeline
from keras.callbacks import CSVLogger
from sklearn.metrics import log_loss

In [12]:
from keras.models import Sequential
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation

# Function to create model, required for KerasClassifier
def create_model(optimizer='adam', max_features=max_features, max_length=max_length):
    model = Sequential()
    model.add(Embedding(max_features, 100, input_length=max_length))
    model.add(Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)))
    model.add(GlobalMaxPool1D())
    model.add(Dense(50, activation="relu"))
    model.add(Dropout(0.1))
    model.add(Dense(6, activation='sigmoid'))  #multi-label (k-hot encoding)
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [13]:
def save_model(model, model_path):
    # serialize model to JSON
    model_json = model.to_json()
    with open(model_path + ".json", "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights(model_path + ".h5")
    print("Saved model to disk")

In [14]:
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
import re

class TextCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, regex='\S+', remove_digits=False, english_only=False, stop_words=None, lower=True, filters=None):
        self.regex = regex
        self.remove_digits = remove_digits
        self.english_only = english_only
        self.stop_words = stop_words
        self.lower = lower
        self.filters = filters
        
    def transform(self, X, *args):
        tokenizer = RegexpTokenizer(self.regex)
        result = []
        for row in X:
            tokens = tokenizer.tokenize(row)
            if self.filters is not None:
                tokens = [re.sub(self.filters, '', t) for t in tokens]
            if self.lower:
                tokens = [t.lower() for t in tokens]
            if self.remove_digits:
                tokens = [t for t in tokens if not t.isdigit()]
            if self.english_only:
                english_words = set(nltk.corpus.words.words())
                tokens = [t for t in tokens if t in english_words]
            if self.stop_words is not None:
                tokens = [t for t in tokens if not t in self.stop_words]
            tokens = ' '.join(tokens)
            if tokens == '':
            	tokens = 'cleaned'
            result.append(tokens)
        return result
    
    def fit(self, *args):
        return self

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

class KerasProcesser(BaseEstimator, TransformerMixin):
    def __init__(self, num_words, maxlen):
        self.num_words = num_words
        self.maxlen = maxlen
        
    def transform(self, X, *args):
        tokenizer = Tokenizer(self.num_words, lower=False, filters='')
        tokenizer.fit_on_texts(X)
        # vocab_size = len(tokenizer.word_index) + 1
        result = tokenizer.texts_to_sequences(X)
        result = pad_sequences(result, maxlen=self.maxlen, padding='post')
        return result, tokenizer
    
    def fit(self, *args):
        return self

In [16]:
model = KerasClassifier(build_fn=create_model, epochs=2, verbose=1)

In [17]:
p = Pipeline([
    ('cleaner', TextCleaner()),
    ('keraser', KerasProcesser(num_words=max_features, maxlen=max_length))#,
])

param_grid = {"cleaner__regex": ['\S+'],
              "cleaner__remove_digits": [False],
              "cleaner__english_only": [False],
              "cleaner__stop_words": [None],
              "cleaner__filters": [r'[!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n]'],
              "cleaner__lower": [True],
              "keraser__num_words": [max_features],
              "keraser__maxlen": [max_length]
             }

In [18]:
%%time

i = 1

for g in ParameterGrid(param_grid):
    model_name = 'grid_' + str(i)
    logging.basicConfig(filename=model_name+'.log',level=logging.DEBUG)
    csv_logger = CSVLogger(model_name+'.csv', append=True, separator=';')
    print('{}. {}'.format(i, g))
    
    p.set_params(**g)
    padded_train, t = p.transform(Xtrain)
    encoded_test = t.texts_to_sequences(Xtest)
    padded_test = pad_sequences(encoded_test, maxlen=max_length, padding='post')
    model.fit(padded_train, ytrain, verbose=1, callbacks=[csv_logger])
    
    # evaluate model on test dataset
    y_pred = model.predict_proba(padded_test, verbose=1)
    hold_out_preds = pd.DataFrame(y_pred, index=ytest.index, columns=target)
    losses = []

    for label in target:
        loss = log_loss(ytest[label], hold_out_preds[label])
        losses.append(loss)
        print("{} log loss is {} .".format(label, loss))

    print("Combined log loss: {} .".format(np.mean(losses)))
    
    # save the model
    model_path = os.path.join(dir_path, 'models', model_name)
    save_model(model.model, model_path)
    
    # submissions
    encoded_submission = t.texts_to_sequences(df_test[corpus])
    padded_submission = pad_sequences(encoded_submission, maxlen=max_length, padding='post')
    y_submission = model.predict_proba(padded_submission, verbose=1)
    submission = pd.DataFrame(y_submission, index=df_test.index, columns=target)
    path = 'data/submissions/' + model_name + '.csv'
    full_path = os.path.join(dir_path, path)
    submission.to_csv(full_path, header=True, index=True)
    
    i = i + 1

1. {'cleaner__english_only': False, 'cleaner__filters': '[!"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n]', 'cleaner__lower': True, 'cleaner__regex': '\\S+', 'cleaner__remove_digits': False, 'cleaner__stop_words': None, 'keraser__maxlen': 65, 'keraser__num_words': 20000}
Epoch 1/2
Epoch 2/2
toxic log loss is 0.15848096213519694 .
severe_toxic log loss is 0.042444288585190504 .
obscene log loss is 0.10191033820340938 .
threat log loss is 0.016362210122636837 .
insult log loss is 0.111508092123731 .
identity_hate log loss is 0.032494380135582854 .
Combined log loss: 0.0772000452176246 .
Saved model to disk
CPU times: user 1h 7min 49s, sys: 8min 40s, total: 1h 16min 30s
Wall time: 12min 11s


In [23]:
submission

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6044863,0.005247,1.891804e-05,0.001366,2.077233e-05,0.000644,0.000144
6102620,0.000906,1.028812e-06,0.000300,4.490527e-07,0.000088,0.000011
14563293,0.004032,4.582534e-06,0.000757,3.926712e-06,0.000587,0.000066
21086297,0.007348,2.681844e-05,0.001776,2.987232e-05,0.001171,0.000235
22982444,0.184843,2.204332e-03,0.030813,3.226800e-03,0.043333,0.011476
24388733,0.000150,1.451989e-07,0.000068,4.980582e-08,0.000011,0.000002
26195914,0.006653,1.279643e-05,0.001468,9.306658e-06,0.001113,0.000147
31769073,0.006708,1.999508e-05,0.001528,2.029322e-05,0.001116,0.000194
35289443,0.376515,4.511880e-03,0.069746,7.257162e-03,0.111367,0.022983
38393350,0.017252,4.490855e-05,0.003229,3.448565e-05,0.003556,0.000468
