# Introduction.

These are some (modest) attempts at participating in Jigsaw's toxic comments classification problem. For now, I am not using any external data, only the training data given (which is limiting as it's a tiny dataset).

In [116]:
import gensim
import keras
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Data loading.

In [None]:
# Both CSV files are indexed by their `id` column.
df = pd.read_csv('data/train.csv', index_col='id')
df_test = pd.read_csv('data/test.csv', index_col='id')
# One test input is missing data, so we will just replace it by an empty string.
# The result is assigned back instead of calling `fillna(..., inplace=True)` on
# the selected column: the in-place form mutates an intermediate object, which
# pandas warns about and which leaves `df_test` untouched under copy-on-write.
df_test['comment_text'] = df_test['comment_text'].fillna('')

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Number of training rows.
len(df)

# Pre-processing.

In [None]:
# Tokenise every training comment with gensim's `simple_preprocess`.
simple_tokens = df['comment_text'].apply(gensim.utils.simple_preprocess)

In [None]:
# Fit gensim's phrase model on the tokenised comments, then wrap it in a
# `Phraser`; that wrapper is what the rest of the notebook uses as `tokenizer`.
phrases = gensim.models.phrases.Phrases(simple_tokens)
tokenizer = gensim.models.phrases.Phraser(phrases)

In [None]:
# Run the phrase tokenizer over every comment and materialise the result as a list.
tokenized_text = list(tokenizer[simple_tokens])

In [None]:
# Inspect the tokens produced for the first comment.
tokenized_text[0]

In [None]:
# Token <-> integer id dictionary over the tokenised comments; it is queried
# further down through `doc2idx` and `token2id`.
corpus_dict = gensim.corpora.dictionary.Dictionary(tokenized_text)

In [None]:
# Every column after the first one is treated as a target label.
# NOTE(review): this selection is positional; it presumably relies on the
# comment text being the first column of train.csv. Confirm against the data.
TARGET_CLASSES = df.columns[1:]
# Label matrix: one row per comment, one column per target class.
targets = df[TARGET_CLASSES].values

# Analysis

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Histogram of comment lengths, measured in tokens of `tokenized_text`.
tokens_per_comment = [len(doc) for doc in tokenized_text]
ax = sns.distplot(tokens_per_comment, bins=100, kde=False, label='Number of tokens per comment.')
ax.set_xlabel("Tokens in a comment")
ax.set_ylabel("Frequency")
# Restrict the view to comments of at most 400 tokens.
ax.set_xlim((0, 400))

# Training word2vec on comment data.

In [None]:
# Train word2vec on the tokenised comments: 100-dimensional vectors, a window of
# 5, a minimum token count of 2 and 6 workers.
# NOTE(review): no seed is passed, so the vectors presumably differ between
# runs. Confirm whether reproducibility matters here.
word2vec = gensim.models.word2vec.Word2Vec(tokenized_text, window=5, size=100, min_count=2, workers=6)

In [117]:
# Sanity-check the embeddings: nearest neighbours of 'citation'.
word2vec.wv.most_similar('citation')

[('reference', 0.8449319005012512),
 ('references', 0.794470489025116),
 ('citations', 0.7923718690872192),
 ('source', 0.7737321257591248),
 ('reliable_source', 0.7457174062728882),
 ('secondary_source', 0.7404307723045349),
 ('refs', 0.7209465503692627),
 ('ref', 0.7174696922302246),
 ('sources', 0.7153703570365906),
 ('text', 0.7108933925628662)]

In [120]:
# Same check for a term expected in toxic comments.
word2vec.wv.most_similar('stupid')

[('dumb', 0.8813621997833252),
 ('pathetic', 0.8504430055618286),
 ('fucking', 0.8355610370635986),
 ('silly', 0.7887582182884216),
 ('dude', 0.7864328026771545),
 ('crazy', 0.7851026654243469),
 ('bullshit', 0.7817613482475281),
 ('idiot', 0.7728439569473267),
 ('funny', 0.7703394293785095),
 ('bitch', 0.7700645923614502)]

# word2vec-based model.

Aggregate word embeddings per comment (~ tf-idf weighted averaging: each word vector is divided by the word's corpus count before averaging), and use that as the input feature of a neural net with two hidden layers.

In [None]:
# One row per comment: the mean of its word vectors, each divided by the word's
# corpus count. Comments without any in-vocabulary token keep an all-zero row.
w2v_vocab = word2vec.wv.vocab
features = np.zeros((len(tokenized_text), word2vec.vector_size))
for row, doc_tokens in enumerate(tokenized_text):
    weighted_vectors = [word2vec.wv[tok] / w2v_vocab[tok].count
                        for tok in doc_tokens if tok in w2v_vocab]
    if weighted_vectors:
        features[row, :] = np.mean(weighted_vectors, axis=0)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

# Feed-forward classifier over the averaged word2vec features: two hidden ReLU
# layers (256 then 128 units) followed by one sigmoid output per target class.
model = Sequential([
    Dense(256, activation='relu', input_shape=(word2vec.vector_size,)),
    Dense(128, activation='relu'),
    Dense(len(TARGET_CLASSES), activation='sigmoid'),
])
model.compile(
    optimizer=Adam(lr=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs on the averaged features, with a validation split of 0.1.
model.fit(features, targets, epochs=10, validation_split=0.1)

# Sequential models

Simply averaging embeddings across all terms in a comment loses interactions that can occur between words, and the importance of their position. Because of this, we will now experiment with position-aware models: LSTM and CNN.

In [None]:
# Index 0 is reserved for padding, so every dictionary id is shifted up by one.
docs = [
    [token_id + 1 for token_id in corpus_dict.doc2idx(doc_tokens)]
    for doc_tokens in tokenized_text
]

In [None]:
# Fixed sequence length fed to the sequential models.
MAX_SEQ_LEN = 50
# Sequences longer than MAX_SEQ_LEN are cut at the end (`truncating='post'`);
# shorter ones are filled with the padding id 0.
padded_docs = keras.preprocessing.sequence.pad_sequences(
    docs,
    maxlen=MAX_SEQ_LEN,
    truncating='post',
    value=0,
)

In [None]:
# Largest shifted token id found in the corpus; the embedding tables below are
# sized from it.
max_idx = max(max(doc_ids) for doc_ids in docs if doc_ids)
max_idx

In [None]:
# Embedding matrix aligned with the shifted token ids: row 0 is the padding
# row, row k holds the vector of dictionary id k - 1. Tokens missing from the
# word2vec vocabulary (and the padding row) get a random normal vector.
# NOTE(review): no Embedding layer visible in this notebook is given this
# matrix. Confirm whether it was meant to initialise them.
embedding_rows = [np.random.normal(size=word2vec.vector_size)]
for idx in range(max_idx):
    token = corpus_dict[idx]
    if token in word2vec.wv.vocab:
        embedding_rows.append(word2vec.wv[token])
    else:
        embedding_rows.append(np.random.normal(size=word2vec.vector_size))
embeddings = np.array(embedding_rows)

## LSTM (WIP)

We use an LSTM with an embedding layer, and use padded sequences as an input to the model.

In [None]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, LSTM
from keras.layers.core import Dense, Dropout
from keras.layers.wrappers import TimeDistributed
from keras.layers import Convolution1D

# Embedding -> dropout -> LSTM -> one sigmoid output per target class.
model = Sequential()
# max_idx + 1 rows: the shifted token ids 1..max_idx plus the padding id 0.
# No initial weights are passed, so the embedding is learned from scratch.
model.add(Embedding(max_idx + 1, word2vec.vector_size, input_length=MAX_SEQ_LEN))
model.add(Dropout(0.25))
# The layer's default recurrent initializer is used on purpose. The previous
# `recurrent_initializer='identity'` does not fit an LSTM: its recurrent kernel
# stacks the four gates into a (units, 4 * units) matrix, while the identity
# initializer is documented for square 2D matrices only and is rejected with a
# ValueError by Keras releases that enforce this.
model.add(LSTM(256))
model.add(Dense(len(TARGET_CLASSES), activation='sigmoid'))
model.compile('rmsprop', 'binary_crossentropy')

In [None]:
# Train on the padded id sequences: batches of 256, 10 epochs, validation split of 0.1.
model.fit(padded_docs, targets, batch_size=256, epochs=10, validation_split=0.1)

## CNN

In [None]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN
from keras.layers.core import Dense, Dropout
from keras.layers.wrappers import TimeDistributed
from keras.layers import Convolution1D, MaxPool1D, Flatten, BatchNormalization

# Two (dropout -> batch norm -> 1D convolution -> max pooling) stages over the
# embedded sequence, flattened into one sigmoid output per target class.
# All kernels carry an L2 penalty.
model = Sequential([
    Embedding(max_idx + 1, word2vec.vector_size, input_length=MAX_SEQ_LEN),
    # Stage 1: 52 filters of width 5.
    Dropout(0.5),
    BatchNormalization(),
    Convolution1D(52, 5, padding='same',
                  kernel_regularizer=keras.regularizers.l2(0.01)),
    MaxPool1D(),
    # Stage 2: 128 filters of width 3.
    Dropout(0.5),
    BatchNormalization(),
    Convolution1D(128, 3, padding='same',
                  kernel_regularizer=keras.regularizers.l2(0.01)),
    MaxPool1D(),
    Flatten(),
    Dense(len(TARGET_CLASSES), activation='sigmoid',
          kernel_regularizer=keras.regularizers.l2(0.02)),
])
# `Adam` is the name imported by the earlier dense-model cell.
model.compile(Adam(0.001), 'binary_crossentropy')

In [None]:
# Train on the padded id sequences: batches of 512, 20 epochs, validation split of 0.1.
model.fit(padded_docs, targets, batch_size=512, epochs=20, validation_split=0.1)

# Creating a submission.

In [None]:
def comment_to_sequential_input(comment):
    """Encode one raw comment the way the training sequences were encoded.

    The comment is tokenised with `simple_preprocess` and the phrase
    `tokenizer`, mapped to dictionary ids shifted by one (0 is the padding id)
    and padded / truncated to `MAX_SEQ_LEN`.

    Args:
        comment: raw comment text.

    Returns:
        1-D array of `MAX_SEQ_LEN` token ids.
    """
    tokens = tokenizer[gensim.utils.simple_preprocess(comment)]
    token2id = corpus_dict.token2id
    # Keep every token the dictionary knows. The training sequences are built
    # from the dictionary alone, so filtering on the word2vec vocabulary as
    # well would drop tokens the models were trained on.
    # Tokens never seen in training have no id and are skipped.
    t_ids = [token2id[t] + 1 for t in tokens if t in token2id]
    # Truncate at the end, exactly like the training data: the default
    # ('pre') would keep the last MAX_SEQ_LEN tokens of a long comment, while
    # the models were fitted on the first MAX_SEQ_LEN tokens.
    return keras.preprocessing.sequence.pad_sequences(
        [t_ids], maxlen=MAX_SEQ_LEN, truncating='post', value=0)[0]

In [None]:
# Score one hand-written comment and print the result per target class.
sample_comment = "You are a jerk you freakin indian."
test_input = [comment_to_sequential_input(sample_comment).reshape(1, -1)]
sample_scores = model.predict(test_input)[0]
for target_class, class_score in zip(TARGET_CLASSES, sample_scores):
    print("{}: {:.2f}%".format(target_class, class_score * 100))

In [None]:
# Encode every test comment into a fixed-length id sequence (one row each).
test_inputs = np.array(list(map(comment_to_sequential_input, df_test.comment_text)))

In [None]:
# Predict with whichever `model` was defined last.
test_outputs = model.predict(test_inputs)

In [None]:
# Inspect the prediction for the first test comment.
test_outputs[0]

In [None]:
# Start the submission frame from the test ids only.
output_df = df_test.reset_index()[['id']].copy()

In [None]:
# One submission column per target class, taken from the matching column of
# the prediction matrix.
for column_idx, target_class in enumerate(TARGET_CLASSES):
    output_df[target_class] = test_outputs[:, column_idx]

In [None]:
# Eyeball 10 test comments scored above 0.5 for `toxic`, joined back to their
# text. The sample is seeded so the same rows are shown on every run.
output_df[output_df.toxic > 0.5].sample(10, random_state=0).merge(df_test.reset_index(), on='id')

In [None]:
import os

# `to_csv` does not create missing directories, so make sure the target folder
# exists before writing.
os.makedirs('submissions', exist_ok=True)
# NOTE(review): the file is named `lstm_1`, but when the notebook is run top to
# bottom the most recently defined `model` is the CNN. Confirm the name.
output_df.to_csv('submissions/lstm_1.csv', index=False)

# Annex

In [None]:
from keras.models import Sequential
from keras.layers import Convolution1D, Dense

# Dense baseline over the averaged word2vec features: one hidden ReLU layer of
# 512 units and one sigmoid output per target class.
model = Sequential()
# The first layer takes the averaged feature vector directly. The previous
# `Embedding(word2vec.vector_size)` could not be built (an Embedding needs both
# a vocabulary size and an output dimension) and expects integer token ids,
# whereas this model is fed float vectors of length `word2vec.vector_size`.
model.add(Dense(512, activation='relu', input_shape=(word2vec.vector_size,)))
model.add(Dense(len(TARGET_CLASSES), activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
def tokens_to_embedding(tokens):
    """Average the count-weighted word vectors of `tokens`.

    Each in-vocabulary token contributes its word2vec vector divided by the
    token's corpus count; tokens outside the vocabulary are ignored.

    Returns:
        The mean weighted vector, or a zero vector of length
        `word2vec.vector_size` when no token is in the vocabulary.
    """
    vocab = word2vec.wv.vocab
    weighted = []
    for token in tokens:
        if token in vocab:
            weighted.append(word2vec.wv[token] / vocab[token].count)
    if not weighted:
        return np.zeros(word2vec.vector_size)
    return np.mean(weighted, axis=0)

def text_to_embedding(text):
    """Tokenise raw `text` and return its averaged embedding.

    The text goes through `simple_preprocess` and the phrase `tokenizer`
    before being handed to `tokens_to_embedding`.
    """
    raw_tokens = gensim.utils.simple_preprocess(text)
    return tokens_to_embedding(tokenizer[raw_tokens])

# Score a harmless sentence; the averaged embedding is reshaped into a batch of one.
# NOTE(review): no `fit` call for this annex model is visible before this
# prediction. Confirm it is trained before trusting the scores.
text = 'hello moroccan friend is just a regular message without any insults'
model.predict(text_to_embedding(text).reshape(1, -1))

In [None]:
# Tokenise the test comments with the same preprocessing and phrase tokenizer
# used for the training comments.
test_tokens = tokenizer[df_test.comment_text.apply(gensim.utils.simple_preprocess)]

In [None]:
# Averaged-embedding features for every test comment (a list of vectors).
test_features = [tokens_to_embedding(tokens) for tokens in test_tokens]