In [3]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Dropout
from keras.layers import GRU, LSTM, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

import os
import time
import gc
import re

os.environ['OMP_NUM_THREADS'] = '4'

In [None]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
submission = pd.read_csv('../data/sample_submission.csv')

X_train = train["comment_text"].fillna("fillna").values
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].fillna("fillna").values

del train
del test

In [4]:
#https://drive.google.com/file/d/0B1yuv8YaUVlZZ1RzMFJmc1ZsQmM/view
# Aphost lookup dict
APPO = {
"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "I would",
"i'd" : "I had",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":" will",
"didn't": "did not",
"tryin'":"trying"
}

# https://www.kaggle.com/prashantkikani/hight-of-preprocessing-with-emoji
# https://www.kaggle.com/prashantkikani/pooled-gru-with-preprocessing

repl = {
    "&lt;3": " good ",
    ":d": " good ",
    ":dd": " good ",
    ":p": " good ",
    "8)": " good ",
    ":-)": " good ",
    ":)": " good ",
    ";)": " good ",
    "(-:": " good ",
    "(:": " good ",
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    ":/": " bad ",
    ":&gt;": " sad ",
    ":')": " sad ",
    ":-(": " bad ",
    ":(": " bad ",
    ":s": " bad ",
    ":-s": " bad ",
    "&lt;3": " heart ",
    ":d": " smile ",
    ":p": " smile ",
    ":dd": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    ":/": " worry ",
    ":&gt;": " angry ",
    ":')": " sad ",
    ":-(": " sad ",
    ":(": " sad ",
    ":s": " sad ",
    ":-s": " sad ",
    r"\br\b": "are",
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
    r"\bi'm\b": "i am",
    "m": "am",
    "r": "are",
    "u": "you",
    "haha": "ha",
    "hahaha": "ha",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "can not",
    "cannot": "can not",
    "i'm": "i am",
    "m": "am",
    "i'll" : "i will",
    "its" : "it is",
    "it's" : "it is",
    "'s" : " is",
    "that's" : "that is",
    "weren't" : "were not",
}

replacements = {**APPO, **repl}

In [5]:
stop_words = set(stopwords.words("english"))
lem = WordNetLemmatizer()

def clean(comment):
    """
    This function receives comments and returns clean word-list
    """
    #Convert to lower case , so that Hi and hi are the same
    comment=comment.lower()
    #remove \n
    # comment = re.sub("\\n","", comment)
    # remove leaky elements like ip,user
    comment = re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"," ip ", comment)
    # replace words like Gooood with Good
    comment = re.sub(r'(\w)\1{2,}', r'\1\1', comment)
    # replace ! and ? with <space>! and <space>? so they can be kept as tokens
    comment = re.sub(r'(!|\?)', " \\1 ", comment)   
        
    #Split the sentences into words
    words=comment.split(' ')
    
    # (')aphostophe  replacement (ie)   you're --> you are  
    # ( basic dictionary lookup : master dictionary present in a hidden block of code)
    words=[replacements[word] if word in replacements else word for word in words]
    #words=[lem.lemmatize(word, pos="v") for word in words]
    words = [w for w in words if not w in stop_words]
    
    clean_sent=" ".join(words)
    # remove any non alphanum,digit character
    #clean_sent=re.sub("\W+"," ",clean_sent)
    #clean_sent=re.sub("  "," ",clean_sent)
    return(clean_sent)

def count_ips(comment): #10081, 756
    return 1 if re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', comment) else 0
    
def count_abbreviations(comment): #67144, 61826
    return np.bitwise_or.reduce([1 if word in replacements else 0 for word in comment.lower().split(' ')], 0)

j = 0
def count_lemmatized(comment):  #138,570, 126,810
    global j
    
    if j > 100: return 0
    
    words = comment.lower().split(' ')
    changed = set([lem.lemmatize(word, pos="v") for word in words]) - set(words)
    if j < 20 and len(changed) > 0: print(changed, set(words) - set([lem.lemmatize(word, pos="v") for word in words])); j = j + 1
    return len(changed) > 0

i = 0
def count_goood(comment): #42345, 60569
    global i
    
    #if i < 20 and re.search(r'(\w)\1{2,}', comment): i = i+1; print(comment)
    return 1 if re.search(r'(\w)\1{2,}', comment) else 0

k = 0
def check_char_in_record(char, comment):
    global k
    
    if k < 20 and char in comment: print(comment); k = k + 1
    return 1 if char in comment else 0    

In [11]:
#train = pd.read_csv('../data/train.csv')
#test = pd.read_csv('../data/test.csv')

#print("Ips count: {}".format(sum(test["comment_text"].apply(lambda x: count_ips(x)))))
#print("Abbeviation count: {}".format(sum(test["comment_text"].apply(lambda x: count_abbreviations(x)))))
#print("Lemmatize count: {}".format(sum(test["comment_text"].apply(lambda x: count_lemmatized(x)))))
#print("Gooood count: {}".format(sum(test["comment_text"].apply(lambda x: count_goood(x)))))

c = '!'
print("Character {} appears: {}".format(c, sum(train["comment_text"].apply(lambda x: check_char_in_record(c, x)))))

Character ! appears: 22821


In [None]:
#train.shape
test.shape

In [9]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

start_time=time.time()

train["comment_text"] = train["comment_text"].apply(lambda x: clean(x))
test["comment_text"] = test["comment_text"].apply(lambda x: clean(x))

end_time=time.time()
print("total time till Cleaning", end_time-start_time)

train.to_csv('../data/train_cleaned.csv', index=False)
test.to_csv('../data/test_cleaned.csv', index=False)

gc.collect()

total time till Cleaning 18.93160104751587


14

In [14]:
print("Ips count: {}".format(sum(train["comment_text"].apply(lambda x: count_ips(x)))))
#print("Abbeviation count: {}".format(sum(train["comment_text"].apply(lambda x: count_abbreviations(x)))))
#print("Lemmatize count: {}".format(sum(train["comment_text"].apply(lambda x: count_lemmatized(x)))))
#print("Gooood count: {}".format(sum(train["comment_text"].apply(lambda x: count_goood(x)))))

c = '!'
print("Character {} appears: {}".format(c, sum(train["comment_text"].apply(lambda x: check_char_in_record(c, x)))))
#train["comment_text"][1]

Ips count: 0


"d'aww !  matches background colour i am seemingly stuck with. thanks.  (talk) 21:51, january 11, 2016 (utc)"

### Sequencing and embedding

In [15]:
embeddings = 'glove' #'glove', 'fasttext

if embeddings == 'fasttext':
    EMBEDDING_FILE = '../data/fasttext/crawl-300d-2M.vec' #'../data/fasttext/crawl-300d-2M.vec'
else:
    EMBEDDING_FILE = '../data/glove/glove.840B.300d.txt'    

max_features = 100000  #100000 , 30000
maxlen = 200
embed_size = 300
prefix = 'c1' #x, #c1

print(EMBEDDING_FILE)

../data/glove/glove.840B.300d.txt


In [20]:
train_file = 'train_cleaned.csv'
test_file = 'test_cleaned.csv'

#train_file = 'train.csv'
#test_file = 'test.csv'

train = pd.read_csv('../data/' + train_file)
test = pd.read_csv('../data/' + test_file)

X_train = train["comment_text"].fillna("fillna").values
X_test = test["comment_text"].fillna("fillna").values

tokenizer = text.Tokenizer(num_words=max_features, filters='"#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n',)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector
        
gc.collect()

0

In [22]:
max_features, len(word_index) #clean (100000, 392107) original (100000, 394787)
#first2pairs = {k: word_index[k] for k in list(word_index)[:10]}
#first2pairs

(100000, 392107)

In [25]:
import pickle

index_file = 'word_index_cleaned' #word_index_cleaned, word_index

#pickle.dump(word_index, open('../models/' + index_file, 'wb'))
word_index_cleaned = pickle.load(open('../models/' + index_file, 'rb') )

index_file = 'word_index' #word_index_cleaned, word_index

#pickle.dump(word_index, open('../models/' + index_file, 'wb'))
word_index = pickle.load(open('../models/' + index_file, 'rb') )

In [28]:
#len(word_index), len(word_index_cleaned)

print(len(word_index_cleaned.keys()) - len(word_index.keys()))
#diff1 = set(word_index_cleaned.keys()) - set(word_index.keys())
#diff2 = set(word_index.keys()) - set(word_index_cleaned.keys())

print(len(diff1), len(diff2))
#it = iter(diff1)
#next(it)

for i, e in enumerate(diff1):
    print(e)
    if i == 100: break

-2680
3091 5771
1860166
ooioioiknkjbjmb
nomdeplume600
f60e17fb345f12728ddab0a94d1405b8084f1d3
mn008122
frigilmm…
esherraann
iogaukrhjkwerhfrhrfakafyhurkruyiureuierrueirhrcgghhbhghhjdhguhgjhuhtghjhuhjhguhhgduthhghhgjhgk
ffuucckkyyoouu
francs200
buenoo
coo0ntface
kxxr
54005080010160020320040640081280162560325120650241300482600965201930403860807721615443230886461772923545847091694183388366776733553467106934213868427736855473710947421894843789687579375158750317500635001270025400
iiff
sadgg
blahhblahhblahblahblah
220417
ncc'
grrowlss
3326764
anubisii
tomica11
gb008237132gbgbxaimi
annies
jeangabin66
steven88
aajyfvlopugm
jjakegittes
rajukc11
danlt88
4faaibaj
ghdfjghfjghfjghfjgfhgjkkfhfdjghdfgfhhggdjdjdsgdjhdgfhdgdhgehdydhdhdgdfdhsywshasudydfbdfydfdfh
zicfaaibaj
mbrr
satann
texdsgdfgdfgdhgfddghgdgfdgfteretrerterrittcxczxcfhkljdhgvcmjh
aahhtchh
37746428
pmh001734
helloog
üğökppşppçöpp
fsddfssdtrdttghghghghggttkkddxxvv
mm'
cleeaan
bdell55
wheellss
bfogbstqetfsdfaefqff
mommaa
dipstick200
mmbeer


In [35]:
word_index_cleaned['a']

132

In [24]:
import pickle

train_feats_path = '../models/{}_train_feat_{}_seq_{}.pkl'.format(prefix, max_features, maxlen)
test_feats_path = '../models/{}_test_feat_{}_seq_{}.pkl'.format(prefix, max_features, maxlen)
embedding_matrix_path = '../models/{}_{}_embedding_matrix_feat_{}.pkl'.format(prefix, embeddings, max_features)

pickle.dump(x_train, open(train_feats_path, 'wb'))
pickle.dump(x_test, open(test_feats_path, 'wb'))
pickle.dump(embedding_matrix, open(embedding_matrix_path, 'wb'))

#x_train = pickle.load(open(train_feats_path, 'rb') )
#x_test = pickle.load(open(test_feats_path, 'rb') )
#embedding_matrix = pickle.load(open(embedding_matrix_path, 'rb') )

train = pd.read_csv('../data/train.csv')
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
submission = pd.read_csv('../data/sample_submission.csv')

del train

In [None]:
# for comparing
prefix = 'x' #x, #c1

train_feats_path_2 = '../models/{}_train_feat_{}_seq_{}.pkl'.format(prefix, max_features, maxlen)
test_feats_path_2 = '../models/{}_test_feat_{}_seq_{}.pkl'.format(prefix, max_features, maxlen)
embedding_matrix_path_2 = '../models/{}_{}_embedding_matrix_feat_{}.pkl'.format(prefix, embeddings, max_features)

x_train_2 = pickle.load(open(train_feats_path_2, 'rb') )
x_test_2 = pickle.load(open(test_feats_path_2, 'rb') )
embedding_matrix_2 = pickle.load(open(embedding_matrix_path_2, 'rb') )

In [None]:
#merge=pd.concat([X_train,test.iloc[:,0:2]])
#df=merge.reset_index(drop=True)

## Splitting

In [None]:
#y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

sum(y_train[:,0]), sum(y_train[:,0])/len(y_train[:,0])*100

In [None]:
#y_train[y_train[:,0] == 1,:]
y_packed = np.packbits(y_train, axis=1)
len(y), len(y_train[:,0])

In [None]:
len(y_packed[y_packed != 0])
# sum(y_train[:,0] | y_train[:,1]  | y_train[:,2]  | y_train[:,3]  | y_train[:,4]  | y_train[:,5])
#np.unpackbits(y_packed, axis=1)[:,0:6]

In [None]:
n_folds = 5
kfold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 32)

for i, (train_idx, valid_idx) in enumerate(kfold.split(X_train, y_packed)):
    if i > n_folds: break

    print(i, train_idx.shape)
    print("Running fold {} / {}".format(i + 1, n_folds))
    
    model = build_model(units = units, dr = dr, lr_i = lr_i, lr_f = lr_f, batch_size = batch_sizes, epoch = epochs)

    x_train, y_train = X_train[train_idx], Y_train[train_idx] 
    x_valid, y_valid = X_train[valid_idx], Y_train[valid_idx]

    history = model.fit(x_train, y_train, batch_size = batch_sizes, epochs = epochs, validation_data = (x_valid, y_valid), 
                          verbose = 2, callbacks = [check_point, early_stop, ra_val])
    model = load_model(file_path)
    pred += model.predict(X_test, batch_size = batch_sizes, verbose = 1)
    
preds = pred/n_folds    

## TODO

* Include ! 
* Goooood -> Good

### Useful features

* Repeated char acounts (!$?)
* All capital