In [1]:
import codecs
import csv
import io
import os
import re
import sys

import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


In [8]:
# Lookup table mapping lower-case English contractions to their expanded form.
# Keys must be unique: in a dict literal a repeated key silently overrides the
# earlier entry, so each contraction is listed exactly once.
APPO = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    # "'d" is ambiguous (would / had); expanded as "would" like every other
    # "'d" entry in this table.
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "isn't": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "I have",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    # Bare suffix token; the leading space in the value is kept as-is.
    "'re": " are",
    "wasn't": "was not",
    "we'll": "we will",
    "tryin'": "trying",
}

In [None]:
import re
from nltk.tokenize import TweetTokenizer  
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.corpus import stopwords

# Module-level NLTK helpers, built once and reused for every comment.
tweettokenizer = TweetTokenizer()   # tokenizer used by clean()
lem = WordNetLemmatizer()           # lemmatizer used by clean()
eng_stopwords = set(
    stopwords.words("english")
)  # English stop-word set (only referenced from commented-out code in clean())

def clean(comment):
    """Normalise one raw comment string and return the cleaned string.

    Steps, in order:
      1. replace newlines with spaces;
      2. delete dotted-quad IP addresses;
      3. delete ``[[...]]`` spans (the username markup targeted by the
         original code);
      4. collapse runs of whitespace;
      5. tokenize with the module-level ``tweettokenizer``;
      6. expand contractions via the module-level ``APPO`` table
         (case-insensitive lookup);
      7. lemmatize every token as a verb with the module-level ``lem``;
      8. join the tokens back with single spaces.

    Stop-word removal and stripping of non-alphanumeric characters are
    currently not applied.

    Args:
        comment: the raw comment text (a string).

    Returns:
        The cleaned comment as a single space-separated string.
    """
    # Newlines -> spaces so the later patterns see the comment as one line.
    comment = re.sub(r"\n", " ", comment)
    # IP addresses: the dots must be escaped, otherwise "." matches any
    # character and the pattern also deletes things like "10/12/2005".
    comment = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", "", comment)
    # [[...]] markup: non-greedy so that only each bracketed span is removed,
    # not everything between the first "[[" and the last "]" of the comment.
    comment = re.sub(r"\[\[.*?\]+", "", comment)
    # Collapse repeated whitespace.
    comment = " ".join(comment.split())

    words = tweettokenizer.tokenize(comment)

    # APPO keys are lower-case, so look tokens up lower-cased; tokens without
    # an entry are kept exactly as they were (original casing preserved).
    words = [APPO.get(word.lower(), word) for word in words]
    words = [lem.lemmatize(word, "v") for word in words]

    return " ".join(words)

In [3]:
# Input locations: pre-trained GloVe vectors and the train/test CSVs.
glove_file = './dataset/glove.6B.50d.txt'
train_file = './dataset/train.csv'
test_file = './dataset/test.csv'

train = pd.read_csv(train_file)
test = pd.read_csv(test_file)

# Label columns predicted by the models; y holds them as a plain array.
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[classes].values

# Raw comment text; missing values are replaced by the literal string "nan".
sent_train = train["comment_text"].fillna("nan")
sent_test = test["comment_text"].fillna("nan")

In [16]:
# Display one raw training comment (index 2) before any cleaning is applied.
sent_train[2]

"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info."

In [None]:
# Display the result of clean() on the training comment at index 2.
clean(sent_train[2])

In [18]:
# Apply clean() to every training comment.
clean_sent_train = sent_train.apply(clean)

In [19]:
# Spot-check the first cleaned training comment.
# NOTE(review): the saved output below is lower-cased with stop words and
# punctuation removed, which the clean() defined in this notebook does not do;
# it looks like it came from an earlier revision — re-run to refresh.
clean_sent_train[0]

u'explanation edit make username hardcore metallica fan revert werent vandalisms closure gas vote new york dolls fac please dont remove template talk page since im retire'

In [20]:
# Apply clean() to every test comment.
clean_sent_test = sent_test.apply(clean)

In [21]:
# Persist the cleaned text next to the untouched remaining columns.
# The frames are copied from the already-loaded `train` / `test` instead of
# re-reading both CSV files from disk; the copies keep the originals intact.
clean_train = train.copy()
clean_test = test.copy()

clean_train["comment_text"] = clean_sent_train
clean_test["comment_text"] = clean_sent_test

clean_train.to_csv('clean_train.csv', index=False)
clean_test.to_csv('clean_test.csv', index=False)

In [22]:
# Sizes shared by the tokenizer, the embedding matrix and both models.
max_words_count = 20000    # passed to Tokenizer as num_words
embedding_size = 50        # width of one embedding vector
max_words_length = 100     # passed to pad_sequences as maxlen

# The vocabulary is fitted on the cleaned training comments only and then
# reused to encode both the training and the test comments.
tokenizer = Tokenizer(num_words=max_words_count)
tokenizer.fit_on_texts(clean_sent_train)

tokens_train = tokenizer.texts_to_sequences(clean_sent_train)
tokens_test = tokenizer.texts_to_sequences(clean_sent_test)

# Model inputs: integer sequences padded to max_words_length.
x_train = pad_sequences(tokens_train, maxlen=max_words_length)
x_test = pad_sequences(tokens_test, maxlen=max_words_length)

def index_to_embed(word, *embedding):
    """Turn the fields of one embedding row into a ``(word, vector)`` pair.

    Args:
        word: the first field of the row.
        *embedding: the remaining fields, i.e. the vector components.

    Returns:
        A tuple of ``word`` (unchanged) and the components converted to a
        float32 NumPy array.
    """
    vector = np.asarray(embedding, dtype='float32')
    return word, vector

# Read the embedding file into {word: float32 vector}.
# - `with` closes the file handle once parsing is done.
# - io.open with an explicit UTF-8 encoding behaves the same on Python 2 and 3
#   and does not depend on the platform's default encoding.
with io.open(glove_file, encoding='utf8') as glove_handle:
    embed_dict = dict(index_to_embed(*line.strip().split()) for line in glove_handle)

# np.stack needs a real sequence; on Python 3 dict.values() is only a view,
# so materialise it as a list first.
all_embs = np.stack(list(embed_dict.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

word_idx = tokenizer.word_index

# Start from random vectors drawn with the same mean/std as the loaded
# embeddings, so words without a pre-trained vector keep a random row.
embedding_matrix = np.random.normal(emb_mean, emb_std, (max_words_count, embedding_size))

# Overwrite the rows of words that have a pre-trained vector. Indices at or
# beyond max_words_count fall outside the matrix and are skipped.
for word, i in word_idx.items():
    if i >= max_words_count:
        continue
    vec_temp = embed_dict.get(word)
    if vec_temp is not None:
        embedding_matrix[i] = vec_temp

In [23]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Network: embedding (initialised from embedding_matrix) -> bidirectional LSTM
# -> global max pooling -> dense(50, relu) -> dropout -> 6 sigmoid outputs.
inp = Input(shape=(max_words_length,))
x = Embedding(max_words_count, embedding_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(embedding_size, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keep the weights of the epoch with the lowest validation loss on disk.
file_path = "lstm_preprocess_best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# NOTE(review): with epochs=2 below, a patience of 20 never triggers.
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [checkpoint, early]
model.fit(x_train, y, batch_size=32, epochs=2, validation_split=0.1, callbacks=callbacks_list)

# Predict with the best checkpoint rather than whatever the last epoch left
# in memory; otherwise the saved checkpoint is never used.
model.load_weights(file_path)

y_test = model.predict([x_test], batch_size=1024, verbose=1)
sample_submission = pd.read_csv('./dataset/sample_submission.csv')
sample_submission[classes] = y_test

# Make sure the output folder exists so the write cannot fail after training.
if not os.path.isdir('submission'):
    os.makedirs('submission')
sample_submission.to_csv('submission/submission_preprocess_lstm.csv', index=False)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


In [24]:
from keras.layers import GRU

# Same architecture as the LSTM model, with a bidirectional GRU as the
# recurrent layer.
inp_1 = Input(shape=(max_words_length,))
x_1 = Embedding(max_words_count, embedding_size, weights=[embedding_matrix])(inp_1)
x_1 = Bidirectional(GRU(embedding_size, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x_1)
x_1 = GlobalMaxPool1D()(x_1)
x_1 = Dense(50, activation="relu")(x_1)
x_1 = Dropout(0.1)(x_1)
x_1 = Dense(6, activation="sigmoid")(x_1)
model1 = Model(inputs=inp_1, outputs=x_1)
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keep the weights of the epoch with the lowest validation loss on disk.
file_path = "gru_preprocess_best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# NOTE(review): with epochs=2 below, a patience of 20 never triggers.
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [checkpoint, early]
model1.fit(x_train, y, batch_size=32, epochs=2, validation_split=0.1, callbacks=callbacks_list)

# Predict with the best checkpoint rather than whatever the last epoch left
# in memory; otherwise the saved checkpoint is never used.
model1.load_weights(file_path)

y_test_1 = model1.predict([x_test], batch_size=1024, verbose=1)
sample_submission = pd.read_csv('./dataset/sample_submission.csv')
sample_submission[classes] = y_test_1

# Make sure the output folder exists so the write cannot fail after training.
if not os.path.isdir('submission'):
    os.makedirs('submission')
sample_submission.to_csv('submission/submission_preprocess_GRU.csv', index=False)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


In [25]:
# Blend the two saved submissions by averaging their label columns.
file_lstm = 'submission/submission_preprocess_lstm.csv'
file_GRU = 'submission/submission_preprocess_GRU.csv'
p_lstm = pd.read_csv(file_lstm)
p_gru = pd.read_csv(file_GRU)

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Start from a copy of the LSTM submission so all other columns carry over,
# then replace the label columns with the element-wise mean of both models.
p_res_avg = p_lstm.copy()
p_res_avg[label_cols] = p_gru[label_cols].add(p_lstm[label_cols]).div(2)

p_res_avg.to_csv('submission_preprocess_lstm+gru_avg.csv', index=False)
