In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

import keras
from keras.preprocessing import sequence,text
from keras.layers import SpatialDropout1D,Bidirectional,Dense,Dropout,concatenate,Conv1D,GlobalAveragePooling1D,GlobalMaxPooling1D,Input
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Model,Sequential
from keras.optimizers import Adam
from keras.layers.recurrent import GRU,LSTM
from keras.layers.embeddings import Embedding


# English stop-word list from NLTK; sent2vec (defined further down) filters tokens against it.
stop_words = stopwords.words('english')
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found under it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

Using TensorFlow backend.


/kaggle/input/glove840b300dtxt/glove.840B.300d.txt
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip


In [2]:
# Load the Jigsaw train/test comments and the sample submission frame, reading
# each one directly from its zipped CSV under ../input.
train_df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip")
test_df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip")
sub_df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip")

In [3]:
def add_features(df):
    """Add length, capitalisation and vocabulary statistics for each comment.

    The frame is modified in place and also returned, so callers may use
    either the return value or the object they passed in.

    Columns written:
        comment_text      -- every value coerced to ``str``
        total_length      -- number of characters
        capitals          -- number of upper-case characters
        caps_vs_length    -- capitals / total_length
        num_words         -- number of whitespace-delimited tokens
        num_unique_words  -- number of distinct whitespace-delimited tokens
        words_vs_unique   -- num_unique_words / num_words

    Both ratio columns are NaN for an empty comment (0 / 0), so they can be
    handled with a single ``fillna`` downstream.

    Args:
        df: DataFrame with a ``comment_text`` column.

    Returns:
        The same DataFrame object, with the columns above added.
    """
    df['comment_text'] = df['comment_text'].apply(str)
    comments = df['comment_text']

    df['total_length'] = comments.str.len()
    df['capitals'] = comments.apply(lambda comment: sum(1 for c in comment if c.isupper()))
    # Series division gives NaN for an empty comment instead of raising
    # ZeroDivisionError like a row-wise float(...) / float(...) would.
    df['caps_vs_length'] = df['capitals'] / df['total_length']

    # Raw string: '\S' is not a valid escape sequence in a normal string literal.
    df['num_words'] = comments.str.count(r'\S+')
    df['num_unique_words'] = comments.apply(lambda comment: len(set(comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']

    return df

In [4]:
# Compute the hand-crafted statistics on both frames (add_features also
# modifies the frame it is given).
train_df = add_features(train_df)
test_df = add_features(test_df)

# Only the two ratio columns are fed to the model; NaN ratios become 0.
features = train_df[['caps_vs_length','words_vs_unique']].fillna(0)
test_features = test_df[['caps_vs_length','words_vs_unique']].fillna(0)

# The scaler is fitted on train and test rows stacked together, then applied
# to each split separately.
ss = preprocessing.StandardScaler()
ss.fit(np.vstack((features,test_features)))
features = ss.transform(features)
test_features = ss.transform(test_features)

In [5]:
glove = '../input/glove840b300dtxt/glove.840B.300d.txt'

def load_embed(file):
    """Load a word-embedding file into a word -> vector lookup.

    Args:
        file: Path to the embedding file. The fastText crawl path is loaded
            through gensim as a word2vec-format file; any other path is read
            as text with one ``word v1 v2 ...`` entry per line, separated by
            single spaces.

    Returns:
        For the fastText path, the gensim ``KeyedVectors`` object; otherwise
        a ``dict`` mapping each word to a float32 numpy array.
    """
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    if file == '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec':
        # Imported lazily: gensim is only needed for this branch and is not
        # imported anywhere else in the notebook.
        from gensim.models import KeyedVectors
        # Load the path that was passed in (the old code used an undefined name).
        embeddings_index = KeyedVectors.load_word2vec_format(file)
    else:
        # Context manager so the (multi-gigabyte) file handle is closed
        # as soon as the dict has been built.
        with open(file, encoding='latin') as handle:
            embeddings_index = dict(get_coefs(*line.split(" ")) for line in handle)
    return embeddings_index

In [6]:
# Load the GloVe vectors once; every later coverage check and the embedding
# matrix read from this lookup.
embed_glove = load_embed(glove)

In [7]:
def build_vocab(sentences, verbose=True):
    """Count how many times each token occurs across ``sentences``.

    Args:
        sentences: Iterable of token sequences.
        verbose: When False the tqdm progress bar is disabled.

    Returns:
        dict mapping each token to its occurrence count.
    """
    counts = {}
    for tokens in tqdm(sentences, disable=(not verbose)):
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts
                
                

In [8]:
# build_vocab and check_coverage use tqdm, so this import has to run before
# either of them is called.
from tqdm import tqdm
# Whitespace-tokenise every raw comment.
sentences = train_df['comment_text'].apply(lambda x : x.split()).values
sentences_test = test_df['comment_text'].apply(lambda x : x.split()).values
# Token -> frequency table, built from the training comments only.
vocab = build_vocab(sentences)

100%|██████████| 159571/159571 [00:03<00:00, 50472.57it/s]


In [9]:
import operator

def check_coverage(vocab,embeddings_index):
    """Report how much of ``vocab`` has an entry in ``embeddings_index``.

    Prints two percentages: the share of distinct words that have an
    embedding, and the share of all token occurrences they account for.

    Args:
        vocab: dict mapping word -> occurrence count.
        embeddings_index: Lookup supporting ``embeddings_index[word]`` and
            raising ``KeyError`` for unknown words.

    Returns:
        List of ``(word, count)`` pairs for the words without an embedding,
        ordered from most to least frequent.
    """
    oov = {}
    covered_words = 0
    covered_occurrences = 0
    oov_occurrences = 0
    for word in tqdm(vocab):
        try:
            embeddings_index[word]
        except KeyError:
            # Only a missing key means "out of vocabulary"; any other error
            # is a real problem and is allowed to propagate.
            oov[word] = vocab[word]
            oov_occurrences += vocab[word]
        else:
            covered_words += 1
            covered_occurrences += vocab[word]

    # Guard the ratios so an empty vocabulary reports 0% instead of crashing.
    total_words = len(vocab)
    total_occurrences = covered_occurrences + oov_occurrences
    word_share = covered_words / total_words if total_words else 0.0
    text_share = covered_occurrences / total_occurrences if total_occurrences else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(word_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))
    # Ascending sort then reversal, which keeps the original tie ordering.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x
            
    

In [10]:
import re
# English contractions mapped to their expanded forms (with separate "I..." and
# "i..." entries); _get_contractions below turns the keys into one regex.
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}

def _get_contractions(contraction_dict):
    contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
    return contraction_dict, contraction_re

contractions, contractions_re = _get_contractions(contraction_dict)

def replace_contractions(text):
    """Return ``text`` with every match of ``contractions_re`` replaced by
    the expansion stored for it in ``contractions``."""
    return contractions_re.sub(lambda match: contractions[match.group(0)], text)

In [11]:
# Expand contractions in the training comments, then rebuild the token
# frequency table from the updated text.
train_df['comment_text'] = train_df['comment_text'].apply(lambda x : replace_contractions(x))
sentences = train_df['comment_text'].apply(lambda x : x.split()).values
vocab = build_vocab(sentences)

100%|██████████| 159571/159571 [00:03<00:00, 51591.47it/s]


In [12]:
# GloVe coverage after contraction expansion; oov holds the uncovered words, most frequent first.
oov = check_coverage(vocab,embed_glove)

100%|██████████| 531781/531781 [00:00<00:00, 626529.47it/s]

Found embeddings for 27.55% of vocab
Found embeddings for  88.26% of all text





In [13]:
# Apply the same contraction expansion to the test comments.
test_df['comment_text'] = test_df['comment_text'].apply(lambda x : replace_contractions(x))
sentences_test = test_df['comment_text'].apply(lambda x : x.split()).values

In [14]:
# Strings that clean_text2 replaces with a single space: ASCII punctuation,
# assorted symbols, a few accented letters, and one multi-character run.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√','ⓒ','´','☼','☺','♣','♠',
  'ω', '∇∆∇∆','𒁳','₪','«', ]

In [15]:
def clean_text2(x):
    """Return ``str(x)`` with every entry of ``puncts`` replaced by a space."""
    cleaned = str(x)
    for symbol in puncts:
        # str.replace leaves the text untouched when the symbol is absent.
        cleaned = cleaned.replace(symbol, ' ')
    return cleaned

In [16]:
# Replace the listed punctuation/symbols with spaces in the training comments,
# then rebuild the token frequency table.
train_df['comment_text'] = train_df['comment_text'].apply(lambda x : clean_text2(x))
sentences = train_df["comment_text"].apply(lambda x: x.split()).values
vocab = build_vocab(sentences)

100%|██████████| 159571/159571 [00:02<00:00, 54663.85it/s]


In [17]:
# GloVe coverage after punctuation removal.
oov = check_coverage(vocab,embed_glove)

100%|██████████| 232261/232261 [00:00<00:00, 619957.45it/s]

Found embeddings for 68.49% of vocab
Found embeddings for  98.78% of all text





In [18]:
# Apply the same punctuation removal to the test comments.
test_df['comment_text'] = test_df['comment_text'].apply(lambda x : clean_text2(x))
sentences_test = test_df["comment_text"].apply(lambda x: x.split()).values

In [19]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


# Lower-case spellings to normalise: British -> American spellings, missing
# apostrophes, and a few names mapped to a generic phrase.
mispell_dict = {'colour':'color',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium'

                }
# Keep both the lookup table and the regex compiled from its keys.
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    """Return ``text`` with every match of ``mispellings_re`` replaced by
    the value stored for it in ``mispellings``."""
    return mispellings_re.sub(lambda match: mispellings[match.group(0)], text)

In [20]:
# Normalise the listed spellings in the training comments, then rebuild the
# token frequency table (here from a Series rather than a numpy array).
train_df["comment_text"] = train_df["comment_text"].apply(lambda x: replace_typical_misspell(x))
sentences = train_df["comment_text"].apply(lambda x: x.split())
vocab = build_vocab(sentences)

100%|██████████| 159571/159571 [00:02<00:00, 55269.03it/s]


In [21]:
# GloVe coverage after spelling normalisation.
oov = check_coverage(vocab,embed_glove)

100%|██████████| 232224/232224 [00:00<00:00, 654544.63it/s]


Found embeddings for 68.48% of vocab
Found embeddings for  98.78% of all text


In [22]:
# Apply the same spelling normalisation to the test comments.
test_df["comment_text"] = test_df["comment_text"].apply(lambda x: replace_typical_misspell(x))
sentences_test = test_df["comment_text"].apply(lambda x: x.split())

In [23]:
# Hand-picked out-of-vocabulary strings and their replacements. They are
# applied as plain substring replacements, one key at a time in this order.
oov_words = {'MOTHJER':'mother','BeCauSe':'because','DENEID':'denied','Bastered':'bastard','peNis':'penis','AIDSAIDS':'AIDS','HAAHHAHAHAH':'laugh','PenIS':'penis','pneis':'penis','pennnis':'penis','ahahahahahahahahahahahahahahahahahahaha' : 'laugh'
            ,'POLITCAL': 'political','FooL':'fool','PaTHeTiC':'pathetic','ReSPeCT':'respect','hellor':'hello','AfDs':'AIDS','pensnsnnienSNsn':'penis','F5FFFA': ' ','\u200e': ' ','☎': 'telephone','✉':'mail','2014\u200e':' ','VANDALIZING':'vandalising'
            ,'DIREKTOR': 'director','REVERTING':'revert','Hipocrite':'hypocrite'}

In [24]:
# Apply each oov_words replacement to train and test comments. Each key makes
# a full pass over both columns, and str.replace matches substrings, not
# whole words.
for key,value in oov_words.items():
    #print(key)
    train_df['comment_text'] = train_df['comment_text'].apply( lambda x : x.replace(key,value))
    test_df['comment_text'] = test_df['comment_text'].apply( lambda x : x.replace(key,value))
    
# Rebuild the token frequency table from the updated training comments.
sentences = train_df["comment_text"].apply(lambda x: x.split())
vocab = build_vocab(sentences)

100%|██████████| 159571/159571 [00:02<00:00, 55237.32it/s]


In [25]:
# GloVe coverage after the manual out-of-vocabulary replacements.
oov = check_coverage(vocab,embed_glove)

100%|██████████| 232048/232048 [00:00<00:00, 648437.80it/s]

Found embeddings for 68.54% of vocab
Found embeddings for  98.82% of all text





In [26]:
# The six label columns; train_y holds them as one array, in this column order.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_y = train_df[list_classes].values

In [27]:
def sent2vec(s):
    """Turn a sentence into one L2-normalised vector built from GloVe.

    The text is lower-cased and tokenised with ``word_tokenize``; stop words
    and non-alphabetic tokens are dropped; the ``embed_glove`` vectors of the
    remaining tokens are summed and the sum is scaled to unit length.

    Args:
        s: Sentence (any object; it is converted with ``str``).

    Returns:
        numpy array: the normalised sum, or ``np.zeros(300)`` when no token
        has an embedding or the summed vector has zero length.
    """
    tokens = word_tokenize(str(s).lower())
    tokens = [w for w in tokens if w not in stop_words and w.isalpha()]

    vectors = []
    for w in tokens:
        try:
            vectors.append(embed_glove[w])
        except KeyError:
            # Token has no embedding; any other error is not swallowed.
            continue

    if not vectors:
        return np.zeros(300)

    v = np.array(vectors).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Avoid dividing by zero, which would fill the result with NaN.
        return np.zeros(300)
    return v / norm

In [28]:
# 80/20 train/validation split of the lower-cased comments and their labels.
xtrain,xval,ytrain,yval = train_test_split(train_df.comment_text.str.lower().values,train_y,random_state=42,shuffle=True,test_size=0.2)

In [29]:
# Split the hand-crafted features with the same random_state, shuffle and
# test_size as the text split above.
# NOTE(review): row alignment with xtrain/xval relies on both calls staying in sync — confirm before changing either.
feat_tr,feat_val,ytrain_a,yval_a = train_test_split(features,train_y,random_state=42,shuffle=True,test_size=0.2)

In [30]:
#xtrain_glove = [ sent2vec(x) for x in tqdm(xtrain)]
#xval_glove = [ sent2vec(x) for x in tqdm(xval)]

In [31]:
# Lower-cased test comments, matching the lower-casing of the train/validation texts.
xtest = test_df.comment_text.str.lower().values
# num_words is the vocabulary cap handed to the Keras Tokenizer.
token = text.Tokenizer(num_words=100000,lower=True)
# Length every sequence is padded or truncated to.
max_len = 150

# The tokenizer is fitted on train, validation and test texts together.
token.fit_on_texts(list(xtrain)+list(xval)+list(xtest))
xtrain_seq = token.texts_to_sequences(xtrain)
xval_seq = token.texts_to_sequences(xval)
xtest_seq = token.texts_to_sequences(xtest)

xtrain_pad = sequence.pad_sequences(xtrain_seq,maxlen=max_len)
xval_pad = sequence.pad_sequences(xval_seq,maxlen=max_len)
xtest_pad = sequence.pad_sequences(xtest_seq,maxlen=max_len)

# word -> integer index mapping learned by the tokenizer.
word_index = token.word_index




In [32]:
# One 300-wide row per tokenizer index, covering every word in word_index.
# Words without a GloVe vector keep an all-zero row.
# NOTE(review): the tokenizer lower-cases its vocabulary (lower=True) and the
# lookup is an exact-key match, so entries that only exist in GloVe with
# capital letters presumably cannot be found — confirm this is intended.
embedding_matrix = np.zeros((len(word_index)+1,300))
for word,i in tqdm(word_index.items()):
    embedding_vector = embed_glove.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

100%|██████████| 356122/356122 [00:01<00:00, 317769.80it/s]


In [33]:
# Two inputs: the scaled hand-crafted features and the padded token-id sequences.
feat_inputs = Input(shape=(features.shape[1], ))
sequence_inputs = Input(shape=(max_len, ))
# Embedding layer initialised from embedding_matrix and frozen (trainable=False).
x = Embedding(len(word_index)+1,300,weights=[embedding_matrix],trainable=False)(sequence_inputs)
x = SpatialDropout1D(0.2)(x)
#x = Bidirectional(LSTM(128,return_sequences=True,recurrent_dropout=0.1,dropout=0.1))(x)
#x = Bidirectional(LSTM(64,return_sequences=True,recurrent_dropout=0.1,dropout=0.1))(x)
#x = Conv1D(64,kernel_size=3,padding="valid",kernel_initializer = "glorot_uniform")(x)
x = Bidirectional(LSTM(40, return_sequences=True))(x)
# NOTE(review): a GRU has no separate cell state; with return_state=True on a
# Bidirectional GRU the two extra outputs are presumably the forward and
# backward final states, so x_c would be the backward state. Only x_h is used
# below — confirm that dropping x_c is intended.
x, x_h, x_c = Bidirectional(GRU(40, return_sequences=True, return_state = True))(x)  
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
# Pooled sequence summaries, one recurrent state and the hand-crafted features
# are concatenated before the output layer.
x = concatenate([avg_pool,x_h,max_pool,feat_inputs])
#x = Dense(128,activation='relu')(x)
#x = Dropout(0.3)(x)
# Six independent sigmoid outputs, one per entry of list_classes.
preds = Dense(6,activation='sigmoid')(x)

model = Model([sequence_inputs,feat_inputs],preds)
model.compile(loss='binary_crossentropy',optimizer=Adam(lr=1e-3),metrics=['accuracy'])

# Stop once validation accuracy has not improved for 5 epochs.
earlystop = EarlyStopping(monitor='val_accuracy',patience=5,min_delta=0,verbose=0,mode='max')

filepath = "weights_base.best.hdf5"

# Only the weights with the best validation accuracy so far are kept on disk.
model_checkpoint = ModelCheckpoint(filepath,monitor='val_accuracy',save_best_only=True)

In [34]:
# Train for up to 20 epochs; the callbacks checkpoint the best weights and may stop training early.
model.fit([xtrain_pad,feat_tr],y=ytrain,epochs=20,batch_size=128,verbose=1,validation_data=([xval_pad,feat_val],yval),callbacks=[model_checkpoint,earlystop])

Train on 127656 samples, validate on 31915 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


<keras.callbacks.callbacks.History at 0x7ff341ddbda0>

In [35]:
# Restore the checkpointed weights before predicting, rather than using the
# weights from the last training epoch.
model.load_weights(filepath)

preds_all = model.predict([xtest_pad,test_features],batch_size=1024,verbose=1)

# Write the six prediction columns into the sample-submission frame.
sub_df[list_classes] = preds_all



In [36]:
# Save the submission file to the working directory, without the index column.
sub_df.to_csv('submission_lstm_con.csv',index=False)