In [1]:
import numpy as np
import pandas as pd
from keras import models, layers, Model
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

Using TensorFlow backend.


In [2]:
# Read the training and test splits from CSV files in the working directory
# (training file first, then the test file).
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Show (rows, columns) for each split, one tuple per line.
print(train_df.shape, test_df.shape, sep='\n')

(1804874, 45)
(97320, 2)


In [4]:
# Preview the first rows of the raw training frame to inspect its columns.
train_df.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.0,4,47


In [5]:
# Keep only the columns used below: the row id, the raw comment text and the
# `target` score; every other column loaded from train.csv is discarded.
# NOTE(review): rebinding `train_df` means the original wide frame can only be
# recovered by re-reading the CSV.
train_df = train_df[['id','comment_text','target']]

In [6]:
# Make the `id` column the row index of both splits (modified in place).
for split_df in (train_df, test_df):
    split_df.set_index('id', inplace=True)

In [7]:
# Binary labels: 1 where the `target` score is at least 0.5, otherwise 0.
is_positive = train_df['target'] >= 0.5
train_y_label = np.where(is_positive, 1, 0)
# The score column is not needed in the feature frame once labels exist.
train_df.drop(columns=['target'], inplace=True)

## Clean Punctuation

In [8]:
def clean_punc(data):
    """Replace punctuation and special symbols in every text with a space.

    Parameters
    ----------
    data : pandas.Series
        Raw texts. Missing entries (None/NaN) are treated as empty strings;
        every other value is converted with ``str`` before cleaning.

    Returns
    -------
    pandas.Series
        A new Series with the same index as ``data`` in which each character
        listed in ``punct`` has been replaced by a single space.
    """
    # Characters to blank out. The backslash before the multiplication sign is
    # written as `\\` so the literal contains no invalid escape sequence; the
    # resulting string is the same set of characters as before.
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
    # A single translation table cleans each text in one pass instead of one
    # full-string `replace` per punctuation character.
    to_space = str.maketrans(dict.fromkeys(punct, ' '))

    # Fill missing values first: astype(str) alone would turn them into the
    # literal words "nan"/"None", which would later be tokenized as real words.
    texts = data.fillna('').astype(str)
    return texts.apply(lambda raw: raw.translate(to_space))

In [9]:
# Strip punctuation from the comment text of the training and test splits.
X_train, X_test = (clean_punc(split['comment_text']) for split in (train_df, test_df))

## Tokenize

In [10]:
# Vocabulary cap: passed to the Tokenizer as `num_words` and used as the
# input dimension of the Embedding layer.
max_words = 100000
# Fixed sequence length passed to `pad_sequences` (maxlen) and to the
# Embedding layer (input_length).
max_len = 220

In [11]:
# Build the vocabulary from the training comments only.
tokenizer = text.Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Training split: texts -> integer sequences -> fixed-length matrix.
sequences_text_train = tokenizer.texts_to_sequences(X_train)
pad_train = sequence.pad_sequences(sequences_text_train, maxlen=max_len)

# Test split: same fitted tokenizer, same target length.
sequences_text_test = tokenizer.texts_to_sequences(X_test)
pad_test = sequence.pad_sequences(sequences_text_test, maxlen=max_len)

## Embedding + LSTM layers

In [None]:
# model define
# NOTE(review): this cell has no execution count and the following cell
# rebinds `model` to a new Sequential, so the network built here is discarded
# on a top-to-bottom run — consider deleting this cell.
# Layout: embedding -> two stacked bidirectional LSTMs (input dropout 0.5,
# recurrent dropout 0.3, full sequences returned) -> flatten -> two
# Dense + BatchNormalization stages -> single sigmoid output.
model = models.Sequential()
model.add(layers.Embedding(max_words, 128, input_length=max_len))
model.add(layers.Bidirectional(layers.LSTM(64, dropout=0.5, recurrent_dropout=0.3, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(64, dropout=0.5, recurrent_dropout=0.3, return_sequences=True)))

model.add(layers.Flatten())
model.add(layers.Dense(32, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.Dense(16, activation='relu'))
model.add(layers.BatchNormalization())

model.add(layers.Dense(1, activation='sigmoid'))

In [12]:
# The network is declared as one ordered list of layers; Sequential adds them
# in list order.
model = models.Sequential([
    # Token ids -> 128-dimensional embedding vectors.
    layers.Embedding(max_words, 128, input_length=max_len),

    # Two bidirectional CuDNN LSTM stages that return the full sequence,
    # each followed by dropout and batch normalization.
    layers.Bidirectional(layers.CuDNNLSTM(64, return_sequences=True)),
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Bidirectional(layers.CuDNNLSTM(64, return_sequences=True)),
    layers.Dropout(0.5),
    layers.BatchNormalization(),

    # Flatten the per-timestep outputs and feed them to the dense head.
    layers.Flatten(),
    layers.Dense(32, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(16, activation='relu'),
    layers.BatchNormalization(),

    # Single sigmoid unit for the binary label.
    layers.Dense(1, activation='sigmoid'),
])

In [13]:
# Compile for binary classification: Adam optimizer, binary cross-entropy
# loss, accuracy reported as `acc`.
model.compile(optimizer='adam',
              loss='binary_crossentropy', metrics=['acc'])
# `summary()` prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
model.summary()

# Training callbacks, both driven by the validation loss.
callbacks_list = [
    # If val_loss has not improved for `patience` epochs, the learning rate
    # is multiplied by `factor` (new_lr = lr * factor).
    ReduceLROnPlateau(monitor='val_loss', patience=2, factor=0.2),
    # Write the model to best_model.h5 only when val_loss reaches a new best.
    ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True),
]

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 220, 128)          12800000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 220, 128)          99328     
_________________________________________________________________
dropout_1 (Dropout)          (None, 220, 128)          0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 220, 128)          512       
_________________________________________________________________
bidirectional_2 (Bidirection (None, 220, 128)          99328     
_________________________________________________________________
dropout_2 (Dropout)          (None, 220, 128)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 28160)             0         
__________

In [14]:
# Train for 5 epochs with batches of 1024 samples; 30% of the rows are held
# out for validation, which supplies the val_loss watched by the callbacks.
# NOTE(review): Keras documents `validation_split` as taking the held-out rows
# from the end of the arrays, before any shuffling — confirm that the row
# order of train.csv does not bias the validation set. No random seed is set,
# so results will differ between runs.
history = model.fit(pad_train, train_y_label,
                     epochs=5, batch_size=1024,
                     callbacks=callbacks_list, 
                     validation_split=0.3)

Train on 1263411 samples, validate on 541463 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

KeyboardInterrupt: 

## Predict on the test set

In [13]:
# Score the padded test comments with the in-memory model.
# NOTE(review): this uses whatever weights `model` currently holds, not the
# `best_model.h5` checkpoint written during training. The training cell shown
# above ended with a KeyboardInterrupt and this cell's execution count (13)
# repeats an earlier one, so the notebook was not run top to bottom — confirm
# which weights produced these predictions.
test_pred = model.predict(pad_test)

In [14]:
# Assemble the submission table in one step: the test ids next to their
# predicted scores. The prediction array is flattened to 1-D so it maps onto a
# single column.
sample_result = pd.DataFrame({
    'id': test_df.index,
    'prediction': np.ravel(test_pred),
})

In [15]:
# Write the predictions to submission.csv, omitting the DataFrame row index.
sample_result.to_csv('submission.csv', index=False)