In [1]:
import pandas as pd
import numpy as np
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

# read data

path = '../data/'

train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Optional extra label for comments that carry none of the labels above
# (left disabled):
# train['other'] = 1 - train[label_cols].max(axis=1)
# label_cols.append('other')

print(label_cols)

# Replace missing comments with a placeholder string. The filled column is
# assigned back explicitly: calling fillna(..., inplace=True) on a column
# selection is a chained in-place operation, which newer pandas versions warn
# about and which does not modify the parent frame under Copy-on-Write.
train['comment_text'] = train['comment_text'].fillna("unknown")
test['comment_text'] = test['comment_text'].fillna("unknown")

print(train.head())
print(train.shape[0])

Using TensorFlow backend.


['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
         id                                       comment_text  toxic  \
0  22256635  Nonsense?  kiss off, geek. what I said is true...      1   
1  27450690  "\n\n Please do not vandalize pages, as you di...      0   
2  54037174  "\n\n ""Points of interest"" \n\nI removed the...      0   
3  77493077  Asking some his nationality is a Racial offenc...      0   
4  79357270  The reader here is not going by my say so for ...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
95851


In [2]:
# Tokenizer vocabulary limit and the fixed length every sequence is padded to.
max_features = 20000
maxlen = 200

# Raw comment strings and the label matrix, pulled out of the data frames.
list_sentences_train = train['comment_text'].values
list_sentences_test = test["comment_text"].values
y = train[label_cols].values

# The tokenizer is fitted on the training comments only and then applied to
# both the training and the test comments.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

# Bring every token-id sequence to the same length (`maxlen`).
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

print(len(X_t))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [3]:
# Hyper-parameters read by get_model() below.
embed_size = 128     # output dimension of the Embedding layer
LSTM_units = 50      # units of the LSTM wrapped by Bidirectional
dense_units = 50     # units of the hidden Dense (relu) layer
dropout_rate = 0.1   # rate used by both Dropout layers

def get_model():
    """Build and compile the bidirectional-LSTM multi-label classifier.

    The architecture is: Embedding -> Bidirectional(LSTM) returning the full
    sequence -> global max pooling over time -> Dropout -> Dense(relu) ->
    Dropout -> Dense(sigmoid) with one output per entry of ``label_cols``.

    Reads the module-level settings ``maxlen``, ``max_features``,
    ``embed_size``, ``LSTM_units``, ``dense_units``, ``dropout_rate`` and
    ``label_cols``.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, the Adam
        optimizer and the ``accuracy`` metric.
    """
    # Named `inputs` (not `input`) so the Python builtin is not shadowed.
    inputs = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inputs)
    x = Bidirectional(LSTM(LSTM_units, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(dense_units, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    # Sigmoid (not softmax): each label is predicted independently.
    x = Dense(len(label_cols), activation="sigmoid")(x)
    model = Model(inputs=inputs, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

model = get_model()

batch_size = 32
epochs = 2

# Checkpoint file; with save_best_only=True it is only written when the
# monitored validation loss improves.
file_path = "weights_base.best.hdf5"
checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# NOTE(review): patience (20) is larger than `epochs` (2), so with the current
# settings this callback cannot end the run early.
earlystopping = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=20,
)

callbacks_list = [checkpoint, earlystopping]
model.fit(
    X_t,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=callbacks_list,
)

# Reload the checkpointed weights before predicting on the test set.
model.load_weights(file_path)

y_test = model.predict(X_test)

print('done')

Train on 86265 samples, validate on 9586 samples
Epoch 1/2
Epoch 2/2
done


In [4]:
# The six labels expected in the submission file, independent of any extra
# label that may have been appended to `label_cols`.
label_cols_ini = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Reuse the shared data directory instead of repeating the literal path.
sample_submission = pd.read_csv(path + 'sample_submission.csv')

# Keep only the leading columns that correspond to `label_cols_ini`. With the
# six default labels this slice is a no-op; if the optional trailing 'other'
# label is enabled in `label_cols`, the model emits one extra last column,
# which is dropped here.
sample_submission[label_cols_ini] = y_test[:, :len(label_cols_ini)]

sample_submission.to_csv("baseline.csv", index=False)

print('done')

done
