In [None]:
import os
import warnings

import numpy as np
import pandas as pd
from gensim.models import Word2Vec, KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, precision_score, log_loss, accuracy_score
from sklearn.preprocessing import LabelBinarizer

import keras.backend as K
import keras.utils
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from keras.initializers import Constant
from keras.layers import Input, Dropout, Dense, LSTM,  Embedding, \
GlobalAveragePooling1D, Bidirectional, GlobalMaxPooling1D, SpatialDropout1D
from keras.layers.merge import concatenate
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Pre-trained word vectors saved in gensim's native format.
W2V_MODEL_PATH = "train.model"
w2v = KeyedVectors.load(W2V_MODEL_PATH)

In [None]:
def model_rnn(num_words, emb_dim, emb_matrix, max_len, rec_units=128, num_labels=6):
    """Build and compile a stacked bidirectional-LSTM multi-label classifier.

    Architecture: frozen pre-trained embedding -> spatial dropout ->
    BiLSTM -> spatial dropout -> BiLSTM -> (global max pool || global
    average pool) -> dropout -> dense ReLU -> dense sigmoid.

    Args:
        num_words: Number of vocabulary entries; the embedding layer is
            created with ``num_words + 1`` rows.
        emb_dim: Width of each embedding vector.
        emb_matrix: Initial embedding weights. Passed as the sole weight
            of an ``Embedding(num_words + 1, emb_dim)`` layer, so it must
            have shape ``(num_words + 1, emb_dim)``.
        max_len: Length of every (padded) input sequence.
        rec_units: Units per LSTM direction and width of the hidden dense
            layer. Defaults to 128.
        num_labels: Number of independent sigmoid outputs. Defaults to 6.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam
        optimizer, accuracy metric). Its summary is printed as a side
        effect.
    """
    input_layer = Input(shape=(max_len,))

    # The embedding weights are frozen: only the recurrent/dense layers train.
    emb_layer = Embedding(num_words + 1,
                          emb_dim,
                          weights=[emb_matrix],
                          input_length=max_len,
                          trainable=False)(input_layer)
    emb_layer = SpatialDropout1D(0.2)(emb_layer)

    # Both recurrent layers return full sequences so that pooling below
    # can aggregate over every timestep.
    layer = Bidirectional(LSTM(rec_units, return_sequences=True))(emb_layer)
    layer = SpatialDropout1D(0.2)(layer)
    layer = Bidirectional(LSTM(rec_units, return_sequences=True))(layer)

    # Summarise the sequence with both max and mean over time.
    maxpool = GlobalMaxPooling1D()(layer)
    average = GlobalAveragePooling1D()(layer)
    concat = concatenate([maxpool, average], axis=1)

    layer = Dropout(0.4)(concat)
    layer = Dense(rec_units, activation="relu")(layer)
    output_layer = Dense(num_labels, activation="sigmoid")(layer)

    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # summary() prints the table itself and returns None; wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()
    return model

In [None]:
# Names of the six binary target columns, shared by the train and test labels.
types = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

# Every CSV is read with its `id` column as the index.
train, test_X, test_y = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('processed_train.csv', 'processed_test.csv', 'test_labels.csv')
)

# Targets for both splits, restricted to the label columns above.
# NOTE(review): later cells compare predictions to `test_y` positionally,
# which presumably relies on test_X and test_y sharing the same row order
# -- confirm against the input files.
y = train[types]
test_y = test_y[types]

# Model input: the comment text only (kept as a one-column DataFrame).
X = train[['comment_text']]

In [None]:
# One embedding row per word2vec entry plus an extra row 0, which stays
# all-zero because word indices are assigned starting from 1.
emb_matrix = np.zeros((w2v.vectors.shape[0] + 1, w2v.vectors.shape[1]))

# voc maps each word to its row in emb_matrix.
voc = {}
for row, token in enumerate(w2v.index2word, start=1):
    voc[token] = row
    emb_matrix[row] = w2v[token]

In [None]:
def encode_comments(comments, vocabulary):
    """Turn each comment into a list of vocabulary indices.

    Comments are split on whitespace; tokens absent from ``vocabulary``
    are dropped.

    Args:
        comments: Iterable of strings.
        vocabulary: Mapping from token to integer index.

    Returns:
        A list with one list of integer indices per comment.
    """
    return [
        [vocabulary[token] for token in text.split() if token in vocabulary]
        for text in comments
    ]


# Plain lists are used instead of `pd.np.array(...)`: the `pd.np` alias is
# deprecated and no longer exists in pandas 2.0, and pad_sequences accepts
# a list of lists directly.
train_sequences = encode_comments(X['comment_text'], voc)
test_sequences = encode_comments(test_X['comment_text'], voc)

# Pad everything to the longest *training* sequence (0 if there are none).
max_len = max((len(seq) for seq in train_sequences), default=0)

# Test sequences longer than max_len are truncated by pad_sequences.
train_features = pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_features = pad_sequences(test_sequences, maxlen=max_len, padding='post')

In [None]:
# Vocabulary size and embedding width are both read off the word2vec matrix.
vocab_size, vector_dim = w2v.vectors.shape
model = model_rnn(vocab_size, vector_dim, emb_matrix, max_len)

In [None]:
# The checkpoint path includes val_loss, which is available because fit()
# below holds out 20% of the data via validation_split.
# The target directory must exist before the callback tries to write to it.
os.makedirs("save", exist_ok=True)

# The variable was previously spelled with a Cyrillic "с" and was never
# handed to fit(), so no checkpoint was ever written. It is now an ASCII
# name and is registered through `callbacks`.
checkpoint = ModelCheckpoint("save/weights_lstm.{epoch:02d}-{val_loss:.2f}.hdf5")
epochs = 3

# `y.values` replaces `pd.np.array(y)`: the `pd.np` alias is deprecated and
# no longer exists in pandas 2.0.
model.fit(train_features, y.values, epochs=epochs, verbose=1,
          validation_split=0.2, batch_size=128, callbacks=[checkpoint])

In [None]:
# Persist the architecture (JSON) and the weights (HDF5) as two files.
model_json = model.to_json()

# `with` guarantees the file is closed even if write() raises.
with open("my_model1.json", "w") as json_file:
    json_file.write(model_json)

model.save_weights("my_model1.h5")

In [None]:
# Score both splits with the current model; one probability column per label.
inference_batch = 256

print('Test')
pred_proba = model.predict(test_features, batch_size=inference_batch)

print('Train')
pred_proba_train = model.predict(train_features, batch_size=inference_batch)

In [None]:
# Per-label ROC AUC and log loss on the test split, then their means.
print('Test:')
roc_auc_per_label = []
log_loss_per_label = []
for column, label in enumerate(types):
    print(label, ':')
    # Column `column` of the prediction matrix belongs to `label`.
    label_proba = pred_proba[:, column]
    label_auc = roc_auc_score(test_y[label], label_proba)
    label_loss = log_loss(test_y[label], label_proba, eps = 1e-7)
    roc_auc_per_label.append(label_auc)
    log_loss_per_label.append(label_loss)
    print('roc_auc:', label_auc)
    print('log_loss:',label_loss)

score_ra = sum(roc_auc_per_label)
score_ll = sum(log_loss_per_label)
print("Score roc_auc:", score_ra/len(types))
print("Score log_loss:", score_ll/len(types))

In [None]:
# Per-label ROC AUC and log loss on the training split, then their means.
print('Train:')
roc_auc_per_label = []
log_loss_per_label = []
for column, label in enumerate(types):
    print(label, ':')
    # Column `column` of the prediction matrix belongs to `label`.
    label_proba = pred_proba_train[:, column]
    label_auc = roc_auc_score(y[label], label_proba)
    label_loss = log_loss(y[label], label_proba, eps = 1e-7)
    roc_auc_per_label.append(label_auc)
    log_loss_per_label.append(label_loss)
    print('roc_auc:', label_auc)
    print('log_loss:',label_loss)

score_ra = sum(roc_auc_per_label)
score_ll = sum(log_loss_per_label)
print("Score roc_auc:", score_ra/len(types))
print("Score log_loss:", score_ll/len(types))

In [None]:
# Continue training the same model for another round of epochs.
# The checkpoint path includes val_loss, which is available because fit()
# below holds out 20% of the data via validation_split.
# The target directory must exist before the callback tries to write to it.
os.makedirs("save", exist_ok=True)

# The variable was previously spelled with a Cyrillic "с" and was never
# handed to fit(), so no checkpoint was ever written. It is now an ASCII
# name and is registered through `callbacks`.
checkpoint = ModelCheckpoint("save/weights_lstm2.{epoch:02d}-{val_loss:.2f}.hdf5")
epochs = 3

# `y.values` replaces `pd.np.array(y)`: the `pd.np` alias is deprecated and
# no longer exists in pandas 2.0.
model.fit(train_features, y.values, epochs=epochs, verbose=1,
          validation_split=0.2, batch_size=128, callbacks=[checkpoint])

In [None]:
# Re-score both splits after the additional training round.
inference_batch = 256

print('Test')
pred_proba = model.predict(test_features, batch_size=inference_batch)

print('Train')
pred_proba_train = model.predict(train_features, batch_size=inference_batch)

In [None]:
# Per-label ROC AUC and log loss on the test split, then their means.
print('Test:')
roc_auc_per_label = []
log_loss_per_label = []
for column, label in enumerate(types):
    print(label, ':')
    # Column `column` of the prediction matrix belongs to `label`.
    label_proba = pred_proba[:, column]
    label_auc = roc_auc_score(test_y[label], label_proba)
    label_loss = log_loss(test_y[label], label_proba, eps = 1e-7)
    roc_auc_per_label.append(label_auc)
    log_loss_per_label.append(label_loss)
    print('roc_auc:', label_auc)
    print('log_loss:',label_loss)

score_ra = sum(roc_auc_per_label)
score_ll = sum(log_loss_per_label)
print("Score roc_auc:", score_ra/len(types))
print("Score log_loss:", score_ll/len(types))

In [None]:
# Per-label ROC AUC and log loss on the training split, then their means.
print('Train:')
roc_auc_per_label = []
log_loss_per_label = []
for column, label in enumerate(types):
    print(label, ':')
    # Column `column` of the prediction matrix belongs to `label`.
    label_proba = pred_proba_train[:, column]
    label_auc = roc_auc_score(y[label], label_proba)
    label_loss = log_loss(y[label], label_proba, eps = 1e-7)
    roc_auc_per_label.append(label_auc)
    log_loss_per_label.append(label_loss)
    print('roc_auc:', label_auc)
    print('log_loss:',label_loss)

score_ra = sum(roc_auc_per_label)
score_ll = sum(log_loss_per_label)
print("Score roc_auc:", score_ra/len(types))
print("Score log_loss:", score_ll/len(types))