In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import optimizers
from sklearn.metrics import roc_auc_score, precision_score, log_loss, accuracy_score
from keras.layers import  Dense, GRU, Dropout, Activation, LSTM, Flatten, Embedding
import keras.utils
import keras.backend as K
import keras_metrics as km
import pandas as pd

Using TensorFlow backend.


In [2]:
def model_rnn(num_words, length):
    model = Sequential()
    model.add(Embedding(num_words + 1, output_dim = 32, input_length = length))
    model.add(LSTM(units = 32,return_sequences=True,recurrent_dropout=0.1 )) 
    model.add(Flatten())
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    adam_optimizer = optimizers.Adam(lr=1e-3, decay=1e-7)
    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer,
              metrics=[km.precision()])

    print(model.summary())
    return model

In [3]:
train = pd.read_csv('../datasets/processed_train.csv', index_col='id')
test_X = pd.read_csv('../datasets/processed_test.csv', index_col='id')
test_y = pd.read_csv('../datasets/test_labels.csv', index_col='id') 
X = train[['comment_text']]
y = train[['toxic']]

In [4]:
num_words = 40000
max_len = 0
tfidf_word = TfidfVectorizer(  
    analyzer='word',
    max_df = 0.1, 
    ngram_range=(1, 2),
    max_features=num_words)
tfidf_word.fit(X['comment_text'])

dict_words = tfidf_word.vocabulary_

In [5]:
train_features = pd.np.array(X['comment_text'].apply(lambda x: [dict_words[item] for item in x.split() 
                                                        if item in dict_words.keys()]))

test_features = pd.np.array(test_X['comment_text'].apply(lambda x: [dict_words[item] for item in x.split() 
                                                        if item in dict_words.keys()]))

for arr in train_features:
    max_len = max(max_len, len(arr))
    
train_features = pad_sequences(train_features, maxlen=max_len, padding='post')
    
test_features = pad_sequences(test_features, maxlen=max_len, padding='post')


In [6]:
model = model_rnn(num_words, max_len)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1250, 32)          1280032   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1250, 32)          8320      
_________________________________________________________________
flatten_1 (Flatten)          (None, 40000)             0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 40000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 40001     
Total params: 1,328,353
Trainable params: 1,328,353
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
epochs = 3
model.fit(train_features , y, epochs = epochs, verbose=1, validation_split=0.2, batch_size=256)

Train on 41335 samples, validate on 10334 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x12760a8d0>

In [8]:
pred_y = model.predict_classes(test_features, batch_size= 256)
pred_proba = model.predict(test_features, batch_size= 256)

In [9]:
print('Test:')
print('roc_auc:', roc_auc_score(test_y, pred_y))
print('precision', precision_score(test_y, pred_y))
print('log_loss:',log_loss(test_y, pred_proba, eps = 1e-7))

Test:
roc_auc: 0.8867789044107055
precision 0.7313019390581718
log_loss: 0.35943097409067776


In [10]:
epochs = 2
model.fit(train_features , y, epochs = epochs, verbose=1, validation_split=0.2, batch_size=256)

Train on 41335 samples, validate on 10334 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x128f8a438>

In [11]:
pred_y = model.predict_classes(test_features, batch_size= 256)
pred_proba = model.predict(test_features, batch_size= 256)

In [12]:
print('Test:')
print('roc_auc:', roc_auc_score(test_y, pred_y))
print('precision', precision_score(test_y, pred_y))
print('log_loss:',log_loss(test_y, pred_proba, eps = 1e-7))

Test:
roc_auc: 0.8807071158092029
precision 0.6951249690670626
log_loss: 0.46963368444337644
