In [17]:
import pandas as pd
import numpy as np
import re, string
from keras.models import Model
from keras.layers import Dense, Input, Dropout, Embedding, concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from scipy.sparse import csr_matrix, hstack

In [18]:
# Directory holding the pre-cleaned competition CSV files.
PATH = '../data/'

print('reading data')

# Load both splits in one pass; the file names are relative to PATH.
train, test = (pd.read_csv(PATH + name) for name in ('cleaned_train.csv', 'cleaned_test.csv'))

print('data cleaning')

# Shared tokenizer and English stop-word set used by the helpers below.
stopword = set(stopwords.words("english"))
tok = TweetTokenizer()

def clean(comment):
    """Tokenize a comment, drop English stop words and re-join with spaces.

    Parameters
    ----------
    comment : str
        Raw comment text. Non-string values (e.g. the float NaN that pandas
        produces for an empty CSV field, or None) are treated as empty.

    Returns
    -------
    str
        The remaining tokens joined by single spaces, or the placeholder
        'na' when nothing is left.
    """
    # Guard against missing values: the tokenizer only accepts strings.
    if not isinstance(comment, str):
        return 'na'
    kept = [w for w in tok.tokenize(comment) if w not in stopword]
    # An empty result would give the vectorizers nothing to work with,
    # so substitute the same placeholder token.
    return ' '.join(kept) or 'na'

# Strip stop words from the already-cleaned text column of both splits.
for frame in (train, test):
    frame['comment_text_cleaned'] = frame['comment_text_cleaned'].apply(clean)

print('calculations')

# Convenience handles on the text column of each split.
train_sentence = train['comment_text_cleaned']
test_sentence = test['comment_text_cleaned']

def f(x):
    """Return how many tokens the shared tokenizer `tok` produces for `x`."""
    return len(tok.tokenize(x))

# Combined train + test corpus; the vectorizers are fitted on this.
text = pd.concat([train_sentence, test_sentence])

# Report (rows, columns) of each split.
for frame in (train, test):
    print(frame.shape)

reading data
data cleaning
calculations
(159571, 10)
(153164, 4)


In [19]:
print('getting tfidf')
# Word bi-gram / tri-gram TF-IDF features, capped at 3000 terms.
# Boolean options are passed as real booleans: integer 1 is not a valid
# value for these parameters under scikit-learn's parameter validation.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 3), stop_words='english', max_df=0.9, min_df=100,
                                   strip_accents='unicode', use_idf=True, smooth_idf=True, sublinear_tf=True,
                                   max_features=3000)

# NOTE: despite its name this vectorizer keeps the default word analyzer with
# ngram_range=(1, 1), i.e. it produces word-unigram features, not characters.
tfidf_char = TfidfVectorizer(ngram_range=(1, 1), stop_words='english', max_df=0.9, min_df=100,
                             strip_accents='unicode', use_idf=True, smooth_idf=True, sublinear_tf=True,
                             max_features=3000)

print('fitting')
# The vocabulary and IDF weights are learned on train and test text together.
tfidf_vectorizer.fit(text.values)
tfidf_char.fit(text.values)
print('transforming train')
train_tfidf_vec = tfidf_vectorizer.transform(train['comment_text_cleaned'].values)
train_tfidf_char = tfidf_char.transform(train['comment_text_cleaned'].values)
print('transforming test')
test_tfidf_vec = tfidf_vectorizer.transform(test['comment_text_cleaned'].values)
test_tfidf_char = tfidf_char.transform(test['comment_text_cleaned'].values)
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Display a matrix that exists at this point; the stacked `train_tfidf` is
# only built in a later cell, so referencing it here fails on a fresh kernel.
train_tfidf_vec

getting tfidf
fitting
transforming train
transforming test


<159571x3000 sparse matrix of type '<class 'numpy.float64'>'
	with 705757 stored elements in Compressed Sparse Row format>

In [27]:
# Place the two TF-IDF blocks side by side for each split and convert the
# result to CSR format.
train_tfidf, test_tfidf = (
    hstack(blocks).tocsr()
    for blocks in ([train_tfidf_vec, train_tfidf_char], [test_tfidf_vec, test_tfidf_char])
)

In [28]:
# Sanity check: shape and container type of the stacked training matrix.
print(train_tfidf.shape, type(train_tfidf), sep='\n')

(159571, 6000)
<class 'scipy.sparse.csr.csr_matrix'>


In [29]:
def pr(y_i, y):
    """Smoothed per-feature mean of `train_tfidf` over rows whose label equals `y_i`.

    Parameters
    ----------
    y_i : int
        Label value selecting the rows (called with 1 and 0 in this notebook).
    y : numpy.ndarray
        One label per row of the module-level `train_tfidf` matrix.

    Returns
    -------
    numpy.matrix
        A single-row matrix with one entry per feature column:
        (column sum over the selected rows + 1) / (number of selected rows + 1).
    """
    # Compute the row mask once; the original also built and discarded an
    # extra masked copy of the matrix, which was pure wasted work.
    mask = y == y_i
    feature_sums = train_tfidf[mask].sum(0) + 1
    return feature_sums / (mask.sum() + 1)

# One scaled feature matrix per label (train and test), plus the label vectors.
train_input, y, test_input = [], [], []

for j in label_cols:
    print('set for ' + j)
    tmp_y = train[j].values
    y.append(tmp_y)
    # Log ratio of the smoothed feature means for positive vs. negative rows.
    r = np.log(pr(1, tmp_y) / pr(0, tmp_y))
    # Scale every feature column by its log ratio, for both splits.
    for features, bucket in ((train_tfidf, train_input), (test_tfidf, test_input)):
        bucket.append(features.multiply(r).tocsr())

print('done')

set for toxic
set for severe_toxic
set for obscene
set for threat
set for insult
set for identity_hate
done


In [30]:
# Target frame: one column per entry of label_cols.
label = train[label_cols]
# Rate passed to the Dropout layer of the combined model.
dropout_rate = 0.2
def get_micro_model(x, i):
    """Build one small dense branch sized for the i-th matrix in `x`.

    Parameters
    ----------
    x : sequence
        Feature matrices; only ``x[i].shape[1]`` (the column count) is read.
    i : int
        Index of the matrix this branch will consume.

    Returns
    -------
    tuple
        ``(branch_output, branch_input)``: the 5-unit softmax output tensor
        and the sparse Input layer feeding it.
    """
    n_features = x[i].shape[1]
    branch_input = Input(shape=(n_features,), sparse=True)
    hidden = Dense(20, activation='elu')(branch_input)
    hidden = Dense(20, activation='elu')(hidden)
    branch_output = Dense(5, activation='softmax')(hidden)
    return branch_output, branch_input

def get_model():
    """Assemble and compile the multi-input network.

    One branch from `get_micro_model` is created per entry of `label_cols`
    (each sized from the matching matrix in `train_input`). The branch outputs
    are concatenated and passed through two 50-unit ELU layers and dropout
    before the output layer.

    Returns
    -------
    keras.models.Model
        Compiled model taking a list of ``len(label_cols)`` inputs and
        producing ``len(label_cols)`` independent probabilities per sample.
    """
    branch_outputs = []
    branch_inputs = []
    for i in range(len(label_cols)):
        out, inp = get_micro_model(train_input, i)
        branch_outputs.append(out)
        branch_inputs.append(inp)

    hidden = Dense(50, activation='elu')(concatenate(branch_outputs))
    hidden = Dense(50, activation='elu')(hidden)
    hidden = Dropout(dropout_rate)(hidden)
    # The targets are independent binary labels trained with
    # binary_crossentropy, so each output unit needs its own sigmoid.
    # A softmax would force the six probabilities to sum to 1 and could
    # never represent "no label" or several labels at once.
    predictions = Dense(len(label_cols), activation='sigmoid')(hidden)

    model = Model(inputs=branch_inputs, outputs=predictions)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

def run_model(model):
    """Fit `model` on the module-level training inputs and predict the test set.

    Trains on `train_input` / `label.values` with 10% of the data held out for
    validation, checkpointing the weights with the lowest validation loss to
    "weights_base.best.hdf5". Those weights are reloaded before predicting.

    Parameters
    ----------
    model : keras model
        A compiled model accepting the list of inputs in `train_input`.

    Returns
    -------
    The result of ``model.predict`` on `test_input`.
    """
    file_path = "weights_base.best.hdf5"
    callbacks_list = [
        ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min'),
        EarlyStopping(monitor="val_loss", mode="min", patience=20),
    ]
    model.fit(
        train_input,
        label.values,
        batch_size=32,
        epochs=2,
        validation_split=0.1,
        callbacks=callbacks_list,
    )
    # Restore the best checkpoint rather than the last-epoch weights.
    model.load_weights(file_path)
    return model.predict(test_input, verbose=1)

# Train several independently initialised models and average their predictions.
model_num = 5
for i in range(model_num):
    print(str(i) + ' run')
    run_prediction = run_model(get_model())
    if i:
        pre += run_prediction
    else:
        # First run seeds the accumulator.
        pre = run_prediction

y_test = pre / model_num
print('finished')

0 run
Train on 143613 samples, validate on 15958 samples
Epoch 1/2
 19168/143613 [===>..........................] - ETA: 1:27 - loss: 0.2268 - acc: 0.9636

KeyboardInterrupt: 

In [None]:
# Column order expected in the submission file.
label_cols_ini = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
file_name = 'baseline1.csv'

# Use the PATH constant defined with the data-loading code; the lower-case
# name `path` is never defined and raised a NameError.
sample_submission = pd.read_csv(PATH + 'sample_submission.csv')

# Fill the label columns with the averaged predictions.
sample_submission[label_cols_ini] = y_test[:, : len(label_cols_ini)]

sample_submission.to_csv(PATH + file_name, index=False)

print('done')