In [1]:
import pandas as pd
import numpy as np
import re, string
from keras.models import Model
from keras.layers import Dense, Input, Dropout, Embedding, concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# read data

# Directory holding the competition CSVs; later cells reuse `path` as a prefix.
path = '../data/'

train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')

# Target columns; each is a separate 0/1 flag in train.csv.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# train['other'] = 1 - train[label_cols].max(axis=1)
# label_cols.append('other')

print(label_cols)

# Assign the filled column back instead of calling fillna(inplace=True) on
# the selected column: the in-place form operates on an intermediate object
# and is not guaranteed to update the parent frame (it is a no-op under
# pandas Copy-on-Write).
train['comment_text'] = train['comment_text'].fillna("unknown")
test['comment_text'] = test['comment_text'].fillna("unknown")

print(train.head())
print(train.shape[0])

Using TensorFlow backend.


['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
         id                                       comment_text  toxic  \
0  22256635  Nonsense?  kiss off, geek. what I said is true...      1   
1  27450690  "\n\n Please do not vandalize pages, as you di...      0   
2  54037174  "\n\n ""Points of interest"" \n\nI removed the...      0   
3  77493077  Asking some his nationality is a Racial offenc...      0   
4  79357270  The reader here is not going by my say so for ...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
95851


In [16]:
# get tfidf

# Ordered (pattern, replacement) rewrite rules applied by tokenize().
# Order matters: e.g. "what's" must be expanded before the generic "'s" rule,
# and "e - mail" only exists after "-" has been padded with spaces.
_TOKENIZE_RULES = [(re.compile(_pattern), _repl) for _pattern, _repl in (
    # Replace every character outside the allowed set with a space.
    # NOTE(review): "+-=" inside the class is a *range* (0x2B-0x3D), so it
    # also admits ":", ";" and "<" -- kept as-is to preserve behaviour.
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    # Expand common contractions.
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    # Drop or pad the punctuation that survived the first rule.
    (r",", " "),
    (r"\.", " "),
    (r"!", " "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    # "10k" -> "10000".
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    # Re-join abbreviations that the punctuation rules split apart.
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # NOTE(review): "\0" is the NUL character in a regex, so this matches
    # NUL + "s"; NUL is already removed by the first rule, making this rule a
    # no-op. It was presumably meant to be "0s" -- confirm before changing,
    # since that would alter the extracted features.
    (r"\0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    # Collapse runs of whitespace.
    (r"\s{2,}", " "),
    # Finally blank out all remaining punctuation, a few extra symbols and
    # digits. re.escape keeps the punctuation literal inside the character
    # class (no accidental ranges, nested-set warnings or invalid escapes).
    ("[" + re.escape(string.punctuation) + "¨«»®´·º½¾¿¡§£₤‘’" + r"\d" + "]", " "),
)]


def tokenize(text):
    """Normalise a comment and split it into lowercase word tokens.

    The text is lowercased, whitespace-normalised, and then passed through
    the ordered rewrite rules in ``_TOKENIZE_RULES`` (contraction expansion,
    punctuation handling, a few abbreviation fixes, and finally removal of
    all punctuation and digits).

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    list of str
        Whitespace-separated tokens left after all rewrites; empty list for
        empty or punctuation-only input.
    """
    text = " ".join(text.lower().split())
    for pattern, replacement in _TOKENIZE_RULES:
        text = pattern.sub(replacement, text)
    return text.split()

# Reference signatures (library defaults), kept here for quick lookup:
# CountVectorizer(input='content', encoding='utf-8', decode_error='strict', strip_accents=None,
#                 lowercase=True, preprocessor=None, tokenizer=None, stop_words=None,
#                 token_pattern='(?u)\b\w\w+\b', ngram_range=(1, 1), analyzer='word', max_df=1.0,
#                 min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>)

# TfidfVectorizer(input='content', encoding='utf-8', decode_error='strict', strip_accents=None,
#                 lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None,
#                 token_pattern='(?u)\b\w\w+\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None,
#                 vocabulary=None, binary=False, dtype=<class 'numpy.int64'>, norm='l2', use_idf=True,
#                 smooth_idf=True, sublinear_tf=False)

# Binary-term TF-IDF over 1- to 4-grams produced by the custom tokenize(),
# capped at the 50,000 most frequent features.
# use_idf is passed as a real boolean: recent scikit-learn releases validate
# parameter types and reject the integer 1.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 4), stop_words='english', tokenizer=tokenize,
                                   max_df=0.9, strip_accents='unicode', use_idf=True,
                                   max_features=50000, binary=True)

# Fit the vocabulary on the training comments only, then reuse it for test.
train_tfidf_original = tfidf_vectorizer.fit_transform(train['comment_text'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both and keep `tfidf_name` a plain list.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_name = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_name = tfidf_vectorizer.get_feature_names()

test_tfidf_original = tfidf_vectorizer.transform(test['comment_text'])
label = train[label_cols]

print(train_tfidf_original.shape)
print(test_tfidf_original.shape)

(95851, 50000)
(226998, 50000)


In [17]:
# Embedding

# Optional chi-squared feature selection (currently disabled):
# num = 10000
# selector = SelectKBest(chi2, num)
# train_tfidf = selector.fit_transform(train_tfidf_original, label)
# test_tfidf = selector.transform(test_tfidf_original)

# With selection disabled, the downstream cells work on the full TF-IDF
# matrices under the names `train_tfidf` / `test_tfidf`.
train_tfidf, test_tfidf = train_tfidf_original, test_tfidf_original
# label = label.iloc[ : 64]

print('done')

done


In [18]:
def pr(y_i, y, features=None):
    """Smoothed per-feature mean over the rows whose label equals ``y_i``.

    Computes ``(column sums of the selected rows + 1) / (number of selected
    rows + 1)``; the ratio ``pr(1, y) / pr(0, y)`` is what the notebook uses
    as the per-feature log-count ratio.

    Parameters
    ----------
    y_i : int
        Label value to select (the notebook uses 0 and 1).
    y : numpy.ndarray
        One label per row of ``features``.
    features : matrix-like, optional
        Row-indexable 2-D matrix supporting boolean-mask row selection and
        ``.sum(0)``. Defaults to the notebook-level ``train_tfidf`` so
        existing two-argument calls keep working.

    Returns
    -------
    One value per feature column (type follows ``features[mask].sum(0)``).
    """
    if features is None:
        # Looked up at call time so the function still follows whatever
        # `train_tfidf` currently refers to in the notebook.
        features = train_tfidf
    mask = (y == y_i)
    return (features[mask].sum(0) + 1) / (mask.sum() + 1)

# One scaled copy of the TF-IDF features per label: every column is weighted
# by that label's log-count ratio log(pr(1) / pr(0)) computed on the training
# data, and the same weights are applied to the test matrix.
train_input, y, test_input = [], [], []

for label_name in label_cols:
    print('set for ' + label_name)
    targets = label[label_name].values
    y.append(targets)
    log_ratio = np.log(pr(1, targets) / pr(0, targets))
    for scaled_list, matrix in ((train_input, train_tfidf), (test_input, test_tfidf)):
        scaled_list.append(matrix.multiply(log_ratio).tocsr())

print('done')

set for toxic
set for severe_toxic
set for obscene
set for threat
set for insult
set for identity_hate
done


In [20]:
# Dropout probability shared by every Dropout layer in the network.
dropout_rate = 0.2

def get_micro_model(x, i):
    """Build one per-label branch of the network.

    Parameters
    ----------
    x : list
        Per-label feature matrices; only ``x[i].shape[1]`` is read, to size
        the sparse input layer.
    i : int
        Index of the branch / label.

    Returns
    -------
    tuple
        ``(branch_output, branch_input)``: the 5-unit softmax output tensor
        and the sparse ``Input`` feeding it.
    """
    n_features = x[i].shape[1]
    branch_input = Input(shape=(n_features,), sparse=True)

    # Two 20-unit ELU layers, then dropout, then a small softmax bottleneck.
    hidden = branch_input
    for _ in range(2):
        hidden = Dense(20, activation='elu')(hidden)
    hidden = Dropout(dropout_rate)(hidden)
    branch_output = Dense(5, activation='softmax')(hidden)
    return branch_output, branch_input

def get_model():
    """Build and compile the full multi-input, multi-label classifier.

    One branch from ``get_micro_model`` is created per entry in
    ``label_cols`` (each fed by the matching matrix in ``train_input``); the
    branch outputs are concatenated and passed through three 200-unit ELU
    layers and dropout before the output layer.

    Returns
    -------
    keras.models.Model
        Compiled model with ``len(label_cols)`` inputs and one output of
        width ``len(label_cols)``.
    """
    branch_outputs = []
    branch_inputs = []
    for i in range(len(label_cols)):
        out, inp = get_micro_model(train_input, i)
        branch_outputs.append(out)
        branch_inputs.append(inp)

    x = Dense(200, activation='elu')(concatenate(branch_outputs))
    x = Dense(200, activation='elu')(x)
    x = Dense(200, activation='elu')(x)
    x = Dropout(dropout_rate)(x)
    # The labels are independent 0/1 flags (a row can have none or several
    # set), so each output unit needs its own sigmoid. A softmax would force
    # the six probabilities to sum to 1, which contradicts both the data and
    # the binary_crossentropy loss used below.
    x = Dense(len(label_cols), activation='sigmoid')(x)

    model = Model(inputs=branch_inputs, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

def run_model(model):
    """Train ``model`` on the notebook-level training data and predict on test.

    Fits on ``train_input`` / ``label.values`` with a 10% validation split,
    checkpointing the weights whenever ``val_loss`` improves, then reloads
    that checkpoint before predicting on ``test_input``.

    Parameters
    ----------
    model : keras.models.Model
        Compiled model, e.g. from ``get_model()``.

    Returns
    -------
    Predictions returned by ``model.predict(test_input)``.
    """
    weights_file = "weights_base.best.hdf5"
    callbacks_list = [
        ModelCheckpoint(weights_file, monitor='val_loss', verbose=1, save_best_only=True, mode='min'),
        EarlyStopping(monitor="val_loss", mode="min", patience=20),
    ]
    model.fit(
        train_input,
        label.values,
        batch_size=32,
        epochs=2,
        validation_split=0.1,
        callbacks=callbacks_list,
    )
    # Predict with the best checkpoint rather than the last-epoch weights.
    model.load_weights(weights_file)
    return model.predict(test_input, verbose=1)

# Train `model_num` freshly initialised models and average their test
# predictions.
model_num = 5
pre = None
for i in range(model_num):
    print(str(i) + ' run')
    run_pred = run_model(get_model())
    pre = run_pred if pre is None else pre + run_pred

y_test = pre / model_num
print('finished')

0 run
Train on 86265 samples, validate on 9586 samples
Epoch 1/2
 4544/86265 [>.............................] - ETA: 4:04 - loss: 0.2334 - acc: 0.9609

KeyboardInterrupt: 

In [None]:
# Write the averaged predictions into the sample-submission template.
file_name = 'baseline1.csv'
label_cols_ini = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

sample_submission = pd.read_csv(path + 'sample_submission.csv')

# Keep only the first len(label_cols_ini) prediction columns, in label order.
n_submit_cols = len(label_cols_ini)
sample_submission[label_cols_ini] = y_test[:, :n_submit_cols]

sample_submission.to_csv(path + file_name, index=False)

print('done')