In [8]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# read data

path = '../data/'

train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# train['other'] = 1 - train[label_cols].max(axis=1)
# label_cols.append('other')
print(label_cols)

train['comment_text'].fillna("unknown", inplace=True)
test['comment_text'].fillna("unknown", inplace=True)

print(train.head())
print(train.shape[0])

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
         id                                       comment_text  toxic  \
0  22256635  Nonsense?  kiss off, geek. what I said is true...      1   
1  27450690  "\n\n Please do not vandalize pages, as you di...      0   
2  54037174  "\n\n ""Points of interest"" \n\nI removed the...      0   
3  77493077  Asking some his nationality is a Racial offenc...      0   
4  79357270  The reader here is not going by my say so for ...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
95851


In [9]:
# get tfidf

import re, string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

re_tok = re.compile(f'([{string.punctuation}¨«»®´·º½¾¿¡§£₤‘’\d+])')
def tokenize(s): return re_tok.sub(r' ', s).lower().split()

# CountVectorizer(input=’content’, encoding=’utf-8’, decode_error=’strict’, strip_accents=None,\
#                 lowercase=True, preprocessor=None, tokenizer=None, stop_words=None,\
#                 token_pattern=’(?u)\b\w\w+\b’, ngram_range=(1, 1), analyzer=’word’, max_df=1.0,\
#                 min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class ‘numpy.int64’>)

# TfidfVectorizer(input=’content’, encoding=’utf-8’, decode_error=’strict’, strip_accents=None,\
#                 lowercase=True, preprocessor=None, tokenizer=None, analyzer=’word’, stop_words=None,\
#                 token_pattern=’(?u)\b\w\w+\b’, ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None,\
#                 vocabulary=None, binary=False, dtype=<class ‘numpy.int64’>, norm=’l2’, use_idf=True,\
#                 smooth_idf=True, sublinear_tf=False)

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,3), stop_words='english', tokenizer=tokenize,\
                                   max_df=0.9, min_df=3, strip_accents='unicode', use_idf=1,\
                                   smooth_idf=1, sublinear_tf=1)

train_tfidf = tfidf_vectorizer.fit_transform(train['comment_text'])
tfidf_name = tfidf_vectorizer.get_feature_names()
test_tfidf = tfidf_vectorizer.transform(test['comment_text'])

train_tfidf

<95851x197832 sparse matrix of type '<class 'numpy.float64'>'
	with 3805393 stored elements in Compressed Sparse Row format>

In [37]:
def pr(y_i, y):
    p = train_tfidf[y==y_i].sum(0)
    return (p + 1) / ((y == y_i).sum() + 1)

def get_nn():
    input = Input(shape=(train_tfidf.shape[1], ), sparse=True)
    x = Dense(200, activation='tanh')(input)
    x = Dropout(dropout_rate)(x)
    x = Dense(200, activation='tanh')(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(200, activation='relu')(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(2, activation='softmax')(x)
    model = Model(inputs=input, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

file_path="weights_base.best.hdf5"

def f(x):
    if x == 0: return 1
    else: return 0

def get_model(y):
    print(y.columns.values)
    y['other'] = y[y.columns.values].apply(f)
    y = y.values
    r = np.log(pr(1, y[:, 0]) / pr(0, y[:, 0]))
#     m = LogisticRegression(C=4, dual=True)
    m = get_nn()
    x_nb = train_tfidf.multiply(r)
    checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    earlystopping = EarlyStopping(monitor="val_loss", mode="min", patience=20)
    callbacks_list = [checkpoint, earlystopping]
    m.fit(x_nb.tocsr(), y, batch_size=32, epochs=2, validation_split=0.1, callbacks=callbacks_list)
    return m, r

preds = np.zeros((len(test), len(label_cols)))
train_preds = np.zeros((len(train), len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    m, r = get_model(train[j].to_frame())
    m.load_weights(file_path)
    train_preds[:, i] = m.predict(train_tfidf.multiply(r))[:, 1]
    preds[:, i] = m.predict_proba(test_tfidf.multiply(r))[:, 1]
    
print(preds.shape)

fit toxic
['toxic']
0        1
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
13       0
14       0
15       0
16       0
17       0
18       0
19       0
20       1
21       0
22       0
23       0
24       0
25       0
26       1
27       0
28       0
29       0
        ..
95821    0
95822    0
95823    0
95824    0
95825    0
95826    0
95827    0
95828    0
95829    0
95830    0
95831    0
95832    0
95833    0
95834    0
95835    0
95836    0
95837    0
95838    0
95839    0
95840    0
95841    0
95842    0
95843    0
95844    0
95845    1
95846    0
95847    1
95848    0
95849    0
95850    0
Name: toxic, Length: 95851, dtype: int64
one
0        1
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
13       0
14       0
15       0
16       0
17       0
18       0
19       0
20       1
21       0
22       0
23       0


ValueError: ('The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().', 'occurred at index toxic')

In [6]:
X_t = train_preds.tolist()
y = train[label_cols].values
X_test = preds.tolist()
print('done')

done


In [18]:
print(len(X_t))
print(len(X_t[0]))

95851
7


In [7]:
from keras.models import Model
from keras.layers import Dense, Input, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint

unit_size = 200
dropout_rate = 0.2

def get_model():
    input = Input(shape=(len(X_t[0]), ))
    x = Dense(unit_size, activation='tanh')(input)
    x = Dropout(dropout_rate)(x)
    x = Dense(unit_size, activation='tanh')(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(unit_size, activation='relu')(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(len(label_cols), activation='softmax')(x)
    model = Model(inputs=input, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

model = get_model()

batch_size = 32
epochs = 3
file_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
earlystopping = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [checkpoint, earlystopping]
model.fit(X_t, y, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=callbacks_list)

model.load_weights(file_path)

y_test = model.predict(X_test)

print('done')

Using TensorFlow backend.


Train on 86265 samples, validate on 9586 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
done


In [25]:
print(y_test.shape)

(226998, 7)


In [41]:
label_cols_ini = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
file_name = 'baseline.csv'

sample_submission = pd.read_csv(path + 'sample_submission.csv')

sample_submission[label_cols_ini] = y_test #[:, : -1]

sample_submission.to_csv(path + file_name, index=False)

print('done')

done
