In [1]:
import pandas as pd
import numpy as np
import re, string
from keras.models import Model
from keras.layers import Dense, Input, Dropout
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from keras.callbacks import EarlyStopping, ModelCheckpoint

# read data

path = '../data/'

train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['other'] = 1 - train[label_cols].max(axis=1)
label_cols.append('other')

print(label_cols)

train['comment_text'].fillna("unknown", inplace=True)
test['comment_text'].fillna("unknown", inplace=True)

print(train.head())
print(train.shape[0])

Using TensorFlow backend.


['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'other']
         id                                       comment_text  toxic  \
0  22256635  Nonsense?  kiss off, geek. what I said is true...      1   
1  27450690  "\n\n Please do not vandalize pages, as you di...      0   
2  54037174  "\n\n ""Points of interest"" \n\nI removed the...      0   
3  77493077  Asking some his nationality is a Racial offenc...      0   
4  79357270  The reader here is not going by my say so for ...      0   

   severe_toxic  obscene  threat  insult  identity_hate  other  
0             0        0       0       0              0      0  
1             0        0       0       0              0      1  
2             0        0       0       0              0      1  
3             0        0       0       0              0      1  
4             0        0       0       0              0      1  
95851


In [2]:
C = train[label_cols].corr()
print(C)

                  toxic  severe_toxic   obscene    threat    insult  \
toxic          1.000000      0.308810  0.677491  0.162967  0.648330   
severe_toxic   0.308810      1.000000  0.404540  0.133469  0.377450   
obscene        0.677491      0.404540  1.000000  0.149874  0.744685   
threat         0.162967      0.133469  0.149874  1.000000  0.157534   
insult         0.648330      0.377450  0.744685  0.157534  1.000000   
identity_hate  0.259124      0.193385  0.287794  0.123971  0.331922   
other         -0.968241     -0.299002 -0.703519 -0.167516 -0.678137   

               identity_hate     other  
toxic               0.259124 -0.968241  
severe_toxic        0.193385 -0.299002  
obscene             0.287794 -0.703519  
threat              0.123971 -0.167516  
insult              0.331922 -0.678137  
identity_hate       1.000000 -0.274396  
other              -0.274396  1.000000  


In [46]:
# get tfidf

# re_tok = re.compile(f'([{string.punctuation}¨«»®´·º½¾¿¡§£₤‘’\d+])')
re_tok = re.compile(f'([{string.punctuation}¨«»®´·º½¾¿¡§£₤‘’\d+])')
def tokenize(s): return re_tok.sub(r' ', s).lower().split()

# CountVectorizer(input=’content’, encoding=’utf-8’, decode_error=’strict’, strip_accents=None,\
#                 lowercase=True, preprocessor=None, tokenizer=None, stop_words=None,\
#                 token_pattern=’(?u)\b\w\w+\b’, ngram_range=(1, 1), analyzer=’word’, max_df=1.0,\
#                 min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class ‘numpy.int64’>)

# TfidfVectorizer(input=’content’, encoding=’utf-8’, decode_error=’strict’, strip_accents=None,
#                 lowercase=True, preprocessor=None, tokenizer=None, analyzer=’word’, stop_words=None,\
#                 token_pattern=’(?u)\b\w\w+\b’, ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None,\
#                 vocabulary=None, binary=False, dtype=<class ‘numpy.int64’>, norm=’l2’, use_idf=True,\
#                 smooth_idf=True, sublinear_tf=False)

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words='english', tokenizer=tokenize,\
                                   max_df=0.9, min_df=3, strip_accents='unicode', use_idf=1,\
                                   smooth_idf=1, sublinear_tf=1)

train_tfidf = tfidf_vectorizer.fit_transform(train['comment_text'])
tfidf_name = tfidf_vectorizer.get_feature_names()
test_tfidf = tfidf_vectorizer.transform(test['comment_text'])

print(train_tfidf.shape)
print(test_tfidf.shape)

(95851, 162619)
(226998, 162619)


In [44]:
def f(y_i, y):
    return (np.squeeze(np.array(train_tfidf[y == y_i].sum(0) + 1))) / ((y == y_i).sum() + 1)

r = []
for i, j in enumerate(label_cols):
    y = train[j].values
    r.append(np.log(f(1, y) / f(0, y)))
    
r = np.array(r)
print(r.shape)

print(train_tfidf.shape)
X_t = train_tfidf.multiply(np.transpose(r))
y = train[label_cols].values

X_test = test_tfidf.multiply(np.transpose(r))

(7, 55482)
(95851, 55482)


ValueError: inconsistent shapes

In [4]:
unit_size = 200
dropout_rate = 0.1

def get_model():
    input = Input(shape=(X_t.shape[1], ), sparse=True)
    x = Dense(unit_size, activation='tanh')(input)
    x = Dropout(dropout_rate)(x)
    x = Dense(unit_size, activation='tanh')(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(len(label_cols), activation='softmax')(x)
    model = Model(inputs=input, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

model = get_model()

batch_size = 32
epochs = 2

file_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

earlystopping = EarlyStopping(monitor="val_loss", mode="min", patience=20)

callbacks_list = [checkpoint, earlystopping]
model.fit(X_t, y, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=callbacks_list)

model.load_weights(file_path)

y_test = model.predict(X_test)

print('done')

Train on 86265 samples, validate on 9586 samples
Epoch 1/2
Epoch 2/2
done


In [7]:
label_cols_ini = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
file_name = 'baseline1.csv'

sample_submission = pd.read_csv(path + 'sample_submission.csv')

sample_submission[label_cols_ini] = y_test[:, : -1]

sample_submission.to_csv(path + file_name, index=False)

print('done')

done
