In [1]:
import os

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split

# Names of the base models. Each one has a sub-directory of PATH holding
# `train_<name>.csv` and `<name>.csv`.
model = ['nblogreg', 'nbnn', 'cnn', 'lstm']
# Columns selected from every CSV read in this cell.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
PATH = '../../data/'

# Read the label columns of every model's two CSVs. The per-model frames are
# later placed side by side in `model` order, so positional column
# k * len(label_cols) + m belongs to model k and label m.
train_parts, test_parts = [], []
for name in model:
    train_file = PATH + name + '/train_' + name + '.csv'
    test_file = PATH + name + '/' + name + '.csv'
    print(train_file)
    print(test_file)

    train_parts.append(pd.read_csv(train_file)[label_cols])
    test_parts.append(pd.read_csv(test_file)[label_cols])

# Concatenate once per frame instead of re-concatenating on every iteration.
# NOTE(review): every model contributes the same six column names, so the
# resulting frames carry duplicate column labels; downstream code should rely
# on column position, not name.
train = pd.concat(train_parts, axis=1)
test = pd.concat(test_parts, axis=1)

# Label columns of the original training file (presumably the ground-truth
# targets for the rows in `train` -- confirm the row order matches).
y = pd.read_csv(PATH + 'train.csv')[label_cols]

# `train` and `y` are paired purely by row position, so fail early and clearly
# if the files do not describe the same number of rows.
assert len(train) == len(y), (
    'stacked train predictions have %d rows but train.csv has %d' % (len(train), len(y))
)

print('done')

../../data/nblogreg/train_nblogreg.csv
../../data/nblogreg/nblogreg.csv
../../data/nbnn/train_nbnn.csv
../../data/nbnn/nbnn.csv
../../data/cnn/train_cnn.csv
../../data/cnn/cnn.csv
../../data/lstm/train_lstm.csv
../../data/lstm/lstm.csv
done


In [4]:
def print_feature_importance(x, label, models):
    """Print a dict mapping '<model> <label>' to the matching entry of ``x``.

    ``x`` is read positionally: entry ``p`` is attributed to
    ``models[p // len(label)]`` and ``label[p % len(label)]``, i.e. the values
    are expected model-by-model with one entry per label inside each model.

    Args:
        x: sequence of importance values.
        label: sequence of label names.
        models: sequence of model names.

    Returns:
        None. The mapping is written to stdout with ``print``.
    """
    labels_per_model = len(label)
    importance_by_feature = {}
    for position, score in enumerate(x):
        # divmod yields (which model, which label within that model).
        model_idx, label_idx = divmod(position, labels_per_model)
        feature_name = models[model_idx] + ' ' + label[label_idx]
        importance_by_feature[feature_name] = score
    print(importance_by_feature)

# Hold out 20% of the rows (fixed seed) to use as CatBoost's evaluation set.
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=42)

# CatBoost settings shared by every per-label model.
# NOTE(review): with these values the recorded run for 'toxic' only reaches a
# log-loss of ~0.684, barely below the ~0.693 of a constant 0.5 prediction.
# These look like smoke-test settings; confirm before using the output.
N_ITERATIONS = 3
TREE_DEPTH = 10
LEARNING_RATE = 0.001

# One column of predicted probabilities per label, one row per test sample.
out = np.zeros((test.shape[0], len(label_cols)))
for col_idx, target in enumerate(label_cols):
    print('fit ' + target)
    ensemble = CatBoostClassifier(iterations=N_ITERATIONS,
                                  depth=TREE_DEPTH,
                                  learning_rate=LEARNING_RATE,
                                  loss_function='Logloss')
    # eval_set is passed as an (X, y) tuple, the documented form; a bare
    # two-element list is interpreted as a list of evaluation sets.
    ensemble.fit(X_train, y_train[target],
                 use_best_model=True,
                 eval_set=(X_test, y_test[target]))
    # Wrap features and target in a Pool so they travel as the single `data`
    # argument rather than relying on a positional `y` parameter.
    importances = ensemble.get_feature_importance(Pool(X_train, y_train[target]))
    print_feature_importance(importances, label_cols, model)
    # predict_proba returns one column per class; column 1 is the probability
    # of the positive class (assuming the label columns are binary 0/1).
    out[:, col_idx] = ensemble.predict_proba(test.values)[:, 1]

print('done')

fit toxic
0:	learn: 0.6900042	test: 0.6900015	best: 0.6900015 (0)	total: 409ms	remaining: 818ms
1:	learn: 0.6870334	test: 0.6870299	best: 0.6870299 (1)	total: 855ms	remaining: 428ms
2:	learn: 0.6842185	test: 0.6842124	best: 0.6842124 (2)	total: 1.27s	remaining: 0us

bestTest = 0.6842124342
bestIteration = 2

Shrink model to first 3 iterations.
{'nblogreg toxic': 27.692309749799186, 'nblogreg severe_toxic': 2.9151213125746125, 'nblogreg obscene': 0.0, 'nblogreg threat': 1.5168054080837543, 'nblogreg insult': 0.0, 'nblogreg identity_hate': 0.0, 'nbnn toxic': 0.31070158834368655, 'nbnn severe_toxic': 3.1259900713758, 'nbnn obscene': 1.6315540182915158, 'nbnn threat': 0.0, 'nbnn insult': 0.0, 'nbnn identity_hate': 1.1173041802318364, 'cnn toxic': 0.0, 'cnn severe_toxic': 0.9327183746499648, 'cnn obscene': 13.773422199213412, 'cnn threat': 4.713490131061553, 'cnn insult': 1.6109989816193377, 'cnn identity_hate': 11.281767237427788, 'lstm toxic': 3.5394893384630706, 'lstm severe_toxic': 0.0,

In [5]:
# Start from the sample submission and overwrite its label columns with the
# ensemble's predicted probabilities (`out` has one column per entry of
# `label_cols`, in the same order).
submission = pd.read_csv(PATH + 'sample_submission.csv')
submission[label_cols] = out

# Create the output directory up front: to_csv does not create missing parent
# directories, and failing here would waste the whole training run.
output_dir = PATH + 'ensemble/'
os.makedirs(output_dir, exist_ok=True)
submission.to_csv(output_dir + 'catboost_ensemble.csv', index=False)
print('done')

done
