In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

# Base models whose predictions are stacked; each one has its own
# sub-directory under PATH holding a train-set and a test-set prediction file.
model = ['nblogreg', 'nbnn', 'cnn', 'lstm']
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
PATH = '../../data/'

# Collect every model's prediction frames first and concatenate once at the
# end, instead of special-casing model[0] and re-concatenating on every pass.
train_parts = []
test_parts = []
for name in model:
    train_file = PATH + name + '/train_' + name + '.csv'
    test_file = PATH + name + '/' + name + '.csv'
    print(train_file)
    print(test_file)

    train_parts.append(pd.read_csv(train_file)[label_cols])
    test_parts.append(pd.read_csv(test_file)[label_cols])

# Columns are laid out model-major: all label columns of model[0], then all
# of model[1], and so on. Column names therefore repeat once per model.
train = pd.concat(train_parts, axis=1)
test = pd.concat(test_parts, axis=1)

# Ground-truth labels for the training rows.
y = pd.read_csv(PATH + 'train.csv')[label_cols]

# The stacked features and the labels are paired row by row further down, so a
# row-count mismatch is reported here rather than surfacing later in training.
if len(train) != len(y):
    raise ValueError(
        'stacked train predictions have %d rows but train.csv has %d'
        % (len(train), len(y))
    )

print('done')

../../data/nblogreg/train_nblogreg.csv
../../data/nblogreg/nblogreg.csv
../../data/nbnn/train_nbnn.csv
../../data/nbnn/nbnn.csv
../../data/cnn/train_cnn.csv
../../data/cnn/cnn.csv
../../data/lstm/train_lstm.csv
../../data/lstm/lstm.csv
done


In [8]:
def print_feature_importance(x, label, models):
    """Print a mapping from '<model> <label>' names to importance values.

    The entry at position ``p`` of ``x`` is named after
    ``models[p // len(label)]`` and ``label[p % len(label)]``, i.e. ``x`` is
    read as model-major blocks of ``len(label)`` values each.

    Args:
        x: Indexable sequence of importance values.
        label: Sequence of label names (strings).
        models: Sequence of model names (strings).

    Returns:
        None. The mapping is only printed.
    """
    n_labels = len(label)
    importances = {
        models[pos // n_labels] + ' ' + label[pos % n_labels]: x[pos]
        for pos in range(len(x))
    }
    print(importances)

# Hold out 20% of the stacked base-model predictions as an evaluation set for
# the per-label ensembles; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=42)

# One row per test sample, one column per label, filled label by label below.
out = np.zeros((test.shape[0], len(label_cols)))
for i, j in enumerate(label_cols):
    print('fit ' + j)
    # NOTE(review): iterations=2 with learning_rate=1 looks like a quick
    # smoke-test configuration rather than tuned values -- confirm before
    # relying on the resulting predictions.
    ensemble = CatBoostClassifier(iterations=2,
                                  depth=10,
                                  learning_rate=1,
                                  loss_function='Logloss')
    # eval_set is passed as an (X, y) tuple: CatBoost documents eval_set as a
    # Pool, an (X, y) tuple, or a list of those, so a bare [X, y] list is not
    # a supported form.
    ensemble.fit(X_train, y_train[j], use_best_model=True, eval_set=(X_test, y_test[j]))
    # NOTE(review): the positional (X, y) arguments follow an older
    # get_feature_importance signature; newer CatBoost releases appear to take
    # a single data/Pool argument -- confirm against the installed version.
    print_feature_importance(ensemble.get_feature_importance(X_train, y_train[j]), label_cols, model)
    # predict_proba returns one column per class; assuming 0/1 labels,
    # column 1 is the probability of the positive class.
    out[:, i] = ensemble.predict_proba(test.values)[:, 1]

print('done')

fit toxic




0:	learn: 0.0785500	test: 0.0767258	best: 0.0767258 (0)	total: 391ms	remaining: 391ms
1:	learn: 0.0727782	test: 0.0729950	best: 0.0729950 (1)	total: 801ms	remaining: 0us

bestTest = 0.07299498012
bestIteration = 1

Shrink model to first 2 iterations.
{'nblogreg toxic': 0.0, 'nblogreg severe_toxic': 0.0, 'nblogreg obscene': 7.630087618029716, 'nblogreg threat': 0.0, 'nblogreg insult': 0.0, 'nblogreg identity_hate': 0.0, 'nbnn toxic': 7.47615096704684, 'nbnn severe_toxic': 2.7419533575533914, 'nbnn obscene': 3.0078189219644482, 'nbnn threat': 0.0, 'nbnn insult': 0.0, 'nbnn identity_hate': 0.0, 'cnn toxic': 0.6884832688745484, 'cnn severe_toxic': 5.911534142716549, 'cnn obscene': 0.6109550527214058, 'cnn threat': 26.685459116917848, 'cnn insult': 1.5106881016550826, 'cnn identity_hate': 0.0, 'lstm toxic': 0.9907910482597487, 'lstm severe_toxic': 0.5514297441555676, 'lstm obscene': 0.0, 'lstm threat': 0.0, 'lstm insult': 0.7806375203224167, 'lstm identity_hate': 41.41401113978243}
fit seve

KeyboardInterrupt: 

In [6]:
# Start from the sample submission file, overwrite its label columns with the
# ensemble probabilities held in `out`, and write the result to disk.
template_path = PATH + 'sample_submission.csv'
output_path = PATH + 'ensemble/ensemble.csv'

submission = pd.read_csv(template_path)
submission[label_cols] = out
submission.to_csv(output_path, index=False)

print('done')

done
