In [6]:
import os

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

model = ['nblogreg', 'nbnn']
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
PATH = '../../data/'

train_file = PATH + model[0] + '/train_' + model[0] + '.csv'
test_file = PATH + model[0] + '/' + model[0] + '.csv'
print(train_file)
print(test_file)

train = pd.read_csv(train_file)[label_cols]
test = pd.read_csv(test_file)[label_cols]

for i in range(1, len(model)):
    train_file = PATH + model[i] + '/train_' + model[i] + '.csv'
    test_file = PATH + model[i] + '/' + model[i] + '.csv'
    print(train_file)
    print(test_file)

    train = pd.concat([train, pd.read_csv(train_file)[label_cols]], axis=1)
    test = pd.concat([test, pd.read_csv(test_file)[label_cols]], axis=1)    

y = pd.read_csv(PATH + 'train.csv')[label_cols]
    
print(train.head())

../../data/nblogreg/train_nblogreg.csv
../../data/nblogreg/nblogreg.csv
../../data/nbnn/train_nbnn.csv
../../data/nbnn/nbnn.csv
      toxic  severe_toxic   obscene    threat    insult  identity_hate  \
0  0.010744      0.001672  0.003580  0.000165  0.004071       0.000924   
1  0.002284      0.000995  0.001188  0.000067  0.001180       0.000235   
2  0.031033      0.000498  0.021042  0.000348  0.013775       0.000553   
3  0.000497      0.000100  0.000873  0.000029  0.000385       0.000162   
4  0.017553      0.001465  0.018992  0.000101  0.025217       0.000806   

      toxic  severe_toxic   obscene        threat    insult  identity_hate  
0  0.000473  1.535056e-05  0.000264  1.213208e-06  0.000155       0.000015  
1  0.001564  9.673443e-05  0.000259  3.343771e-05  0.000162       0.000121  
2  0.016867  1.114338e-05  0.011044  6.280925e-05  0.001071       0.000096  
3  0.000063  2.725394e-07  0.000059  5.595627e-07  0.000015       0.000006  
4  0.032954  2.827321e-03  0.013184  5.560

In [16]:
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=42)

out = np.zeros((test.shape[0], len(label_cols)))
for i, j in enumerate(label_cols):
    print('fit' + j)
    ensemble = CatBoostClassifier(iterations=2,
                                  depth=10, 
                                  learning_rate=1, 
                                  loss_function='Logloss')
    ensemble.fit(X_train, y_train[j], use_best_model=True, eval_set=[X_test, y_test[j]], verbose=1)
    out[:, i] = ensemble.predict_proba(test.values)[:, 1]

print('done')



0:	learn: 0.1035007	test: 0.0994751	best: 0.0994751 (0)	total: 307ms	remaining: 307ms
1:	learn: 0.0973920	test: 0.0949252	best: 0.0949252 (1)	total: 664ms	remaining: 0us

bestTest = 0.09492518297
bestIteration = 1

Shrink model to first 2 iterations.
0:	learn: 0.0187259	test: 0.0203716	best: 0.0203716 (0)	total: 285ms	remaining: 285ms
1:	learn: 0.0173660	test: 0.0194504	best: 0.0194504 (1)	total: 581ms	remaining: 0us

bestTest = 0.01945040417
bestIteration = 1

Shrink model to first 2 iterations.
0:	learn: 0.0558708	test: 0.0571707	best: 0.0571707 (0)	total: 399ms	remaining: 399ms
1:	learn: 0.0505905	test: 0.0511708	best: 0.0511708 (1)	total: 832ms	remaining: 0us

bestTest = 0.0511707841
bestIteration = 1

Shrink model to first 2 iterations.
0:	learn: 0.0085228	test: 0.0073064	best: 0.0073064 (0)	total: 283ms	remaining: 283ms
1:	learn: 0.0055553	test: 0.0046949	best: 0.0046949 (1)	total: 576ms	remaining: 0us

bestTest = 0.004694885543
bestIteration = 1

Shrink model to first 2 iteratio

In [17]:
# Preview the prediction matrix: one row per row of `test`, one column per
# entry of label_cols (NumPy abbreviates the middle rows when printing).
print(out)

[[  9.88560720e-01   1.57462479e-01   9.32625625e-01   6.99716735e-04
    9.33993116e-01   5.74468101e-01]
 [  1.61921409e-04   4.31154792e-05   2.43151022e-03   3.84230353e-06
    2.11472232e-04   6.49072827e-06]
 [  2.81487512e-04   1.04967663e-03   8.58766571e-04   1.60476909e-05
    4.51090665e-04   3.89340254e-05]
 ..., 
 [  2.81487512e-04   2.79129932e-05   1.01411174e-03   3.77105049e-06
    1.91220564e-04   6.49072827e-06]
 [  1.26144748e-02   1.80522833e-04   3.06371249e-03   1.61474974e-04
    7.69804554e-03   6.07999586e-05]
 [  9.88560720e-01   1.52397478e-02   7.91709011e-01   1.06761357e-02
    2.22365160e-01   2.14228791e-03]]


In [18]:
# Start from the sample submission so that its non-label columns and row
# order are kept, then overwrite the label columns with the predictions.
submission = pd.read_csv(PATH + 'sample_submission.csv')
submission[label_cols] = out

# to_csv does not create missing parent directories; make sure the output
# folder exists so the write cannot fail after all the training work.
ensemble_dir = PATH + 'ensemble/'
os.makedirs(ensemble_dir, exist_ok=True)
submission.to_csv(ensemble_dir + 'ensemble.csv', index=False)
print('done')