In [37]:
%matplotlib inline
import os

# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"] = ""
os.environ["KERAS_BACKEND"] = "tensorflow"
import pathlib
import pickle
import re
import string
import shutil

from matplotlib import pyplot as plt

import pandas as pd
import numpy as np
import scipy as sp

import sklearn as sk
from sklearn.feature_selection import mutual_info_classif, SelectKBest, chi2, f_classif
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight

from gensim.models import KeyedVectors

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, LSTM, Input, GlobalMaxPooling1D, Conv1D, MaxPooling1D, Embedding, Flatten, Concatenate
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras import regularizers
from keras import optimizers
from keras import losses
from keras import metrics
from tqdm import tqdm, tqdm_notebook, tnrange, trange
import hyperas
from hyperas import optim
from hyperas.distributions import choice, uniform, conditional
from hyperopt import Trials, STATUS_OK, tpe, STATUS_FAIL

In [31]:
from keras_tqdm import TQDMNotebookCallback

In [2]:
tqdm.monitor_interval = 0
tqdm_notebook().pandas()




In [137]:
cats = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
cat_col_i = {'toxic':0,'severe_toxic':1,'obscene':2,'threat':3,'insult':4,'identity_hate':5}

In [4]:
train = pd.read_pickle('CLEAN/train.P.gz', compression='gzip')

In [5]:
test = pd.read_pickle('CLEAN/test.P.gz', compression='gzip')

In [6]:
train.iloc[0:5]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_comments
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,False,False,False,False,False,False,explanation \n why the edit make under my user...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,False,False,False,False,False,False,daww he match this background colour I be seem...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",False,False,False,False,False,False,hey man I be really not try to edit war its ju...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",False,False,False,False,False,False,more \n I can not make any real suggestion on ...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",False,False,False,False,False,False,you sir be my hero any chance you remember wha...


In [7]:
def evaluate_model(model, dataset):
    probs = model.predict_proba(x=X_trains[dataset], batch_size=256)
    print(classification_report(y_true=Y_train[dataset], y_pred=(probs >= 0.5)))
    return probs

In [8]:
def data():
    import gc
    gc.collect
    hyperas_cat = 'toxic'
    targdir = pathlib.Path("WORDBAG")
    with open(targdir/"X_trains.P", 'rb') as f:
        X_trains = pickle.load(f)
    with open(targdir/"Y_train.P", 'rb') as f:
        Y_train = pickle.load(f)
    x_train, x_test, y_train, y_test = train_test_split(X_trains[hyperas_cat], Y_train[hyperas_cat], 
                                          test_size=0.3,
                                          train_size=0.7, 
                                          stratify=Y_train[hyperas_cat])
    return x_train, y_train, x_test, y_test
    

In [9]:
def create_model(x_train, y_train, x_test, y_test):
    import gc
    gc.collect
    keras.backend.clear_session()
#     kern_reg_rate=0.0001
#     bias_reg_rate=0.0001
#     learning_rate = 0.0001
    class_weight = compute_class_weight('balanced', [False, True], y_train)
    model = Sequential()
    model.add(Dense(256, 
              activation='relu', 
              kernel_regularizer=regularizers.l2({{uniform(0.0001,0.0005)}}),
              bias_regularizer=regularizers.l2({{uniform(0.0001,0.0005)}}),
              input_dim=X_trains[hyperas_cat].shape[1]
             )
           )
    
    model.add(Dropout({{uniform(0.0,1.0/3.0)}}))
    model.add(Dense(256, 
              activation='relu', 
              kernel_regularizer=regularizers.l2({{uniform(0.0001,0.0005)}}),
              bias_regularizer=regularizers.l2({{uniform(0.0001,0.0005)}}),
             )
           )
    model.add(Dropout({{uniform(0.0,1.0/3.0)}}))
    model.add(Dense(128, 
              activation='relu',
              kernel_regularizer=regularizers.l2({{uniform(0.0001,0.0005)}}),
              bias_regularizer=regularizers.l2({{uniform(0.0001,0.0005)}}),
             )
           )
    model.add(Dropout({{uniform(0.0,1.0/3.0)}}))
    model.add(Dense(128, 
              activation='relu',
              kernel_regularizer=regularizers.l2({{uniform(0.0001,0.0005)}}),
              bias_regularizer=regularizers.l2({{uniform(0.0001,0.0005)}}),
             )
           )
    model.add(Dropout({{uniform(0.0,1.0/3.0)}}))
    model.add(Dense(1,activation='hard_sigmoid',
                  kernel_regularizer=regularizers.l2({{uniform(0.0001,0.0005)}}),
                  bias_regularizer=regularizers.l2({{uniform(0.0001,0.0005)}}),
                   ))
    model.compile(loss=losses.binary_crossentropy, optimizer=optimizers.Adam(lr={{uniform(0.0005,0.001)}}), metrics=[metrics.binary_accuracy])
    
#     print(model.summary())
    try:
        model.fit(x=x_train, y=y_train, 
                  batch_size={{choice([32,64,128,256])}}, 
                  epochs=1, 
                  class_weight=class_weight,
                  validation_data = (x_test, y_test),
                  verbose=0,
                  callbacks=[TQDMNotebookCallback(leave_inner=True, leave_outer=True)]
               )
    except:
        print("Caught an exception while training model, marking as failed")
        e = sys.exc_info()[0]
        print( "Error: %s" % e )
        return {'loss': np.inf, 'status': STATUS_FAIL}
    finally:
        pass
    score, acc = model.evaluate(x_test, y_test, verbose=0)
    print("Test accuracy:", acc)
    return {'loss': -acc, 'status': STATUS_OK}

In [10]:
best_run = optim.minimize(model=create_model, 
                                      data=data, 
                                      algo=tpe.suggest, 
                                      max_evals=60,
                                      trials=Trials(),
                                      notebook_name='ToxicWikiCommentsKerasMLP'
                                     )
x_train, y_train, x_test, y_test = data()

>>> Imports:
#coding=utf-8

try:
    import os
except:
    pass

try:
    import pathlib
except:
    pass

try:
    import pickle
except:
    pass

try:
    import re
except:
    pass

try:
    import string
except:
    pass

try:
    import shutil
except:
    pass

try:
    from matplotlib import pyplot as plt
except:
    pass

try:
    import pandas as pd
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    import scipy as sp
except:
    pass

try:
    import sklearn as sk
except:
    pass

try:
    from sklearn.feature_selection import mutual_info_classif, SelectKBest, chi2
except:
    pass

try:
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
except:
    pass

try:
    from sklearn.naive_bayes import GaussianNB
except:
    pass

try:
    from sklearn.linear_model import SGDClassifier
except:
    pass

try:
    from sklearn.metrics import classification_report
except:
    pass

try:
    from sklearn.naive_bayes i

Test accuracy: 0.9523938836898396
Train on 111699 samples, validate on 47872 samples
Epoch 1/1
 - 729s - loss: 0.3439 - binary_accuracy: 0.9428 - val_loss: 0.2677 - val_binary_accuracy: 0.9531
Test accuracy: 0.9530623328877005
Train on 111699 samples, validate on 47872 samples
Epoch 1/1
 - 564s - loss: 0.3837 - binary_accuracy: 0.9351 - val_loss: 0.3618 - val_binary_accuracy: 0.9515
Test accuracy: 0.9514538770053476
Train on 111699 samples, validate on 47872 samples
Epoch 1/1
 - 728s - loss: 0.3461 - binary_accuracy: 0.9423 - val_loss: 0.2719 - val_binary_accuracy: 0.9580
Test accuracy: 0.9579921457219251
Train on 111699 samples, validate on 47872 samples
Epoch 1/1
 - 728s - loss: 0.3561 - binary_accuracy: 0.9328 - val_loss: 0.2847 - val_binary_accuracy: 0.9568
Test accuracy: 0.9567805815508021
Train on 111699 samples, validate on 47872 samples
Epoch 1/1
 - 566s - loss: 0.3398 - binary_accuracy: 0.9411 - val_loss: 0.3236 - val_binary_accuracy: 0.9451
Test accuracy: 0.9451244986631016
T

In [11]:
# print("evaluation of best model:")
# print(best_model.evaluate(x_test, y_test))
print("best model hyper params:")
for param_name in best_run[0]:
    print("%s\t%s" % (param_name, best_run[0][param_name]))

best model hyper params:
Dropout	0.16886555405544676
Dropout_1	0.2793992939618896
Dropout_2	0.09045320380196045
Dropout_3	0.3190083272156171
batch_size	3
l2	0.00010138559834998214
l2_1	0.00021410123380801553
l2_2	0.00021129153971581485
l2_3	0.0002098318641179596
l2_4	0.00017332801538861371
l2_5	0.00033975261470378204
l2_6	0.000166878108440587
l2_7	0.00027661710454657295
l2_8	0.0004531484385885225
l2_9	0.00016227341161357368
lr	0.0009988382560284495


In [12]:
# seq = Sequential()
# seq.add(Dense(layer_neurons[0], 
#               activation={{choice(['tanh', 'relu', 'sigmoid'])}}, 
#               kernel_regularizer=regularizers.l2({{uniform(0.0001,0.0005)}}),
#               bias_regularizer=regularizers.l2({{uniform(0.0001,0.0005)}}),
#     #           activity_regularizer=regularizers.l1_l2(0.01),
#               input_dim=X_trains[hyperas_cat].shape[1]
#              )
#            )

In [21]:
os.makedirs("WORDBAG", exist_ok=True)
targdir = pathlib.Path("WORDBAG")
with open(targdir/"X_trains.P", 'rb') as f:
    X_trains = pickle.load(f)
with open(targdir/"Y_train.P", 'rb') as f:
    Y_train = pickle.load(f)
with open(targdir/"vectorizer_tfidf.P", 'rb') as f:
    tfidf_vect = pickle.load(f)
with open(targdir/"vectorizer_count.P", 'rb') as f:
    cvect = pickle.load(f)
with open(targdir/"selectors.P", 'rb') as f:
    selectors = pickle.load(f)
with open(targdir/"normalizer.P", 'rb') as f:
    norm = pickle.load(f)

In [39]:
def multi_label_f_classif_scores(X, y):
    selected_features = []
    for cat in cats:
        selector = SelectKBest(f_classif, k='all')
        selector.fit(X, y[:,cat_col_i[cat]])
        selected_features.append(list(selector.scores_))
    return selected_features

def multi_label_f_classif_mean(X, y):
    return np.mean(multi_label_f_classif_scores(X, y), axis=0)

def multi_label_f_classif_max(X, y):
    return np.max(multi_label_f_classif_scores(X, y), axis=0)

In [40]:
with open(targdir/"all_selected.P", 'rb') as f:
    all_selected=pickle.load(f)
with open(targdir/"all_selector.P", 'rb') as f:
    all_selector=pickle.load(f)

In [45]:
all_selected.shape

(159571, 200000)

In [14]:
# sample_weights = compute_sample_weight('balanced', Y_train['toxic'])
# pd.Series(sample_weights).unique()

In [48]:

def all_class_data():
    import gc
    gc.collect
    x_train, x_test, y_train, y_test = train_test_split(all_selected, Y_train, 
                                          test_size=0.3,
                                          train_size=0.7, 
                                          stratify=Y_train)
    return x_train, y_train, x_test, y_test
    

In [51]:
from sklearn.model_selection import StratifiedShuffleSplit
splitter = StratifiedShuffleSplit(test_size=0.3, train_size=0.7)

In [54]:
train_indices, test_indices = splitter.split(all_selected, Y_train)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [57]:
Y_train.sum(axis='rows')

toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64

In [153]:
def best_model(best_run, input_dim):
    hyperas_cat = 'toxic'
    params = best_run[0]
    import gc
    gc.collect
    keras.backend.clear_session()
    model = Sequential()
    model.add(Dense(256, 
                    activation='relu', 
                    kernel_regularizer=regularizers.l2(params['l2']),
                    bias_regularizer=regularizers.l2(params['l2_1']),
                    input_dim=input_dim
             )
           )
    
    model.add(Dropout(params['Dropout']))
    model.add(Dense(256, 
              activation='relu', 
              kernel_regularizer=regularizers.l2(params['l2_2']),
              bias_regularizer=regularizers.l2(params['l2_3']),
             )
           )
    model.add(Dropout(params['Dropout_1']))
    model.add(Dense(128, 
              activation='relu',
              kernel_regularizer=regularizers.l2(params['l2_4']),
              bias_regularizer=regularizers.l2(params['l2_5']),
             )
           )
    model.add(Dropout(params['Dropout_2']))
    model.add(Dense(128, 
              activation='relu',
              kernel_regularizer=regularizers.l2(params['l2_6']),
              bias_regularizer=regularizers.l2(params['l2_7']),
             )
           )
    model.add(Dropout(params['Dropout_3']))
    model.add(Dense(1,activation='hard_sigmoid',
                  kernel_regularizer=regularizers.l2(params['l2_8']),
                  bias_regularizer=regularizers.l2(params['l2_9']),
                   ))
    model.compile(loss=losses.binary_crossentropy, optimizer=optimizers.Adam(lr=params['lr']), metrics=[metrics.binary_accuracy])
    return model

In [58]:
# x_train, y_train, x_test, y_test = all_class_data()

In [47]:
model = best_model(best_run, all_selected.shape[1])

In [60]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               51200256  
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 128)               16512     
__________

In [91]:
def generate_clean_comment_column(datadf):
    return datadf.sum(axis='columns') == 0

In [92]:
Y_train['nontoxic'] = generate_clean_comment_column(Y_train)

In [93]:
sample_weight = compute_sample_weight('balanced', Y_train)

In [97]:
Y_train['nontoxic'].astype(int).sum()

143346

In [98]:
sample_weight[0] * 143346

1565.3782019373657

In [99]:
sample_weight[6]

31050.66887984273

In [129]:
def custom_compute_class_weights(y):
    cat_counts = y.sum(axis='rows')
    total_comments = y.shape[0]
    results = (total_comments / cat_counts).tolist()
    result_dict = dict()
    for i in range(len(results)):
        result_dict[i] = results[i]
    return result_dict
class_weight = custom_compute_class_weights(Y_train)

In [64]:
model.fit(x=all_selected, y=Y_train, 
          batch_size=256, 
          epochs=10, 
          class_weight=class_weight,
#           sample_weight=sample_weight,
#           validation_data = (x_test, y_test),
          verbose=0,
          callbacks=[TQDMNotebookCallback(leave_inner=True, leave_outer=True)]
         )




<keras.callbacks.History at 0x1d54b52a080>

In [154]:
model.fit(x=all_selected, y=Y_train[cats], 
          batch_size=256, 
          epochs=10, 
          class_weight=class_weight,
#           sample_weight=sample_weight,
#           validation_data = (x_test, y_test),
          verbose=0,
          callbacks=[TQDMNotebookCallback(leave_inner=True, leave_outer=True)]
         )

TypeError: Cannot interpret feed_dict key as Tensor: Tensor Tensor("dense_1_input:0", shape=(?, 200000), dtype=float32) is not an element of this graph.

In [None]:
model.fit(x=all_selected, y=Y_train[cats], 
          batch_size=256, 
          epochs=30, 
          class_weight=class_weight,
#           sample_weight=sample_weight,
#           validation_data = (x_test, y_test),
          verbose=0,
          callbacks=[TQDMNotebookCallback(leave_inner=True, leave_outer=True)]
         )

In [30]:
# score, acc = model.evaluate(x_test, y_test, verbose=0)

In [131]:
all_sel_preds = model.predict_proba(all_selected, batch_size=256, verbose=0)

In [134]:
print(classification_report(Y_train[cats], all_sel_preds >= 0.5))

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.78      0.62      0.69     15294
          1       0.53      0.26      0.35      1595
          2       0.66      0.66      0.66      8449
          3       0.00      0.00      0.00       478
          4       0.58      0.71      0.64      7877
          5       0.69      0.04      0.08      1405

avg / total       0.68      0.60      0.62     35098



In [235]:
from sklearn.feature_selection import SelectKBest
from keras.models import load_model
def train_model(cat_label, X_data, Y_data, epochs=5):
    skb = SelectKBest(score_func=f_classif, k=25000)
    sel = skb.fit_transform(X_data, Y_data[cat_label])
    model2 = best_model(best_run, sel.shape[1])
    class_weight = custom_compute_class_weights(Y_data)
    model2.fit(x=sel, y=Y_train[cat_label], 
          batch_size=256, 
          epochs=epochs, 
          class_weight=class_weight,
          verbose=0,
          callbacks=[TQDMNotebookCallback(leave_inner=True, leave_outer=True)]
         )
    return model2, skb

def generate_train_save_models(X_data, Y_data, epochs_per_model=5):
    os.makedirs("NN_MODELS", exist_ok=True)
    for cat in cats:
        mdl, skb = train_model(cat, X_data, Y_data, epochs=epochs_per_model)
        mdl.save("NN_MODELS/%s.mlp.P" % cat)
        with open("NN_MODELS/%s.sel.P" % cat, 'wb') as f:
            pickle.dump(skb, f)

def generate_dataframe_preds(X_data):
    result_df = pd.DataFrame()
    for cat in cats:
        mdl = load_model("NN_MODELS/%s.mlp.P" % cat)
        with open("NN_MODELS/%s.sel.P" % cat, 'rb') as f:
            skb = pickle.load(f)
        sel = skb.transform(X_data)
        preds = mdl.predict_proba(sel, batch_size=256, verbose=0)
#         print("prediction matrix shape:", preds.flatten().shape)
        result_df[cat] = preds.flatten()
    return result_df

In [179]:
model2=train_model('toxic', all_selected, Y_train)




In [181]:
sel_prds = model2[0].predict_proba(sel, batch_size=256, verbose=0)

In [182]:
print(classification_report(Y_train['toxic'], sel_prds >= 0.5))

             precision    recall  f1-score   support

      False       0.91      0.32      0.47    144277
       True       0.10      0.70      0.17     15294

avg / total       0.83      0.36      0.45    159571



In [190]:
model2=train_model('threat', all_selected, Y_train, epochs=30)




In [191]:
sel_prds = model2[0].predict_proba(sel, batch_size=256, verbose=0)

In [192]:
print(classification_report(Y_train['threat'], sel_prds >= 0.5))

             precision    recall  f1-score   support

      False       1.00      0.99      1.00    159093
       True       0.21      0.85      0.34       478

avg / total       1.00      0.99      0.99    159571



In [193]:
generate_train_save_models(all_selected, Y_train, epochs_per_model=30)



















In [204]:
pred_df = generate_dataframe_preds(all_selected)

prediction matrix shape: (159571,)
prediction matrix shape: (159571,)
prediction matrix shape: (159571,)
prediction matrix shape: (159571,)
prediction matrix shape: (159571,)
prediction matrix shape: (159571,)


In [205]:
pred_df

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.000000,0.000000,0.000000,0.0,0.000000,0.0
1,0.000000,0.000000,0.000000,0.0,0.000000,0.0
2,0.000000,0.000000,0.000000,0.0,0.000000,0.0
3,0.000000,0.000000,0.000000,0.0,0.000000,0.0
4,0.031259,0.000000,0.210464,0.0,0.455941,0.0
5,0.000000,0.000000,0.000000,0.0,0.000000,0.0
6,1.000000,1.000000,1.000000,0.0,1.000000,0.0
7,0.000000,0.000000,0.000000,0.0,0.000000,0.0
8,0.000000,0.000000,0.000000,0.0,0.000000,0.0
9,0.102250,0.000000,0.000000,0.0,0.000000,0.0


In [239]:
print(classification_report(Y_train[cats], pred_df >= 0.777))

             precision    recall  f1-score   support

          0       0.95      0.97      0.96     15294
          1       0.89      0.93      0.91      1595
          2       0.88      0.97      0.92      8449
          3       0.92      0.81      0.86       478
          4       0.90      0.94      0.92      7877
          5       0.90      0.83      0.86      1405

avg / total       0.92      0.96      0.94     35098



In [238]:
from sklearn.metrics import f1_score
for i in range(7760, 7780):
    value = (0.0001 * i)
    print("using %f" % value, f1_score(Y_train[cats], pred_df >=value , average='weighted'))

using 0.776000 0.9354714378508827
using 0.776100 0.9354839684060942
using 0.776200 0.9355099787363652
using 0.776300 0.9355079075679716
using 0.776400 0.9354928956126544
using 0.776500 0.9354637140818665
using 0.776600 0.9354637140818665
using 0.776700 0.9354637140818665
using 0.776800 0.9354487002619416
using 0.776900 0.9354737710289358
using 0.777000 0.9354872501880869
using 0.777100 0.935472657976924
using 0.777200 0.935472657976924
using 0.777300 0.9354861379686117
using 0.777400 0.9354711222837321
using 0.777500 0.935483984080429
using 0.777600 0.935481928397015
using 0.777700 0.935481928397015
using 0.777800 0.9354669107129268
using 0.777900 0.9354669107129268


In [233]:
with open(targdir/"test_selected.P", 'rb') as f:
    test_selected=pickle.load(f)

In [234]:
test_selected.shape

(153164, 200000)

In [236]:
test_preds_df = generate_dataframe_preds(test_selected)

prediction matrix shape: (153164,)
prediction matrix shape: (153164,)
prediction matrix shape: (153164,)
prediction matrix shape: (153164,)
prediction matrix shape: (153164,)
prediction matrix shape: (153164,)


In [240]:
test_preds_df.to_csv("nn_preds.csv")

In [242]:
test_output=pd.DataFrame()

In [243]:
test_output['id'] = test['id']

In [249]:
test_output = pd.concat((test_output, test_preds_df), axis='columns')

In [251]:
test_output.to_csv("mlp_preds.csv", index=False)

In [253]:
adj_test_preds_df = test_preds_df * (0.5 / 0.777)

In [258]:
adj_test_output=pd.DataFrame()
adj_test_output['id'] = test['id']
adj_test_output = pd.concat((adj_test_output, adj_test_preds_df), axis='columns')

In [259]:
adj_test_output.to_csv("adj_mlp_preds.csv", index=False)

In [260]:
adj_test_output

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.643501,0.0,0.643501,0.0,0.643501,0.000000
1,0000247867823ef7,0.000000,0.0,0.000000,0.0,0.000000,0.000000
2,00013b17ad220c46,0.000000,0.0,0.000000,0.0,0.000000,0.000000
3,00017563c3f7919a,0.000000,0.0,0.000000,0.0,0.000000,0.000000
4,00017695ad8997eb,0.000000,0.0,0.000000,0.0,0.000000,0.000000
5,0001ea8717f6de06,0.000000,0.0,0.000000,0.0,0.000000,0.000000
6,00024115d4cbde0f,0.000000,0.0,0.000000,0.0,0.000000,0.000000
7,000247e83dcc1211,0.643501,0.0,0.141356,0.0,0.000000,0.000000
8,00025358d4737918,0.000000,0.0,0.000000,0.0,0.000000,0.000000
9,00026d1092fe71cc,0.000000,0.0,0.000000,0.0,0.000000,0.000000


In [261]:
submission = pd.read_csv("SUBMIT/submission.csv")

In [267]:
ens = (submission[cats] *2 + adj_test_output[cats]) / 3

In [269]:
ens_test_output=pd.DataFrame()
ens_test_output['id'] = test['id']
ens_test_output = pd.concat((ens_test_output, ens), axis='columns')

In [272]:
ens_test_output.to_csv("SUBMIT/ens.csv", index=False)