In which I use extremely randomized trees. See here:
http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html

Also see: 
[GEW2006] P. Geurts, D. Ernst., and L. Wehenkel, “Extremely randomized trees”, Machine Learning, 63(1), 3-42, 2006.

In [7]:
#Import libraries:
import numpy as np
import pandas as pd
import xgboost as xgb
import time
#load data:
# train = pd.read_csv("train.csv")
# target = train['target']
# #drop targets & (unique row) IDs from training data
# train = train.drop(['ID','target'],axis=1)
# test = pd.read_csv("test.csv")
# IDs = test['ID'].values
# test = test.drop(['ID'],axis=1)

# PREPROCESSING

In [None]:
#https://www.kaggle.com/director/bnp-paribas-cardif-claims-management/simple-xgboost-0-46146/code
train = pd.read_csv("train.csv")
target = train['target']
train = train.drop(['ID','target'],axis=1)
test = pd.read_csv("test.csv")
ids = test['ID'].values
test = test.drop(['ID'],axis=1)
#

for (train_name, train_series), (test_name, test_series) in zip(train.iteritems(),test.iteritems()):
    if train_series.dtype == 'O':
        #for objects: factorize
        train[train_name], tmp_indexer = pd.factorize(train[train_name])
        test[test_name] = tmp_indexer.get_indexer(test[test_name])
        #but now we have -1 values (NaN)
    else:
        #for int or float: fill NaN
        tmp_len = len(train[train_series.isnull()])
        if tmp_len>0:
            train.loc[train_series.isnull(), train_name] = train_series.mean()
        #and Test
        tmp_len = len(test[test_series.isnull()])
        if tmp_len>0:
            test.loc[test_series.isnull(), test_name] = train_series.mean()  #TODO

In [None]:
# https://www.kaggle.com/c/bnp-paribas-cardif-claims-management/forums/t/19133/feature-engineering-for-beginners
#check this out: https://www.kaggle.com/c/bnp-paribas-cardif-claims-management/forums/t/19240/analysis-of-duplicate-variables-correlated-variables-large-post

A function to report best hyperparameters:
http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html

In [None]:
from operator import itemgetter
# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

# TESTING EXTRATREES CLASSIFIER APPROACH

In [None]:
#https://www.kaggle.com/yuhaichina/bnp-paribas-cardif-claims-management/extratreesclassifier-score-0-45911
#http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble

t0 = time.time()
X_train = train
X_test = test
extc = ExtraTreesClassifier(n_estimators=700,max_features= 50,criterion= 'entropy',min_samples_split= 5,
                            max_depth= 50, min_samples_leaf= 5)      

extc.fit(X_train,target) 
preds = extc.predict_proba(X_test)[:,1]
t1 = time.time()
total_time = t1 - t0
print total_time

In [None]:
import csv
predictions_file = open("extc.csv", "w")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["ID", "PredictedProb"])
open_file_object.writerows(zip(IDs, preds[:,1]))
predictions_file.close()

# RandomizedSearchCV on ETC parameter space

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn import metrics

t0 = time.time()
X_train = train
X_test = test

param_grid = {'max_depth': range(20,50),
                      'n_estimators': np.arange(500,1000,100),
#                         'n_estimators': [10],
                      'max_features' : np.arange(10,110,10),
                      'min_samples_split' : np.arange(1,6,1),
                        'min_samples_leaf' : np.arange(1,6,1),
                      'criterion' : ['entropy'],
                      #'scale_pos_weight': [0.5, 1]
                      #'model__eta':[0.01,0.02],
                     #'model__scale_pos_weight':[0.8,1.0]
                      #'model__silent':[1],
                      }



extc = ExtraTreesClassifier()

n_iter_search=50
random_search = RandomizedSearchCV(extc, param_distributions=param_grid,
                                   n_iter=n_iter_search, scoring ="log_loss")

random_search.fit( train , target)

t1 = time.time()
total_time = t1 - t0
print total_time

In [None]:
print report(random_search.grid_scores_)

In [None]:
params = {'min_samples_leaf': 2, 
         'n_estimators': 800, 
         'max_features': 50, 
         'criterion': 'entropy', 
         'min_samples_split': 2, 
         'max_depth': 29}
t0 = time.time()
extc2 = ExtraTreesClassifier(**params)
#OR
#extc2 = ExtraTreesClassifier(**random_search.best_params_)
extc2.fit(X_train,target) 
preds_2 = extc2.predict_proba(X_test)[:,1]
t1 = time.time()
total_time = t1 - t0
print total_time

In [None]:
import csv
predictions_file = open("extc2_rcv.csv", "w")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["ID", "PredictedProb"])
open_file_object.writerows(zip(ids, preds))
predictions_file.close()

This gave -logloss = 0.45450.

In [None]:
feat_import = extc.feature_importances_

In [None]:
print np.shape(feat_import)
print feat_import

# DO THE SAME NOW, INCLUDING A BIT MORE PREPROCESSING

Variation on extc benchmark. See here: 
https://www.kaggle.com/mujtabaasif/bnp-paribas-cardif-claims-management/extratrees

In [None]:
#this is the benchmark:
import pandas as pd
import numpy as np
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble


print('Load data...')
train = pd.read_csv("train.csv")
target = train['target'].values
train = train.drop(['ID','target','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)
test = pd.read_csv("test.csv")
id_test = test['ID'].values
test = test.drop(['ID','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)

print('Clearing...')
for (train_name, train_series), (test_name, test_series) in zip(train.iteritems(),test.iteritems()):
    if train_series.dtype == 'O':
        #for objects: factorize
        train[train_name], tmp_indexer = pd.factorize(train[train_name])
        test[test_name] = tmp_indexer.get_indexer(test[test_name])
        #but now we have -1 values (NaN)
    else:
        #for int or float: fill NaN
        tmp_len = len(train[train_series.isnull()])
        if tmp_len>0:
            #print "mean", train_series.mean()
            train.loc[train_series.isnull(), train_name] = -999 
        #and Test
        tmp_len = len(test[test_series.isnull()])
        if tmp_len>0:
            test.loc[test_series.isnull(), test_name] = -999

X_train = train
X_test = test
print('Training...')
extc = ExtraTreesClassifier(n_estimators=850,max_features= 60,criterion= 'entropy',min_samples_split= 4,
                            max_depth= 40, min_samples_leaf= 2, n_jobs = -1)      

extc.fit(X_train,target) 

print('Predict...')
y_pred = extc.predict_proba(X_test)
#print y_pred

pd.DataFrame({"ID": id_test, "PredictedProb": y_pred[:,1]}).to_csv('extra_trees.csv',index=False)

In [None]:
import pandas as pd
import numpy as np
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble

print('Load data...')
train = pd.read_csv("train.csv")
target = train['target'].values
train = train.drop(['ID','target','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)
test = pd.read_csv("test.csv")
id_test = test['ID'].values
test = test.drop(['ID','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)

print('Clearing...')

for (train_name, train_series), (test_name, test_series) in zip(train.iteritems(),test.iteritems()):
    if train_series.dtype == 'O':
        #for objects: factorize
        train[train_name], tmp_indexer = pd.factorize(train[train_name])
        test[test_name] = tmp_indexer.get_indexer(test[test_name])
        #but now we have -1 values (NaN)
    else:
        #for int or float: fill NaN
        tmp_len = len(train[train_series.isnull()])
        if tmp_len>0:
            train.loc[train_series.isnull(), train_name] = train_series.mean()
        #and Test
        tmp_len = len(test[test_series.isnull()])
        if tmp_len>0:
            test.loc[test_series.isnull(), test_name] = train_series.mean()
            
X_train = train
X_test = test
print('Training...')
extc = ExtraTreesClassifier(n_estimators=850,max_features= 60,criterion= 'entropy',min_samples_split= 4,
                            max_depth= 40, min_samples_leaf= 2, n_jobs = -1)      

extc.fit(X_train,target) 

print('Predict...')
y_pred = extc.predict_proba(X_test)
#print y_pred

pd.DataFrame({"ID": id_test, "PredictedProb": y_pred[:,1]}).to_csv('extra_trees_impute.csv',index=False)

OK now, using the preprocessing above, let's use RandomizedSearchCV!

In [None]:
import pandas as pd
import numpy as np
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble

import time
from sklearn.grid_search import RandomizedSearchCV
from sklearn import metrics

t0 = time.time()

print('Load data...')
train = pd.read_csv("train.csv")
target = train['target'].values
train = train.drop(['ID','target','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)
test = pd.read_csv("test.csv")
id_test = test['ID'].values
test = test.drop(['ID','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)

print('Cleaning...')

for (train_name, train_series), (test_name, test_series) in zip(train.iteritems(),test.iteritems()):
    if train_series.dtype == 'O':
        #for objects: factorize
        train[train_name], tmp_indexer = pd.factorize(train[train_name])
        test[test_name] = tmp_indexer.get_indexer(test[test_name])
        #but now we have -1 values (NaN)
    else:
        #for int or float: fill NaN
        tmp_len = len(train[train_series.isnull()])
        if tmp_len>0:
            train.loc[train_series.isnull(), train_name] = train_series.mean()
        #and Test
        tmp_len = len(test[test_series.isnull()])
        if tmp_len>0:
            test.loc[test_series.isnull(), test_name] = train_series.mean()
X_train = train
X_test = test


print('Training...')

param_grid = {'n_estimators': [850],
#                         'n_estimators': [10],
                      'max_features' : np.arange(50,90,5),
                      'min_samples_split' : np.arange(2,5,1),
                        'min_samples_leaf' : np.arange(1,4,1),
                      'criterion' : ['entropy'],
                      #'scale_pos_weight': [0.5, 1]
                      #'model__eta':[0.01,0.02],
                     #'model__scale_pos_weight':[0.8,1.0]
                      #'model__silent':[1],
                      }



extc = ExtraTreesClassifier(verbose = 10 )

n_iter_search= 15
random_search = RandomizedSearchCV(extc, param_distributions=param_grid,
                                   n_iter=n_iter_search, scoring ="log_loss" , n_jobs = -2,
                                  verbose = 10)

random_search.fit( X_train , target)

t1 = time.time()
total_time = t1 - t0
print total_time

In [3]:
from operator import itemgetter
# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

In [2]:
print report(random_search.grid_scores_)

NameError: name 'report' is not defined

In [None]:
#https://www.kaggle.com/scirpus/bnp-paribas-cardif-claims-management/benouilli-naive-bayes/code
#https://www.kaggle.com/chabir/bnp-paribas-cardif-claims-management/extratreesclassifier-score-0-45-v5/discussion
random_search.best_params_

In [None]:
t0 = time.time()
# extc2 = ExtraTreesClassifier(**params)
#OR
extc2pp = ExtraTreesClassifier(**random_search.best_params_)
extc2pp.fit(X_train,target) 
preds_2 = extc2pp.predict_proba(X_test)[:,1]
t1 = time.time()
total_time = t1 - t0
print total_time


In [None]:
import csv
predictions_file = open("extc_pp.csv", "w")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["ID", "PredictedProb"])
open_file_object.writerows(zip(ids, preds_2))
predictions_file.close()

this was not best submission:  logloss = - 0.45441. Probably because max_depth = None

In [1]:
import pandas as pd
import numpy as np
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble

import time
from sklearn.grid_search import RandomizedSearchCV
from sklearn import metrics

t0 = time.time()

print('Load data...')
train = pd.read_csv("train.csv")
target = train['target'].values
train = train.drop(['ID','target','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)
test = pd.read_csv("test.csv")
id_test = test['ID'].values
test = test.drop(['ID','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)

print('Cleaning...')

for (train_name, train_series), (test_name, test_series) in zip(train.iteritems(),test.iteritems()):
    if train_series.dtype == 'O':
        #for objects: factorize
        train[train_name], tmp_indexer = pd.factorize(train[train_name])
        test[test_name] = tmp_indexer.get_indexer(test[test_name])
        #but now we have -1 values (NaN)
    else:
        #for int or float: fill NaN
        tmp_len = len(train[train_series.isnull()])
        if tmp_len>0:
            train.loc[train_series.isnull(), train_name] = train_series.mean()
        #and Test
        tmp_len = len(test[test_series.isnull()])
        if tmp_len>0:
            test.loc[test_series.isnull(), test_name] = train_series.mean()
X_train = train
X_test = test


print('Training...')

param_grid = {      'max_depth' : range(40,51) ,
                    'n_estimators': [500],
#                         'n_estimators': [10],
                      'max_features' : np.arange(50,90,5),
                      'min_samples_split' : np.arange(2,5,1),
                        'min_samples_leaf' : np.arange(1,4,1),
                      'criterion' : ['entropy'],
                      #'scale_pos_weight': [0.5, 1]
                      #'model__eta':[0.01,0.02],
                     #'model__scale_pos_weight':[0.8,1.0]
                      #'model__silent':[1],
                      }



extc = ExtraTreesClassifier(verbose = 10 )

n_iter_search= 15
random_search = RandomizedSearchCV(extc, param_distributions=param_grid,
                                   n_iter=n_iter_search, scoring ="log_loss" , n_jobs = -2,
                                  verbose = 10)

random_search.fit( X_train , target)

t1 = time.time()
total_time = t1 - t0
print total_time

Load data...
Cleaning...
Training...
Fitting 3 folds for each of 15 candidates, totalling 45 fits


[Parallel(n_jobs=-2)]: Done   2 tasks      | elapsed: 16.2min
[Parallel(n_jobs=-2)]: Done   7 tasks      | elapsed: 46.3min
[Parallel(n_jobs=-2)]: Done  12 tasks      | elapsed: 62.2min
[Parallel(n_jobs=-2)]: Done  19 tasks      | elapsed: 96.0min
[Parallel(n_jobs=-2)]: Done  26 tasks      | elapsed: 128.2min
[Parallel(n_jobs=-2)]: Done  35 tasks      | elapsed: 172.2min
[Parallel(n_jobs=-2)]: Done  46 out of  45 | elapsed: 205.2min remaining:  -267.7s
[Parallel(n_jobs=-2)]: Done  46 out of  45 | elapsed: 205.3min remaining:  -267.7s
[Parallel(n_jobs=-2)]: Done  46 out of  45 | elapsed: 213.7min remaining:  -278.8s
[Parallel(n_jobs=-2)]: Done  46 out of  45 | elapsed: 215.7min remaining:  -281.3s
[Parallel(n_jobs=-2)]: Done  46 out of  45 | elapsed: 215.7min remaining:  -281.3s
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed: 215.7min finished
[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:    2.6s
[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:    2.7s
[Parallel(n_job

building tree 1 of 500
building tree 2 of 500[CV] min_samples_leaf=3, n_estimators=500, min_samples_split=2, criterion=entropy, max_features=80, max_depth=49 
[CV] min_samples_leaf=3, n_estimators=500, min_samples_split=2, criterion=entropy, max_features=80, max_depth=49 
[CV] min_samples_leaf=3, n_estimators=500, min_samples_split=2, criterion=entropy, max_features=80, max_depth=49 
building tree 1 of 500building tree 1 of 500building tree 1 of 500


building tree 2 of 500building tree 2 of 500building tree 2 of 500


building tree 3 of 500building tree 3 of 500building tree 3 of 500


building tree 4 of 500building tree 4 of 500building tree 4 of 500


building tree 5 of 500building tree 5 of 500building tree 5 of 500


building tree 6 of 500building tree 6 of 500building tree 6 of 500


building tree 7 of 500building tree 7 of 500building tree 7 of 500


building tree 8 of 500building tree 8 of 500building tree 8 of 500


building tree 9 of 500building tree 9 of 500building tree 9 o

[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:    1.2s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:    5.1s
[Parallel(n_jobs=1)]: Done 312 tasks       | elapsed:  8.2min
[Parallel(n_jobs=1)]: Done 312 tasks       | elapsed:  8.3min
[Parallel(n_jobs=1)]: Done 337 tasks       | elapsed:  9.2min
[Parallel(n_jobs=1)]: Done 337 tasks       | elapsed:  9.0min
[Parallel(n_jobs=1)]: Done 337 tasks       | elapsed:  9.2min
[Parallel(n_jobs=1)]: Done 364 tasks       | elapsed:  9.9min
[Parallel(n_jobs=1)]: Done 364 tasks       | elapsed:  9.7min
[Parallel(n_jobs=1)]: Done 364 tasks       | elapsed:  9.9min
[Parallel(n_jobs=1)]: Done 391 tasks       | elapsed: 10.6min
[Parallel(n_jobs=1)]: Done 391 tasks       | elapsed: 10.4min
[Parallel(n_jobs=1)]: Done 391 tasks       | elapsed: 10.6min
[Parallel(n_jobs=1)]: Done 420 tasks       | elapsed: 11.4min
[Parallel(n_jobs=1)]: Done 420 tasks       | elapsed: 11.2min
[Parallel(n_jobs=1)]: Done 420 tasks       | elapsed: 11.3min
[Paralle


building tree 5 of 500


building tree 129 of 500building tree 129 of 500building tree 129 of 500


building tree 130 of 500building tree 130 of 500building tree 130 of 500


building tree 131 of 500building tree 131 of 500building tree 131 of 500


building tree 132 of 500building tree 132 of 500building tree 132 of 500


building tree 133 of 500building tree 133 of 500building tree 133 of 500


building tree 134 of 500building tree 134 of 500building tree 134 of 500


building tree 135 of 500building tree 135 of 500building tree 135 of 500


building tree 136 of 500building tree 136 of 500building tree 136 of 500


building tree 137 of 500building tree 137 of 500building tree 137 of 500


building tree 138 of 500building tree 138 of 500building tree 138 of 500


building tree 139 of 500building tree 139 of 500building tree 139 of 500


building tree 140 of 500building tree 140 of 500building tree 140 of 500


building tree 141 of 500building tree 141 of 500building tree 141 of 500



[Parallel(n_jobs=1)]: Done   7 tasks       | elapsed:    8.8s
[Parallel(n_jobs=1)]: Done  12 tasks       | elapsed:   15.2s
[Parallel(n_jobs=1)]: Done 144 tasks       | elapsed:    3.2s
[Parallel(n_jobs=1)]: Done 161 tasks       | elapsed:    3.7s
[Parallel(n_jobs=1)]: Done 161 tasks       | elapsed:    3.7s
[Parallel(n_jobs=1)]: Done 161 tasks       | elapsed:    3.6s
[Parallel(n_jobs=1)]: Done 180 tasks       | elapsed:    4.1s
[Parallel(n_jobs=1)]: Done 180 tasks       | elapsed:    4.1s
[Parallel(n_jobs=1)]: Done 180 tasks       | elapsed:    4.0s
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:    4.5s
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:    4.5s
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:    4.5s
[Parallel(n_jobs=1)]: Done 220 tasks       | elapsed:    4.9s
[Parallel(n_jobs=1)]: Done 220 tasks       | elapsed:    5.0s
[Parallel(n_jobs=1)]: Done 220 tasks       | elapsed:    4.9s
[Parallel(n_jobs=1)]: Done 241 tasks       | elapsed:    5.4s
[Paralle


building tree 13 of 500
building tree 470 of 500building tree 470 of 500building tree 470 of 500


building tree 471 of 500building tree 471 of 500building tree 471 of 500


building tree 472 of 500building tree 472 of 500building tree 472 of 500


building tree 473 of 500building tree 473 of 500building tree 473 of 500


building tree 474 of 500building tree 474 of 500building tree 474 of 500


building tree 475 of 500building tree 475 of 500building tree 475 of 500


building tree 476 of 500building tree 476 of 500building tree 476 of 500


building tree 477 of 500building tree 477 of 500building tree 477 of 500


building tree 478 of 500building tree 478 of 500building tree 478 of 500


building tree 479 of 500building tree 479 of 500building tree 479 of 500


building tree 480 of 500building tree 480 of 500building tree 480 of 500


building tree 481 of 500building tree 481 of 500building tree 481 of 500


building tree 482 of 500building tree 482 of 500building tree 482 of 500




[Parallel(n_jobs=1)]: Done  17 tasks       | elapsed:   21.6s
[Parallel(n_jobs=1)]: Done  24 tasks       | elapsed:   30.4s
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:  1.3min
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:  1.3min
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:  1.3min
[Parallel(n_jobs=1)]: Done  60 tasks       | elapsed:  1.6min
[Parallel(n_jobs=1)]: Done  60 tasks       | elapsed:  1.5min
[Parallel(n_jobs=1)]: Done  60 tasks       | elapsed:  1.6min
[Parallel(n_jobs=1)]: Done  71 tasks       | elapsed:  1.8min
[Parallel(n_jobs=1)]: Done  71 tasks       | elapsed:  1.8min
[Parallel(n_jobs=1)]: Done  71 tasks       | elapsed:  1.9min
[Parallel(n_jobs=1)]: Done  84 tasks       | elapsed:  2.2min
[Parallel(n_jobs=1)]: Done  84 tasks       | elapsed:  2.2min
[Parallel(n_jobs=1)]: Done  84 tasks       | elapsed:  2.2min
[Parallel(n_jobs=1)]: Done  97 tasks       | elapsed:  2.5min
[Parallel(n_jobs=1)]: Done  97 tasks       | elapsed:  2.5min
[Paralle


building tree 25 of 500
building tree 26 of 500
building tree 27 of 500
building tree 28 of 500
building tree 29 of 500
building tree 30 of 500
building tree 31 of 500
building tree 32 of 500
building tree 33 of 500
building tree 34 of 500
building tree 35 of 500
building tree 36 of 500
building tree 37 of 500
building tree 38 of 500
building tree 39 of 500
building tree 40 of 500

[Parallel(n_jobs=1)]: Done  31 tasks       | elapsed:   39.2s
[Parallel(n_jobs=1)]: Done  40 tasks       | elapsed:   50.6s
[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done   7 tasks       | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done   7 tasks       | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done   7 tasks       | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done  12 tasks       | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done  12 tasks       | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done  12 tasks       | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done  17 tasks       | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done  17 tasks       | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done  17 tasks       | elapsed:    0.4s
[Paralle


building tree 41 of 500
building tree 42 of 500
building tree 43 of 500
building tree 44 of 500
building tree 45 of 500
building tree 46 of 500
building tree 47 of 500
building tree 48 of 500
building tree 49 of 500
building tree 50 of 500
building tree 51 of 500
building tree 52 of 500
building tree 53 of 500
building tree 54 of 500
building tree 55 of 500
building tree 56 of 500
building tree 57 of 500
building tree 58 of 500
building tree 59 of 500
building tree 60 of 500

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:  1.0min
[Parallel(n_jobs=1)]: Done  60 tasks       | elapsed:  1.3min
[Parallel(n_jobs=1)]: Done 312 tasks       | elapsed:    6.7s
[Parallel(n_jobs=1)]: Done 337 tasks       | elapsed:    7.3s
[Parallel(n_jobs=1)]: Done 337 tasks       | elapsed:    7.5s
[Parallel(n_jobs=1)]: Done 337 tasks       | elapsed:    7.3s
[Parallel(n_jobs=1)]: Done 364 tasks       | elapsed:    7.8s
[Parallel(n_jobs=1)]: Done 364 tasks       | elapsed:    8.1s
[Parallel(n_jobs=1)]: Done 364 tasks       | elapsed:    7.9s
[Parallel(n_jobs=1)]: Done 391 tasks       | elapsed:    8.5s
[Parallel(n_jobs=1)]: Done 391 tasks       | elapsed:    8.7s
[Parallel(n_jobs=1)]: Done 391 tasks       | elapsed:    8.5s
[Parallel(n_jobs=1)]: Done 420 tasks       | elapsed:    9.1s
[Parallel(n_jobs=1)]: Done 420 tasks       | elapsed:    9.3s
[Parallel(n_jobs=1)]: Done 420 tasks       | elapsed:    9.0s
[Parallel(n_jobs=1)]: Done 449 tasks       | elapsed:    9.7s
[Paralle


building tree 61 of 500
building tree 62 of 500
building tree 63 of 500
building tree 64 of 500
building tree 65 of 500
building tree 66 of 500
building tree 67 of 500
building tree 68 of 500
building tree 69 of 500
building tree 70 of 500
building tree 71 of 500
building tree 72 of 500
building tree 73 of 500
building tree 74 of 500
building tree 75 of 500
building tree 76 of 500
building tree 77 of 500
building tree 78 of 500
building tree 79 of 500
building tree 80 of 500
building tree 81 of 500
building tree 82 of 500
building tree 83 of 500
building tree 84 of 500

[Parallel(n_jobs=1)]: Done  71 tasks       | elapsed:  1.5min
[Parallel(n_jobs=1)]: Done  84 tasks       | elapsed:  1.8min
[Parallel(n_jobs=1)]: Done 161 tasks       | elapsed:  4.7min
[Parallel(n_jobs=1)]: Done 161 tasks       | elapsed:  4.3min
[Parallel(n_jobs=1)]: Done 161 tasks       | elapsed:  4.7min
[Parallel(n_jobs=1)]: Done 180 tasks       | elapsed:  5.2min
[Parallel(n_jobs=1)]: Done 180 tasks       | elapsed:  4.9min
[Parallel(n_jobs=1)]: Done 180 tasks       | elapsed:  5.2min
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:  5.7min
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:  5.5min
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:  5.7min
[Parallel(n_jobs=1)]: Done 220 tasks       | elapsed:  6.3min
[Parallel(n_jobs=1)]: Done 220 tasks       | elapsed:  6.1min
[Parallel(n_jobs=1)]: Done 220 tasks       | elapsed:  6.3min
[Parallel(n_jobs=1)]: Done 241 tasks       | elapsed:  6.9min
[Parallel(n_jobs=1)]: Done 241 tasks       | elapsed:  6.6min
[Paralle


building tree 85 of 500
building tree 86 of 500
building tree 87 of 500
building tree 88 of 500
building tree 89 of 500
building tree 90 of 500
building tree 91 of 500
building tree 92 of 500
building tree 93 of 500
building tree 94 of 500
building tree 95 of 500
building tree 96 of 500
building tree 97 of 500
building tree 98 of 500
building tree 99 of 500
building tree 100 of 500
building tree 101 of 500
building tree 102 of 500
building tree 103 of 500
building tree 104 of 500
building tree 105 of 500
building tree 106 of 500
building tree 107 of 500
building tree 108 of 500
building tree 109 of 500
building tree 110 of 500
building tree 111 of 500
building tree 112 of 500

[Parallel(n_jobs=1)]: Done  97 tasks       | elapsed:  2.0min
[Parallel(n_jobs=1)]: Done 112 tasks       | elapsed:  2.4min
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    1.1s
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    1.0s
[Parallel(n_jobs=1)]: Done  60 tasks       | elapsed:    1.4s
[Parallel(n_jobs=1)]: Done  60 tasks       | elapsed:    1.4s
[Parallel(n_jobs=1)]: Done  60 tasks       | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done  71 tasks       | elapsed:    1.7s
[Parallel(n_jobs=1)]: Done  71 tasks       | elapsed:    1.6s
[Parallel(n_jobs=1)]: Done  71 tasks       | elapsed:    1.5s
[Parallel(n_jobs=1)]: Done  84 tasks       | elapsed:    2.0s
[Parallel(n_jobs=1)]: Done  84 tasks       | elapsed:    1.9s
[Parallel(n_jobs=1)]: Done  84 tasks       | elapsed:    1.8s
[Parallel(n_jobs=1)]: Done  97 tasks       | elapsed:    2.2s
[Parallel(n_jobs=1)]: Done  97 tasks       | elapsed:    2.2s
[Parallel(n_jobs=1)]: Done  97 tasks       | elapsed:    2.0s
[Paralle


building tree 113 of 500
building tree 114 of 500
building tree 115 of 500
building tree 116 of 500
building tree 117 of 500
building tree 118 of 500
building tree 119 of 500
building tree 120 of 500
building tree 121 of 500
building tree 122 of 500
building tree 123 of 500
building tree 124 of 500
building tree 125 of 500
building tree 126 of 500
building tree 127 of 500
building tree 128 of 500
building tree 129 of 500
building tree 130 of 500
building tree 131 of 500
building tree 132 of 500
building tree 133 of 500
building tree 134 of 500
building tree 135 of 500
building tree 136 of 500
building tree 137 of 500
building tree 138 of 500
building tree 139 of 500
building tree 140 of 500
building tree 141 of 500
building tree 142 of 500
building tree 143 of 500
building tree 144 of 500

[Parallel(n_jobs=1)]: Done 127 tasks       | elapsed:  2.7min
[Parallel(n_jobs=1)]: Done 144 tasks       | elapsed:  3.0min
[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:    1.7s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:    6.8s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:    6.5s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:    6.8s
[Parallel(n_jobs=1)]: Done   7 tasks       | elapsed:   12.1s
[Parallel(n_jobs=1)]: Done   7 tasks       | elapsed:   11.6s
[Parallel(n_jobs=1)]: Done   7 tasks       | elapsed:   12.0s
[Parallel(n_jobs=1)]: Done  12 tasks       | elapsed:   20.8s
[Parallel(n_jobs=1)]: Done  12 tasks       | elapsed:   19.7s
[Parallel(n_jobs=1)]: Done  12 tasks       | elapsed:   20.6s
[Parallel(n_jobs=1)]: Done  17 tasks       | elapsed:   29.5s
[Parallel(n_jobs=1)]: Done  17 tasks       | elapsed:   28.1s
[Parallel(n_jobs=1)]: Done  17 tasks       | elapsed:   28.9s
[Parallel(n_jobs=1)]: Done  24 tasks       | elapsed:   41.0s
[Paralle


building tree 145 of 500
building tree 146 of 500
building tree 147 of 500
building tree 148 of 500
building tree 149 of 500
building tree 150 of 500
building tree 151 of 500
building tree 152 of 500
building tree 153 of 500
building tree 154 of 500
building tree 155 of 500
building tree 156 of 500
building tree 157 of 500
building tree 158 of 500
building tree 159 of 500
building tree 160 of 500
building tree 161 of 500
building tree 162 of 500
building tree 163 of 500
building tree 164 of 500
building tree 165 of 500
building tree 166 of 500
building tree 167 of 500
building tree 168 of 500
building tree 169 of 500
building tree 170 of 500
building tree 171 of 500
building tree 172 of 500
building tree 173 of 500
building tree 174 of 500
building tree 175 of 500
building tree 176 of 500
building tree 177 of 500
building tree 178 of 500
building tree 179 of 500
building tree 180 of 500

[Parallel(n_jobs=1)]: Done 161 tasks       | elapsed:  3.4min
[Parallel(n_jobs=1)]: Done 180 tasks       | elapsed:  3.8min
[Parallel(n_jobs=1)]: Done 337 tasks       | elapsed: 12.4min
[Parallel(n_jobs=1)]: Done 337 tasks       | elapsed: 11.6min
[Parallel(n_jobs=1)]: Done 337 tasks       | elapsed: 12.4min
[Parallel(n_jobs=1)]: Done 364 tasks       | elapsed: 13.3min
[Parallel(n_jobs=1)]: Done 364 tasks       | elapsed: 12.7min
[Parallel(n_jobs=1)]: Done 364 tasks       | elapsed: 13.4min
[Parallel(n_jobs=1)]: Done 391 tasks       | elapsed: 14.3min
[Parallel(n_jobs=1)]: Done 391 tasks       | elapsed: 13.7min
[Parallel(n_jobs=1)]: Done 391 tasks       | elapsed: 14.4min
[Parallel(n_jobs=1)]: Done 420 tasks       | elapsed: 15.4min
[Parallel(n_jobs=1)]: Done 420 tasks       | elapsed: 14.8min
[Parallel(n_jobs=1)]: Done 420 tasks       | elapsed: 15.5min
[Parallel(n_jobs=1)]: Done 449 tasks       | elapsed: 16.4min
[Parallel(n_jobs=1)]: Done 449 tasks       | elapsed: 15.8min
[Paralle


building tree 181 of 500
building tree 182 of 500
building tree 183 of 500
building tree 184 of 500
building tree 185 of 500
building tree 186 of 500
building tree 187 of 500
building tree 188 of 500
building tree 189 of 500
building tree 190 of 500
building tree 191 of 500
building tree 192 of 500
building tree 193 of 500
building tree 194 of 500
building tree 195 of 500
building tree 196 of 500
building tree 197 of 500
building tree 198 of 500
building tree 199 of 500
building tree 200 of 500
building tree 201 of 500
building tree 202 of 500
building tree 203 of 500
building tree 204 of 500
building tree 205 of 500
building tree 206 of 500
building tree 207 of 500
building tree 208 of 500
building tree 209 of 500
building tree 210 of 500
building tree 211 of 500
building tree 212 of 500
building tree 213 of 500
building tree 214 of 500
building tree 215 of 500
building tree 216 of 500
building tree 217 of 500
building tree 218 of 500
building tree 219 of 500
building tree 220 of 500

[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:  4.2min
[Parallel(n_jobs=1)]: Done 220 tasks       | elapsed:  4.6min
[Parallel(n_jobs=1)]: Done 161 tasks       | elapsed:    3.7s
[Parallel(n_jobs=1)]: Done 161 tasks       | elapsed:    3.7s
[Parallel(n_jobs=1)]: Done 180 tasks       | elapsed:    4.1s
[Parallel(n_jobs=1)]: Done 180 tasks       | elapsed:    4.1s
[Parallel(n_jobs=1)]: Done 180 tasks       | elapsed:    4.2s
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:    4.5s
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:    4.5s
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed:    4.6s
[Parallel(n_jobs=1)]: Done 220 tasks       | elapsed:    5.0s
[Parallel(n_jobs=1)]: Done 220 tasks       | elapsed:    5.0s
[Parallel(n_jobs=1)]: Done 220 tasks       | elapsed:    5.1s
[Parallel(n_jobs=1)]: Done 241 tasks       | elapsed:    5.4s
[Parallel(n_jobs=1)]: Done 241 tasks       | elapsed:    5.5s
[Parallel(n_jobs=1)]: Done 241 tasks       | elapsed:    5.5s
[Paralle


building tree 221 of 500
building tree 222 of 500
building tree 223 of 500
building tree 224 of 500
building tree 225 of 500
building tree 226 of 500
building tree 227 of 500
building tree 228 of 500
building tree 229 of 500
building tree 230 of 500
building tree 231 of 500
building tree 232 of 500
building tree 233 of 500
building tree 234 of 500
building tree 235 of 500
building tree 236 of 500
building tree 237 of 500
building tree 238 of 500
building tree 239 of 500
building tree 240 of 500
building tree 241 of 500
building tree 242 of 500
building tree 243 of 500
building tree 244 of 500
building tree 245 of 500
building tree 246 of 500
building tree 247 of 500
building tree 248 of 500
building tree 249 of 500
building tree 250 of 500
building tree 251 of 500
building tree 252 of 500
building tree 253 of 500
building tree 254 of 500
building tree 255 of 500
building tree 256 of 500
building tree 257 of 500
building tree 258 of 500
building tree 259 of 500
building tree 260 of 500

[Parallel(n_jobs=1)]: Done 241 tasks       | elapsed:  5.1min
[Parallel(n_jobs=1)]: Done 264 tasks       | elapsed:  5.5min



building tree 265 of 500
building tree 266 of 500
building tree 267 of 500
building tree 268 of 500
building tree 269 of 500
building tree 270 of 500
building tree 271 of 500
building tree 272 of 500
building tree 273 of 500
building tree 274 of 500
building tree 275 of 500
building tree 276 of 500
building tree 277 of 500
building tree 278 of 500
building tree 279 of 500
building tree 280 of 500
building tree 281 of 500
building tree 282 of 500
building tree 283 of 500
building tree 284 of 500
building tree 285 of 500
building tree 286 of 500
building tree 287 of 500
building tree 288 of 500
building tree 289 of 500
building tree 290 of 500
building tree 291 of 500
building tree 292 of 500
building tree 293 of 500
building tree 294 of 500
building tree 295 of 500
building tree 296 of 500
building tree 297 of 500
building tree 298 of 500
building tree 299 of 500
building tree 300 of 500
building tree 301 of 500
building tree 302 of 500
building tree 303 of 500
building tree 304 of 500

[Parallel(n_jobs=1)]: Done 287 tasks       | elapsed:  6.0min
[Parallel(n_jobs=1)]: Done 312 tasks       | elapsed:  6.6min



building tree 313 of 500
building tree 314 of 500
building tree 315 of 500
building tree 316 of 500
building tree 317 of 500
building tree 318 of 500
building tree 319 of 500
building tree 320 of 500
building tree 321 of 500
building tree 322 of 500
building tree 323 of 500
building tree 324 of 500
building tree 325 of 500
building tree 326 of 500
building tree 327 of 500
building tree 328 of 500
building tree 329 of 500
building tree 330 of 500
building tree 331 of 500
building tree 332 of 500
building tree 333 of 500
building tree 334 of 500
building tree 335 of 500
building tree 336 of 500
building tree 337 of 500
building tree 338 of 500
building tree 339 of 500
building tree 340 of 500
building tree 341 of 500
building tree 342 of 500
building tree 343 of 500
building tree 344 of 500
building tree 345 of 500
building tree 346 of 500
building tree 347 of 500
building tree 348 of 500
building tree 349 of 500
building tree 350 of 500
building tree 351 of 500
building tree 352 of 500

[Parallel(n_jobs=1)]: Done 337 tasks       | elapsed:  7.1min
[Parallel(n_jobs=1)]: Done 364 tasks       | elapsed:  7.6min



building tree 365 of 500
building tree 366 of 500
building tree 367 of 500
building tree 368 of 500
building tree 369 of 500
building tree 370 of 500
building tree 371 of 500
building tree 372 of 500
building tree 373 of 500
building tree 374 of 500
building tree 375 of 500
building tree 376 of 500
building tree 377 of 500
building tree 378 of 500
building tree 379 of 500
building tree 380 of 500
building tree 381 of 500
building tree 382 of 500
building tree 383 of 500
building tree 384 of 500
building tree 385 of 500
building tree 386 of 500
building tree 387 of 500
building tree 388 of 500
building tree 389 of 500
building tree 390 of 500
building tree 391 of 500
building tree 392 of 500
building tree 393 of 500
building tree 394 of 500
building tree 395 of 500
building tree 396 of 500
building tree 397 of 500
building tree 398 of 500
building tree 399 of 500
building tree 400 of 500
building tree 401 of 500
building tree 402 of 500
building tree 403 of 500
building tree 404 of 500

[Parallel(n_jobs=1)]: Done 391 tasks       | elapsed:  8.2min
[Parallel(n_jobs=1)]: Done 420 tasks       | elapsed:  8.8min



building tree 421 of 500
building tree 422 of 500
building tree 423 of 500
building tree 424 of 500
building tree 425 of 500
building tree 426 of 500
building tree 427 of 500
building tree 428 of 500
building tree 429 of 500
building tree 430 of 500
building tree 431 of 500
building tree 432 of 500
building tree 433 of 500
building tree 434 of 500
building tree 435 of 500
building tree 436 of 500
building tree 437 of 500
building tree 438 of 500
building tree 439 of 500
building tree 440 of 500
building tree 441 of 500
building tree 442 of 500
building tree 443 of 500
building tree 444 of 500
building tree 445 of 500
building tree 446 of 500
building tree 447 of 500
building tree 448 of 500
building tree 449 of 500
building tree 450 of 500
building tree 451 of 500
building tree 452 of 500
building tree 453 of 500
building tree 454 of 500
building tree 455 of 500
building tree 456 of 500
building tree 457 of 500
building tree 458 of 500
building tree 459 of 500
building tree 460 of 500

[Parallel(n_jobs=1)]: Done 449 tasks       | elapsed:  9.4min
[Parallel(n_jobs=1)]: Done 480 tasks       | elapsed: 10.1min



building tree 481 of 500
building tree 482 of 500
building tree 483 of 500
building tree 484 of 500
building tree 485 of 500
building tree 486 of 500
building tree 487 of 500
building tree 488 of 500
building tree 489 of 500
building tree 490 of 500
building tree 491 of 500
building tree 492 of 500
building tree 493 of 500
building tree 494 of 500
building tree 495 of 500
building tree 496 of 500
building tree 497 of 500
building tree 498 of 500
building tree 499 of 500
building tree 500 of 500
13582.8807368


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed: 10.5min finished


In [4]:
print report(random_search.grid_scores_)

Model with rank: 1
Mean validation score: -0.461 (std: 0.001)
Parameters: {'min_samples_leaf': 3, 'n_estimators': 500, 'max_features': 50, 'criterion': 'entropy', 'min_samples_split': 4, 'max_depth': 49}

Model with rank: 2
Mean validation score: -0.461 (std: 0.001)
Parameters: {'min_samples_leaf': 3, 'n_estimators': 500, 'max_features': 65, 'criterion': 'entropy', 'min_samples_split': 2, 'max_depth': 48}

Model with rank: 3
Mean validation score: -0.461 (std: 0.001)
Parameters: {'min_samples_leaf': 3, 'n_estimators': 500, 'max_features': 70, 'criterion': 'entropy', 'min_samples_split': 3, 'max_depth': 44}

None


In [9]:
t0 = time.time()
extc = ExtraTreesClassifier(**random_search.best_params_)
extc.fit(X_train,target) 
preds = extc.predict_proba(X_test)[:,1]
t1 = time.time()
total_time = t1 - t0
print total_time

import csv
predictions_file = open("extc.csv", "w")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["ID", "PredictedProb"])
open_file_object.writerows(zip(IDs, preds))
predictions_file.close()

732.851265907


# Attempting benchmark

In [7]:
#this one here:
#https://www.kaggle.com/kishoreb4/bnp-paribas-cardif-claims-management/extratrees/notebook

import pandas as pd

import numpy as np
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble


print('Load data...')
train = pd.read_csv("train.csv")
target = train['target'].values
train = train.drop(['ID','target','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)
test = pd.read_csv("test.csv")
id_test = test['ID'].values
test = test.drop(['ID','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)

print('Clearing...')
for (train_name, train_series), (test_name, test_series) in zip(train.iteritems(),test.iteritems()):
    if train_series.dtype == 'O':
        #for objects: factorize
        train[train_name], tmp_indexer = pd.factorize(train[train_name])
        test[test_name] = tmp_indexer.get_indexer(test[test_name])
        #but now we have -1 values (NaN)
    else:
        #for int or float: fill NaN
        tmp_len = len(train[train_series.isnull()])
        if tmp_len>0:
            #print "mean", train_series.mean()
            train.loc[train_series.isnull(), train_name] = -999 
        #and Test
        tmp_len = len(test[test_series.isnull()])
        if tmp_len>0:
            test.loc[test_series.isnull(), test_name] = -999

X_train = train
X_test = test
print('Training...')
#I'VE increased # of estimators from 850
extc = ExtraTreesClassifier(n_estimators=850,max_features= 60,criterion= 'entropy',min_samples_split= 4,
                            max_depth= 40, min_samples_leaf= 2, n_jobs = -1)      

extc.fit(X_train,target) 

print('Predict...')
y_pred = extc.predict_proba(X_test)
#print y_pred

pd.DataFrame({"ID": id_test, "PredictedProb": y_pred[:,1]}).to_csv('extra_trees_bm.csv',index=False)

Load data...
Clearing...
Training...
Predict...


# YOUR CLASSIFIER ABOVE WITH PREPROCESSING DROPS FROM BENCHMARK

In [6]:
#this one here:
#https://www.kaggle.com/kishoreb4/bnp-paribas-cardif-claims-management/extratrees/notebook

import pandas as pd

import numpy as np
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble


print('Load data...')
train = pd.read_csv("train.csv")
target = train['target'].values
train = train.drop(['ID','target','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)
test = pd.read_csv("test.csv")
id_test = test['ID'].values
test = test.drop(['ID','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)

print('Clearing...')
for (train_name, train_series), (test_name, test_series) in zip(train.iteritems(),test.iteritems()):
    if train_series.dtype == 'O':
        #for objects: factorize
        train[train_name], tmp_indexer = pd.factorize(train[train_name])
        test[test_name] = tmp_indexer.get_indexer(test[test_name])
        #but now we have -1 values (NaN)
    else:
        #for int or float: fill NaN
        tmp_len = len(train[train_series.isnull()])
        if tmp_len>0:
            train.loc[train_series.isnull(), train_name] = train_series.mean()
        #and Test
        tmp_len = len(test[test_series.isnull()])
        if tmp_len>0:
            test.loc[test_series.isnull(), test_name] = train_series.mean()

X_train = train
X_test = test
print('Training...')

params = {'min_samples_leaf': 2, 
          'n_estimators': 800, 
          'max_features': 50, 'criterion': 'entropy', 
          'min_samples_split': 2, 'max_depth': 29, 'n_jobs' : -1}

extc = ExtraTreesClassifier(**params )      

extc.fit(X_train,target) 

print('Predict...')
y_pred = extc.predict_proba(X_test)
#print y_pred

pd.DataFrame({"ID": id_test, "PredictedProb": y_pred[:,1]}).to_csv('extra_trees_bm_imp.csv',index=False)

Load data...
Clearing...
Training...
Predict...


# ANOTHER BENCHMARK!

In [9]:
#https://www.kaggle.com/chabir/bnp-paribas-cardif-claims-management/extratreesclassifier-score-0-45-v5/code

import pandas as pd
import numpy as np
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble
from sklearn.metrics import log_loss
from sklearn.naive_bayes import BernoulliNB


def Binarize(columnName, df, features=None):
    df[columnName] = df[columnName].astype(str)
    if(features is None):
        features = np.unique(df[columnName].values)
    print(features)
    for x in features:
        df[columnName+'_' + x] = df[columnName].map(lambda y:
                                                    1 if y == x else 0)
    df.drop(columnName, inplace=True, axis=1)
    return df, features


def MungeData(train, test):

    features = train.columns[2:]
    print(type(features))
    for col in features:
        if((train[col].dtype == 'object') and (col!="v22")):
            print(col)
            train, binfeatures = Binarize(col, train)
            test, _ = Binarize(col, test, binfeatures)
            nb = BernoulliNB()
            nb.fit(train[col+'_'+binfeatures].values, train.target.values)
            train[col] = \
                nb.predict_proba(train[col+'_'+binfeatures].values)[:, 1]
            test[col] = \
                nb.predict_proba(test[col+'_'+binfeatures].values)[:, 1]
            train.drop(col+'_'+binfeatures, inplace=True, axis=1)
            test.drop(col+'_'+binfeatures, inplace=True, axis=1)
            train[col] = train[col].astype(float)
            test[col] = test[col].astype(float)
    return train, test


print('Load data...')
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")


train, test = MungeData(train, test)





target = train['target'].values
train = train.drop(['ID','target','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)

id_test = test['ID'].values
test = test.drop(['ID','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)





print('Clearing...')
for (train_name, train_series), (test_name, test_series) in zip(train.iteritems(),test.iteritems()):
    if train_series.dtype == 'O':
        #for objects: factorize
        train[train_name], tmp_indexer = pd.factorize(train[train_name])
        test[test_name] = tmp_indexer.get_indexer(test[test_name])
        #but now we have -1 values (NaN)
    else:
        #for int or float: fill NaN
        tmp_len = len(train[train_series.isnull()])
        if tmp_len>0:
            #print "mean", train_series.mean()
            train.loc[train_series.isnull(), train_name] = -999 
        #and Test
        tmp_len = len(test[test_series.isnull()])
        if tmp_len>0:
            test.loc[test_series.isnull(), test_name] = -999

X_train = train
X_test = test
print('Training...')
extc = ExtraTreesClassifier(n_estimators=750,max_features= 60,criterion= 'entropy',min_samples_split= 4,
                            max_depth= 40, min_samples_leaf= 2, n_jobs = -1)      

extc.fit(X_train,target) 

print('Predict...')
y_pred = extc.predict_proba(X_test)
#print y_pred

pd.DataFrame({"ID": id_test, "PredictedProb": y_pred[:,1]}).to_csv('extra_trees_bm_again.csv',index=False)

Load data...
<class 'pandas.core.index.Index'>
v3
['A' 'B' 'C' 'nan']
['A' 'B' 'C' 'nan']
v24
['A' 'B' 'C' 'D' 'E']
['A' 'B' 'C' 'D' 'E']
v30
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'nan']
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'nan']
v31
['A' 'B' 'C' 'nan']
['A' 'B' 'C' 'nan']
v47
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J']
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J']
v52
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'nan']
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'nan']
v56
['A' 'AA' 'AB' 'AC' 'AE' 'AF' 'AG' 'AH' 'AI' 'AJ' 'AK' 'AL' 'AM' 'AN' 'AO'
 'AP' 'AR' 'AS' 'AT' 'AU' 'AV' 'AW' 'AX' 'AY' 'AZ' 'B' 'BA' 'BC' 'BD' 'BE'
 'BF' 'BG' 'BH' 'BI' 'BJ' 'BK' 'BL' 'BM' 'BN' 'BO' 'BP' 'BQ' 'BR' 'BS' 'BT'
 'BU' 'BV' 'BW' 'BX' 'BY' 'BZ' 'C' 'CA' 'CB' 'CC' 'CD' 'CE' 'CF' 'CG' 'CH'
 'CI' 'CJ' 'CK' 'CL' 'CM' 'CN' 'CO' 'CP' 'CQ' 'CS' 'CT' 'CV' 'CW' 'CX' 'CY'
 'CZ' 'D' 'DA' 'DB' 'DC' 'DD' 'DE' 'DF' 'DG' 'DH' 'DI' 'DJ' 'DK' 'DL' 'DM'
 'DN' 'DO' 'DP' 'DQ' 'DR' 'DS' 'DT' 'DU' 'DV' 'DW' 'DX' 'DY' 'DZ' 'E' 'F'
 'G' 'H'