In which I use extremely randomized trees. See here:
http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html

Also see: 
[GEW2006] P. Geurts, D. Ernst, and L. Wehenkel, “Extremely randomized trees”, Machine Learning, 63(1), 3-42, 2006.

In [None]:
#Import libraries:
import numpy as np
import pandas as pd
import xgboost as xgb
import time
#load data:
# train = pd.read_csv("train.csv")
# target = train['target']
# #drop targets & (unique row) IDs from training data
# train = train.drop(['ID','target'],axis=1)
# test = pd.read_csv("test.csv")
# IDs = test['ID'].values
# test = test.drop(['ID'],axis=1)

# PREPROCESSING

In [None]:
#https://www.kaggle.com/director/bnp-paribas-cardif-claims-management/simple-xgboost-0-46146/code
# Read both tables; keep the label and the test row IDs aside and remove
# them (plus the training IDs) from the feature frames.
train = pd.read_csv("train.csv")
target = train['target']
train = train.drop(['ID','target'],axis=1)
test = pd.read_csv("test.csv")
ids = test['ID'].values
test = test.drop(['ID'],axis=1)

# Make every column numeric. Train and test columns are paired by position.
for (train_col, train_values), (test_col, test_values) in zip(train.iteritems(),test.iteritems()):
    if train_values.dtype == 'O':
        # Object column: replace values by integer codes learned on train,
        # then look the test values up in the same categories.
        # Missing (and, for test, unseen) values end up as -1.
        codes, categories = pd.factorize(train[train_col])
        train[train_col] = codes
        test[test_col] = categories.get_indexer(test[test_col])
        continue

    # Numeric column: overwrite missing entries with the training mean.
    train_missing = train_values.isnull()
    if train_missing.any():
        train.loc[train_missing, train_col] = train_values.mean()
    # The test frame is filled with the *training* mean as well.
    test_missing = test_values.isnull()
    if test_missing.any():
        test.loc[test_missing, test_col] = train_values.mean()  #TODO

In [None]:
# https://www.kaggle.com/c/bnp-paribas-cardif-claims-management/forums/t/19133/feature-engineering-for-beginners
#check this out: https://www.kaggle.com/c/bnp-paribas-cardif-claims-management/forums/t/19240/analysis-of-duplicate-variables-correlated-variables-large-post

A function to report best hyperparameters:
http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html

In [None]:
from operator import itemgetter
# Utility function to report best scores
def report(grid_scores, n_top=3):
    """Print a short summary of the best-scoring search candidates.

    Parameters
    ----------
    grid_scores : iterable
        Entries that are indexable (position 1 is used as the sort key) and
        expose ``mean_validation_score``, ``cv_validation_scores`` and
        ``parameters`` attributes.
    n_top : int, default 3
        Number of highest-ranked entries to print.

    Returns
    -------
    None
        The summary is written to standard output only.
    """
    by_mean_score = itemgetter(1)
    ranked = sorted(grid_scores, key=by_mean_score, reverse=True)
    for rank, entry in enumerate(ranked[:n_top], 1):
        fold_spread = np.std(entry.cv_validation_scores)
        print("Model with rank: {0}".format(rank))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              entry.mean_validation_score, fold_spread))
        print("Parameters: {0}".format(entry.parameters))
        print("")

# TESTING EXTRATREES CLASSIFIER APPROACH

In [None]:
#https://www.kaggle.com/yuhaichina/bnp-paribas-cardif-claims-management/extratreesclassifier-score-0-45911
#http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble

# Time one fit/predict cycle of a hand-configured ExtraTrees model.
t0 = time.time()
X_train, X_test = train, test

extc_settings = {
    'n_estimators': 700,
    'max_features': 50,
    'criterion': 'entropy',
    'min_samples_split': 5,
    'max_depth': 50,
    'min_samples_leaf': 5,
}
extc = ExtraTreesClassifier(**extc_settings)

extc.fit(X_train, target)
# Keep only the second probability column, so `preds` is one-dimensional
# (NOTE(review): presumably P(target == 1); confirm against extc.classes_).
class_probabilities = extc.predict_proba(X_test)
preds = class_probabilities[:, 1]

t1 = time.time()
total_time = t1 - t0
print(total_time)

In [None]:
import csv
# Write the first ExtraTrees submission: a header row followed by one
# (ID, probability) row per test sample.
# Fixes relative to the previous version of this cell:
#   * the test identifiers live in `ids` (the `IDs` name only exists in
#     commented-out code, so it raised NameError);
#   * `preds` is already a 1-D array (the class column was selected when it
#     was computed), so indexing it again with [:, 1] raised IndexError.
# The `with` block closes the file even if writing fails part-way.
with open("extc.csv", "w") as predictions_file:
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["ID", "PredictedProb"])
    open_file_object.writerows(zip(ids, preds))

# RandomizedSearchCV on ETC parameter space

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn import metrics

# Randomized hyper-parameter search over the ExtraTrees parameter space,
# timed end to end.
t0 = time.time()
X_train = train
X_test = test

# Candidate values sampled by the search.
# `min_samples_split` starts at 2: a node needs at least two samples to be
# split, scikit-learn versions that validate this parameter reject 1, and the
# later search in this notebook also starts at 2.
param_grid = {'max_depth': range(20,50),
              'n_estimators': np.arange(500,1000,100),
              'max_features' : np.arange(10,110,10),
              'min_samples_split' : np.arange(2,6,1),
              'min_samples_leaf' : np.arange(1,6,1),
              'criterion' : ['entropy'],
              }

# This template estimator is passed to the search; it is not the fitted model
# to predict with afterwards.
extc = ExtraTreesClassifier()

n_iter_search=50
random_search = RandomizedSearchCV(extc, param_distributions=param_grid,
                                   n_iter=n_iter_search, scoring ="log_loss")

# Fit on the X_train alias defined above (same object as `train`).
random_search.fit(X_train, target)

t1 = time.time()
total_time = t1 - t0
print(total_time)

In [None]:
# report() prints its summary itself and returns None, so it is called
# directly; wrapping it in print only appended a stray "None" line.
report(random_search.grid_scores_)

In [None]:
# Refit an ExtraTrees model with one fixed parameter set and time the
# fit/predict cycle.
params = {'min_samples_leaf': 2,
         'n_estimators': 800,
         'max_features': 50,
         'criterion': 'entropy',
         'min_samples_split': 2,
         'max_depth': 29}
t0 = time.time()
extc2 = ExtraTreesClassifier(**params)
#OR
#extc2 = ExtraTreesClassifier(**random_search.best_params_)
extc2.fit(X_train,target)
# Predict with the model that was just fitted. The previous version called
# `extc.predict_proba`, i.e. a different estimator object than the one
# trained in this cell.
preds_2 = extc2.predict_proba(X_test)[:,1]
t1 = time.time()
total_time = t1 - t0
print(total_time)

In [None]:
import csv
# Write the submission for the tuned model: a header row followed by one
# (ID, probability) row per test sample.
# `preds_2` holds the predictions computed for this model; the previous
# version wrote `preds`, the first model's predictions, into this file.
# The `with` block closes the file even if writing fails part-way.
with open("extc2_rcv.csv", "w") as predictions_file:
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["ID", "PredictedProb"])
    open_file_object.writerows(zip(ids, preds_2))

This gave -logloss = 0.45450.

In [None]:
# Feature importances of the tuned model fitted above (`extc2`).
# When the notebook is run top to bottom, `extc` at this point is the
# estimator that was only handed to RandomizedSearchCV and never fitted
# directly, so it has no importances to report.
feat_import = extc2.feature_importances_

In [None]:
# Show the shape of the importance vector first, then its values.
for shown in (np.shape(feat_import), feat_import):
    print(shown)

# DO THE SAME NOW, INCLUDING A BIT MORE PREPROCESSING

Variation on extc benchmark. See here: 
https://www.kaggle.com/mujtabaasif/bnp-paribas-cardif-claims-management/extratrees

In [None]:
#this is the benchmark:
import pandas as pd
import numpy as np
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble


print('Load data...')
# Feature columns removed from both tables in addition to the bookkeeping ones
# (NOTE(review): presumably the columns dropped by the linked kernel; the
# reason for each is not recorded here).
excluded_features = ['v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128']
train = pd.read_csv("train.csv")
target = train['target'].values
train = train.drop(['ID','target'] + excluded_features, axis=1)
test = pd.read_csv("test.csv")
id_test = test['ID'].values
test = test.drop(['ID'] + excluded_features, axis=1)

print('Clearing...')
# Value written into missing numeric cells in this variant.
missing_marker = -999
for (train_col, train_values), (test_col, test_values) in zip(train.iteritems(),test.iteritems()):
    if train_values.dtype == 'O':
        # Object column: integer codes learned on train, looked up for test.
        # Missing (and, for test, unseen) values end up as -1.
        codes, categories = pd.factorize(train[train_col])
        train[train_col] = codes
        test[test_col] = categories.get_indexer(test[test_col])
        continue

    # Numeric column: flag missing entries with the marker in both frames.
    train_missing = train_values.isnull()
    if train_missing.any():
        train.loc[train_missing, train_col] = missing_marker
    test_missing = test_values.isnull()
    if test_missing.any():
        test.loc[test_missing, test_col] = missing_marker

X_train, X_test = train, test
print('Training...')
extc_settings = {
    'n_estimators': 850,
    'max_features': 60,
    'criterion': 'entropy',
    'min_samples_split': 4,
    'max_depth': 40,
    'min_samples_leaf': 2,
    'n_jobs': -1,
}
extc = ExtraTreesClassifier(**extc_settings)

extc.fit(X_train, target)

print('Predict...')
y_pred = extc.predict_proba(X_test)

# One row per test ID with the second probability column as the prediction.
submission = pd.DataFrame({"ID": id_test, "PredictedProb": y_pred[:,1]})
submission.to_csv('extra_trees.csv',index=False)

In [None]:
import pandas as pd
import numpy as np
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble

print('Load data...')
# Feature columns removed from both tables in addition to the bookkeeping ones
# (NOTE(review): presumably the columns dropped by the linked kernel; the
# reason for each is not recorded here).
excluded_features = ['v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128']
train = pd.read_csv("train.csv")
target = train['target'].values
train = train.drop(['ID','target'] + excluded_features, axis=1)
test = pd.read_csv("test.csv")
id_test = test['ID'].values
test = test.drop(['ID'] + excluded_features, axis=1)

print('Clearing...')

for (train_col, train_values), (test_col, test_values) in zip(train.iteritems(),test.iteritems()):
    if train_values.dtype == 'O':
        # Object column: integer codes learned on train, looked up for test.
        # Missing (and, for test, unseen) values end up as -1.
        codes, categories = pd.factorize(train[train_col])
        train[train_col] = codes
        test[test_col] = categories.get_indexer(test[test_col])
        continue

    # Numeric column: overwrite missing entries with the training mean.
    train_missing = train_values.isnull()
    if train_missing.any():
        train.loc[train_missing, train_col] = train_values.mean()
    # The test frame is filled with the *training* mean as well.
    test_missing = test_values.isnull()
    if test_missing.any():
        test.loc[test_missing, test_col] = train_values.mean()

X_train, X_test = train, test
print('Training...')
extc_settings = {
    'n_estimators': 850,
    'max_features': 60,
    'criterion': 'entropy',
    'min_samples_split': 4,
    'max_depth': 40,
    'min_samples_leaf': 2,
    'n_jobs': -1,
}
extc = ExtraTreesClassifier(**extc_settings)

extc.fit(X_train, target)

print('Predict...')
y_pred = extc.predict_proba(X_test)

# One row per test ID with the second probability column as the prediction.
submission = pd.DataFrame({"ID": id_test, "PredictedProb": y_pred[:,1]})
submission.to_csv('extra_trees_impute.csv',index=False)

OK now, using the preprocessing above, let's use RandomizedSearchCV!

In [None]:
import pandas as pd
import numpy as np
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble

import time
from sklearn.grid_search import RandomizedSearchCV
from sklearn import metrics

# The timer covers loading, cleaning and the whole search.
t0 = time.time()

print('Load data...')
# Feature columns removed from both tables in addition to the bookkeeping ones
# (NOTE(review): presumably the columns dropped by the linked kernel; the
# reason for each is not recorded here).
excluded_features = ['v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128']
train = pd.read_csv("train.csv")
target = train['target'].values
train = train.drop(['ID','target'] + excluded_features, axis=1)
test = pd.read_csv("test.csv")
id_test = test['ID'].values
test = test.drop(['ID'] + excluded_features, axis=1)

print('Cleaning...')

for (train_col, train_values), (test_col, test_values) in zip(train.iteritems(),test.iteritems()):
    if train_values.dtype == 'O':
        # Object column: integer codes learned on train, looked up for test.
        # Missing (and, for test, unseen) values end up as -1.
        codes, categories = pd.factorize(train[train_col])
        train[train_col] = codes
        test[test_col] = categories.get_indexer(test[test_col])
        continue

    # Numeric column: overwrite missing entries with the training mean.
    train_missing = train_values.isnull()
    if train_missing.any():
        train.loc[train_missing, train_col] = train_values.mean()
    # The test frame is filled with the *training* mean as well.
    test_missing = test_values.isnull()
    if test_missing.any():
        test.loc[test_missing, test_col] = train_values.mean()

X_train, X_test = train, test


print('Training...')

# Candidate values sampled by the search; the tree count and the split
# criterion are held fixed.
param_grid = {
    'n_estimators': [850],
    'max_features': np.arange(50,90,5),
    'min_samples_split': np.arange(2,5,1),
    'min_samples_leaf': np.arange(1,4,1),
    'criterion': ['entropy'],
}

extc = ExtraTreesClassifier(verbose = 10 )

n_iter_search= 15
search_options = {
    'param_distributions': param_grid,
    'n_iter': n_iter_search,
    'scoring': "log_loss",
    'n_jobs': -1,
    'verbose': 10,
}
random_search = RandomizedSearchCV(extc, **search_options)

random_search.fit( X_train , target)

t1 = time.time()
total_time = t1 - t0
print(total_time)

Load data...
Cleaning...
Training...
Fitting 3 folds for each of 15 candidates, totalling 45 fits




In [None]:
#https://www.kaggle.com/scirpus/bnp-paribas-cardif-claims-management/benouilli-naive-bayes/code
#https://www.kaggle.com/chabir/bnp-paribas-cardif-claims-management/extratreesclassifier-score-0-45-v5/discussion
