In [None]:
# Metric from Kaggle
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Compute the (un-normalized) Gini score of `pred` against `actual`.

    Rows are ranked by prediction in descending order; ties are broken by
    original position so the result is deterministic.

    Parameters
    ----------
    actual : sequence of numbers
        Ground-truth values, same length as `pred`.
    pred : sequence of numbers
        Predicted scores used only for ranking.
    cmpcol, sortcol : int
        Unused; retained so existing callers keep working.

    Returns
    -------
    float
        Gini score of the ranking induced by `pred`.

    Raises
    ------
    AssertionError
        If `actual` and `pred` differ in length.
    """
    assert len(actual) == len(pred)
    n = len(actual)
    # Columns: 0 = actual, 1 = prediction, 2 = original row position.
    # The builtin `float` replaces the `np.float` alias, which no longer
    # exists in current NumPy releases.
    table = np.asarray(np.c_[actual, pred, np.arange(n)], dtype=float)
    # lexsort treats the LAST key as primary: prediction descending,
    # then original position ascending as the tie-breaker.
    order = np.lexsort((table[:, 2], -1 * table[:, 1]))
    ranked_actual = table[order, 0]
    total_losses = ranked_actual.sum()
    gini_sum = ranked_actual.cumsum().sum() / total_losses

    gini_sum -= (n + 1) / 2.
    return gini_sum / n
 
def gini_normalized(a, p):
    """Return the Gini score of predictions `p` scaled by the score of a perfect ranking.

    The numerator is `gini(a, p)`; the denominator is `gini(a, a)`, i.e. the
    score obtained when the actual values themselves are used as the ranking.
    """
    model_score = gini(a, p)
    perfect_score = gini(a, a)
    return model_score / perfect_score

def gini_coefficient(preds,dtrain):
    """Evaluation callback returning the negated normalized Gini as ('gini', value).

    `dtrain` must expose `get_label()` (presumably an xgboost DMatrix -
    confirm against the caller). The score is negated, so smaller values
    correspond to a better ranking.
    """
    labels = dtrain.get_label()
    negated_score = -gini_normalized(labels, preds)
    return 'gini', negated_score

In [None]:
# Faster Gini calculation https://www.kaggle.com/tezdhar/faster-gini-calculation
from sklearn import metrics
#Remove redundant calls
def ginic(actual, pred):
    """Single-sort Gini computation.

    `actual` is reordered by ascending `pred`, then the cumulative-sum
    formula is applied to the reordered values.

    Parameters
    ----------
    actual : array-like
        Ground-truth values (ndarray, Series or list).
    pred : array-like
        Scores that define the ordering.

    Returns
    -------
    float
        Un-normalized Gini value for the ordering induced by `pred`.
    """
    labels = np.asarray(actual)  # accept a Series or a list as well as an ndarray
    count = len(labels)
    ascending_order = np.argsort(pred)
    ranked = labels[ascending_order]
    running_total = ranked.cumsum()
    gini_sum = running_total.sum() / ranked.sum() - (count + 1) / 2.0
    return gini_sum / count

In [None]:
# Grid search from Liguo (for reference only)
# from sklearn.model_selection import KFold

# class CustomGridCV(object):
#     def __init__(self, X, y, model, metric, griddata, cv=3):
#         self.X = X
#         self.y = y
#         self.model = model
#         self.metric = metric
#         self.params = self.gridpoints(griddata)
#         self.cv = cv
#         self.bestScore = None
#         self.bestParams = None
        
#     def gridpoints(self, data):
#         newparams = [{}]
#         for k in data.keys():
#             params = newparams
#             newparams = []
#             for v in data[k]:
#                 for param in params:
#                     item = param.copy()
#                     item[k]=v
#                     newparams.append(item)           
#         return newparams
    
#     def GridSearch(self):
#         for param in self.params:
#             self.model.set_params(**param)
#             score = self.KFoldScore()
#             if self.bestScore==None or self.bestScore<score:
#                 self.bestScore = score
#                 self.bestParams = param
#             print("Score: {0:.5f}, Params: {1}".format(score,param))
    
#     def KFoldScore(self):
#         kf = KFold(n_splits=5, shuffle=True, random_state=2)
#         y_pred = np.zeros(len(self.y))

#         for train_index, test_index in kf.split(self.X):
#             train_X, test_X = self.X[train_index], self.X[test_index]
#             train_y, test_y = self.y[train_index], self.y[test_index]
#             self.model.fit(train_X,train_y)
#             y_pred[test_index] = self.model.predict_proba(test_X)[:,1]

#         return self.metric(self.y,y_pred)
    
#     def Best(self):
#         return self.bestScore, self.bestParams



# Using grid search to tune hyperparameters:
# from sklearn.ensemble import RandomForestClassifier

# # Read Data
# trainpath = "E:/datascience/kaggle/porto_seguro/train.csv"
# X, y = PrepareData(trainpath,nrows=10000)

# # Select a Model
# model = RandomForestClassifier(n_jobs=-1, random_state=1111)

# # Set the ranges for parameters
# griddata = {"n_estimators":[30,50],
#             "max_features": range(...),
#             "min_samples_leaf": range(...),
#             "max_depth": [..]}

# # Grid Search for the best parameters
# GCV = CustomGridCV(X, y, model, gini_normalized, griddata)

# GCV.GridSearch()

# print "Best Params:"
# print GCV.Best()

In [None]:
# Prepare data
import pandas as pd
import numpy as np

def PrepareData(trainpath,nrows=None):
    """Load the training CSV and split it into a feature matrix and a target vector.

    Parameters
    ----------
    trainpath : str or file-like
        Location of the CSV; it must contain an 'id' column (used as the
        index) and a 'target' column.
    nrows : int or None
        Number of rows to read; None reads the whole file.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        X holds every remaining column except 'target', in file order;
        y holds the 'target' column.
    """
    frame = pd.read_csv(trainpath, index_col='id', nrows=nrows)

    # feature engineering (from kaggle kernel): discard every 'ps_calc*' column
    is_calc_column = frame.columns.str.startswith('ps_calc')
    frame = frame.drop(frame.columns[is_calc_column], axis=1)

    # sampling = np.random.choice(595212, replace=False, size=10000)
    # frame = frame.iloc[sampling]

    target = 'target'
    features = frame.columns.tolist()
    features.remove(target)

    y = np.array(frame[target])
    X = np.array(frame[features])
    return X, y

# Read Data
# NOTE(review): absolute, machine-specific Windows path - the notebook only
# runs as-is on a machine that has the training CSV at this location.
trainpath = "E:/datascience/kaggle/porto_seguro/train.csv"
# nrows is left at its default (None), so the whole file is loaded.
# X and y become module-level globals that later cells read directly.
X, y = PrepareData(trainpath)

In [None]:
from __future__ import print_function
from __future__ import division
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import ExtraTreesClassifier as ETC
from sklearn.svm import SVC
from bayes_opt import BayesianOptimization

In [None]:
def gini_normalizedc(a, p):
    """Normalized Gini based on `ginic`, usable as a scikit-learn score function.

    Parameters
    ----------
    a : array-like
        Ground-truth values.
    p : array-like
        Predicted scores, either 1-D or a 2-D probability array; for a 2-D
        array only column 1 is used.

    Returns
    -------
    float
        `ginic(a, p) / ginic(a, a)`.
    """
    # ginic accepts lists and Series, so accept them here too: a plain list
    # has no `.ndim` attribute and would otherwise raise AttributeError.
    p = np.asarray(p)
    if p.ndim == 2:  # Required for sklearn wrapper
        p = p[:, 1]  # If proba array contains proba for both 0 and 1 classes, just pick class 1
    return ginic(a, p) / ginic(a, a)
# Scorer for scikit-learn model selection: a larger normalized Gini is better
# and the metric is computed on predicted probabilities. The flags are passed
# by keyword because newer scikit-learn releases reject them positionally.
# NOTE(review): recent scikit-learn replaces `needs_proba` with
# `response_method="predict_proba"`; switch if the installed version requires it.
gini_sklearnf = metrics.make_scorer(gini_normalizedc, greater_is_better=True, needs_proba=True)

# build extra tree classifier cross validation model
def etccv(n_estimators, max_features, max_depth, min_samples_leaf):
    """Objective function: mean 3-fold cross-validated Gini of an ExtraTrees model.

    Every hyper-parameter is truncated to an integer with `int()` before
    being handed to the classifier, so float values are accepted. The model
    is evaluated on the module-level `X` and `y` with the `gini_sklearnf`
    scorer.

    Returns
    -------
    float
        Mean of the three fold scores.
    """
    classifier = ETC(
        n_estimators=int(n_estimators),
        max_features=int(max_features),
        max_depth=int(max_depth),
        min_samples_leaf=int(min_samples_leaf),
    )
    fold_scores = cross_val_score(classifier, X, y, scoring=gini_sklearnf, cv=3)
    return fold_scores.mean()

In [None]:
# Tune the ExtraTrees hyper-parameters with Bayesian optimization:
# BayesianOptimization searches the bounds below for the values that
# maximize etccv (the mean cross-validated Gini). The intent is to need far
# fewer model fits than an exhaustive grid search.
if __name__ == "__main__":
    # Extra keyword arguments forwarded to maximize(); presumably settings for
    # the optimizer's internal regressor - confirm against the installed
    # bayes_opt version.
    gp_params = {"alpha": 1e-5}

    # Search space as (lower, upper) bounds per parameter. n_estimators and
    # min_samples_leaf have identical bounds, so they are effectively fixed at
    # 300 and 100; only max_features and max_depth are actually explored.
    etcBO = BayesianOptimization(
        etccv,
        {'n_estimators': (300, 300),
         'max_features': (5,10),
         'max_depth': (1, 10),
         'min_samples_leaf': (100, 100)
        }
    )

    etcBO.maximize(n_iter=10, **gp_params)
    print('-' * 53)
    print('Final Results')
    #print('SVC: %f' % svcBO.res['max']['max_val'])
    # NOTE(review): the res['max']['max_val'] lookup depends on the bayes_opt
    # version in use - verify it matches the installed release.
    print('ETC: %f' % etcBO.res['max']['max_val'])

In [None]:
# Running model and Predicting on testing dataset
# The hyper-parameters are hard-coded here rather than read from the
# optimizer result; n_estimators is 500, whereas the search above used 300.
ETC_final=ETC(n_estimators=500, max_features=10, max_depth=10, min_samples_leaf=100, random_state=2, verbose=True, n_jobs=-1)
# Fit on the complete training set (module-level X, y).
ETC_final.fit(X,y)

# The test CSV is read WITHOUT index_col, so 'id' stays an ordinary column and
# has to be removed from the feature list by hand below.
# NOTE(review): absolute, machine-specific Windows paths.
df_test=pd.read_csv("E:/datascience/kaggle/porto_seguro/test.csv")
# Drop the same 'ps_calc*' columns that PrepareData removes from the training data.
ps_cal = df_test.columns[df_test.columns.str.startswith('ps_calc')]
df_test = df_test.drop(ps_cal,axis=1)
features=list(df_test.columns.values)
features.remove('id')
# NOTE(review): assumes the remaining test columns appear in the same order as
# the training features - confirm, since X was built as a plain array.
y_test = ETC_final.predict_proba(df_test[features])
# Build the submission frame: the test ids plus column 1 of the probability
# output (presumably the probability of the positive class).
subm = pd.DataFrame()
subm['id'] = df_test['id']
subm['target'] = y_test[:,1]
subm.to_csv('E:/datascience/kaggle/porto_seguro/mysubmission.csv', index=False)

In [None]:
# Prepare model data for stacking (from Liguo)
from sklearn.model_selection import StratifiedKFold
class Clf4Stack(object):
    """Generate out-of-fold and averaged test predictions from one model for stacking.

    Parameters
    ----------
    model : estimator
        Object exposing `fit(X, y)` and `predict_proba(X)`. It is refitted in
        place on every fold, so after `fit_predict` it holds the fit of the
        last fold.
    n_splits : int
        Number of stratified folds.
    """
    def __init__(self, model, n_splits=5):
        self.n_splits = n_splits
        self.model = model

    def fit_predict(self, trainX, trainy, testX):
        """Fit the model fold by fold and collect column 1 of `predict_proba`.

        Parameters
        ----------
        trainX, trainy : array-like
            Training features and labels (ndarray, DataFrame/Series or list).
        testX : array-like
            Test features.

        Returns
        -------
        (train4stack, test4stack) : tuple of numpy.ndarray
            `train4stack[i]` is the prediction for training row i made by the
            model that did not see row i; `test4stack` is the mean of the
            per-fold predictions on `testX`. Both are also stored on the
            instance under the same names.
        """
        # The fold indices are positional. Indexing a DataFrame or Series with
        # them directly would select columns / labels instead of rows, so
        # coerce everything to plain arrays first. This also keeps the fit and
        # predict inputs of the same type.
        trainX = np.asarray(trainX)
        trainy = np.asarray(trainy)
        testX = np.asarray(testX)

        self.train4stack = np.zeros(len(trainX))
        self.test4stack = np.zeros(len(testX))

        # Fixed seed so the fold assignment is reproducible between runs.
        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=44)

        for train_index, test_index in skf.split(trainX, trainy):
            X_train, X_test = trainX[train_index], trainX[test_index]
            y_train = trainy[train_index]

            self.model.fit(X_train, y_train)
            # Out-of-fold predictions fill exactly the held-out positions.
            self.train4stack[test_index] = self.model.predict_proba(X_test)[:,1]
            # Accumulate test predictions; averaged after the loop.
            self.test4stack += self.model.predict_proba(testX)[:,1]

        self.test4stack /= self.n_splits
        return self.train4stack, self.test4stack

    def output(self,train_file_name='train4stack.csv',
                    test_file_name='test4stack.csv',
                    col_name='F4stack'):
        """Write the stacking features to two single-column CSV files.

        Parameters
        ----------
        train_file_name, test_file_name : str
            Destination paths for the out-of-fold and test predictions.
        col_name : str
            Header of the single column in both files.

        Requires `fit_predict` to have been called first, since it reads
        `train4stack` and `test4stack`.
        """
        pd.DataFrame({col_name:self.train4stack}).to_csv(train_file_name,index=False)
        pd.DataFrame({col_name:self.test4stack}).to_csv(test_file_name,index=False)


# Build stacking features from the final ExtraTrees model: out-of-fold
# predictions for the training rows and fold-averaged predictions for the
# test rows, each written to its own CSV.
C4S=Clf4Stack(ETC_final)
C4S.fit_predict(X, y, df_test[features])
# col_name is the CSV column header, not a file location; it was previously
# set to a filesystem path, apparently copied from the file-name arguments.
C4S.output(train_file_name="E:/datascience/kaggle/porto_seguro/train4stack.csv", 
           test_file_name="E:/datascience/kaggle/porto_seguro/test4stack.csv", 
           col_name="F4stack")