# 1. Custom class for Grid cv Search

In [2]:
from sklearn.model_selection import KFold

class CustomGridCV(object):
    def __init__(self, X, y, model, metric, griddata, cv=3):
        self.X = X
        self.y = y
        self.model = model
        self.metric = metric
        self.params = self.gridpoints(griddata)
        self.cv = cv
        self.bestScore = None
        self.bestParams = None
        
    def gridpoints(self, data):
        newparams = [{}]
        for k in data.keys():
            params = newparams
            newparams = []
            for v in data[k]:
                for param in params:
                    item = param.copy()
                    item[k]=v
                    newparams.append(item)           
        return newparams
    
    def GridSearch(self):
        for param in self.params:
            self.model.set_params(**param)
            score = self.KFoldScore()
            if self.bestScore==None or self.bestScore<score:
                self.bestScore = score
                self.bestParams = param
            print("Score: {0:.5f}, Params: {1}".format(score,param))
    
    def KFoldScore(self):
        kf = KFold(n_splits=5, shuffle=True, random_state=2)
        y_pred = np.zeros(len(self.y))

        for train_index, test_index in kf.split(self.X):
            train_X, test_X = self.X[train_index], self.X[test_index]
            train_y, test_y = self.y[train_index], self.y[test_index]
            self.model.fit(train_X,train_y)
            y_pred[test_index] = self.model.predict_proba(test_X)[:,1]

        return self.metric(self.y,y_pred)
    
    def Best(self):
        return self.bestScore, self.bestParams

# 3. Data Preparation 
### Add your feature engineering work here

In [1]:
import pandas as pd
import numpy as np

def PrepareData(trainpath,nrows=None):
    df = pd.read_csv(trainpath,nrows=nrows)

    features = list(df.columns)
    target = 'target'
    features.remove(target)

    X = np.array(df[features])
    return X, X

# 4. Search for the best hyperparameters
## 4.1 The whole hyperparameter space is huge, we are not able to search the whole space
## 4.2 We don't have to use all the data for hyperparameter searching. Tune nrows to determine the size of data considered. Have a look at learning curve.

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Read Data
trainpath = "/Users/guoli/Desktop/kaggle/Porto/train.csv"
X, y = PrepareData(trainpath,nrows=10000)

# Select a Model
model = RandomForestClassifier()

# Set the ranges for parameters
griddata = {"n_estimators":[30,50],
            "min_samples_split": range(2,4),
            "min_samples_leaf": range(2,4)}

# Grid Search for the best parameters
GCV = CustomGridCV(X, y, model, gini_normalized, griddata)

GCV.GridSearch()

print "Best Params:"
print GCV.Best()

Score: 0.16299, Params: {'n_estimators': 30, 'min_samples_split': 2, 'min_samples_leaf': 2}
Score: 0.16298, Params: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 2}
Score: 0.16038, Params: {'n_estimators': 30, 'min_samples_split': 3, 'min_samples_leaf': 2}
Score: 0.21877, Params: {'n_estimators': 50, 'min_samples_split': 3, 'min_samples_leaf': 2}
Score: 0.14743, Params: {'n_estimators': 30, 'min_samples_split': 2, 'min_samples_leaf': 3}
Score: 0.19304, Params: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 3}
Score: 0.15883, Params: {'n_estimators': 30, 'min_samples_split': 3, 'min_samples_leaf': 3}
Score: 0.19851, Params: {'n_estimators': 50, 'min_samples_split': 3, 'min_samples_leaf': 3}
Best Params:
(0.21876589770782301, {'n_estimators': 50, 'min_samples_split': 3, 'min_samples_leaf': 2})
