In [161]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

class Clf4Stack(object):
    def __init__(self, model, n_splits=5):
        self.n_splits = n_splits
        self.model = model

    def fit_predict(self, trainX, trainy, testX):

        self.train4stack = np.zeros(len(trainX))
        self.test4stack = np.zeros(len(testX))

        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=44)

        for train_index, test_index in skf.split(trainX, trainy):
            X_train, X_test = trainX[train_index], trainX[test_index]
            y_train, y_test = trainy[train_index], trainy[test_index]

            self.model.fit(X_train, y_train)
            y_pred = self.model.predict_proba(X_test)[:,1]
            self.train4stack[test_index] = y_pred
            self.test4stack += self.model.predict_proba(testX)[:,1]
        
        self.test4stack /= self.n_splits
            
    def output(self,train_file_name='train4stack.csv',
                    test_file_name='test4stack.csv',
                    col_name='F4stack'):

        pd.DataFrame({col_name:self.train4stack}).to_csv(train_file_name,index=False) 
        pd.DataFrame({col_name:self.test4stack}).to_csv(test_file_name,index=False)

In [162]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(50,n_jobs=-1)

In [163]:
! ls /Users/guoli/Desktop/kaggle/Porto/train.csv

/Users/guoli/Desktop/kaggle/Porto/train.csv


In [175]:
df = pd.read_csv("/Users/guoli/Desktop/kaggle/Porto/train.csv")

In [176]:
df.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [177]:
features = list(df.columns)
target = 'target'
features.remove(target)

In [178]:
from sklearn.model_selection import train_test_split

X = np.array(df[features])
y = np.array(df[target])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [179]:
model = Clf4Stack(rf)

In [180]:
model.fit_predict(X_train,y_train,X_test)

In [181]:
model.output()

In [182]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
 
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

In [183]:
gini_normalized(model.train4stack,y_train)

0.020446514001250183

In [184]:
gini_normalized(model.test4stack,y_test)

0.027601753702345012

In [185]:
!ls

Clf4Stack.py    Untitled.ipynb  test4stack.csv  train4stack.csv
