In [3]:
import pandas as pd
import numpy as np

def FeatureEngineering(trainpath,testpath):
    #### Load Data
    train = pd.read_csv(trainpath,nrows=10000)
    test = pd.read_csv(testpath,nrows=10000)

    ### 
    y = train['target'].values
    testid= test['id'].values

    train.drop(['id','target'],axis=1,inplace=True)
    test.drop(['id'],axis=1,inplace=True)

    ### Drop calc
    unwanted = train.columns[train.columns.str.startswith('ps_calc_')]
    train = train.drop(unwanted, axis=1)  
    test = test.drop(unwanted, axis=1)

    ### Great Recovery from Pascal's materpiece

    def recon(reg):
        integer = int(np.round((40*reg)**2)) 
        for a in range(32):
            if (integer - a) % 31 == 0:
                A = a
        M = (integer - A)//31
        return A, M
    train['ps_reg_A'] = train['ps_reg_03'].apply(lambda x: recon(x)[0])
    train['ps_reg_M'] = train['ps_reg_03'].apply(lambda x: recon(x)[1])
    train['ps_reg_A'].replace(19,-1, inplace=True)
    train['ps_reg_M'].replace(51,-1, inplace=True)
    test['ps_reg_A'] = test['ps_reg_03'].apply(lambda x: recon(x)[0])
    test['ps_reg_M'] = test['ps_reg_03'].apply(lambda x: recon(x)[1])
    test['ps_reg_A'].replace(19,-1, inplace=True)
    test['ps_reg_M'].replace(51,-1, inplace=True)
    
    trainX = train
    testX = test
    trainy = y
    
    return trainX, trainy, testX

In [12]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
 
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

def gini_coefficient(preds,dtrain):
    y = dtrain.get_label()
    return 'gini', -gini_normalized(y,preds)

In [13]:
trainpath = "/Users/guoli/Desktop/kaggle/Porto/train.csv"
testpath = "/Users/guoli/Desktop/kaggle/Porto/test.csv"

trainX, trainy, testX = FeatureEngineering(trainpath,testpath)

In [21]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

params = {'eta': 0.025, 
          'max_depth': 4, 
          'subsample': 0.9, 
          'colsample_bytree': 0.7, 
          'colsample_bylevel':0.7,
          'min_child_weight':100,
          'alpha':4,
          'objective': 'binary:logistic', 
          'eval_metric': 'auc', 
          'seed': 99, 
          'silent': True}

xgb = XGBClassifier()
xgb.set_params(**params)
x1, x2, y1, y2 = train_test_split(trainX, trainy, test_size=0.25, random_state=99)

xgb.fit(x1, y1, 
        eval_set=[(x1,y1),(x2,y2)], 
        eval_metric=gini_coefficient,
        early_stopping_rounds=100,
        verbose=10)

# x1, x2, y1, y2 = train_test_split(train, y, test_size=0.25, random_state=99)
# watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')]

# model = xgb.train(params, xgb.DMatrix(x1, y1), 5000,  watchlist, feval=gini_xgb, maximize=True, 
#                   verbose_eval=100, early_stopping_rounds=70)


ValueError: Invalid parameter eval_metric for estimator XGBClassifier. Check the list of available parameters with `estimator.get_params().keys()`.

In [19]:
class Clf4Stack(object):
    def __init__(self, model, n_splits=5):
        self.n_splits = n_splits
        self.model = model

    def fit_predict(self, trainX, trainy, testX):

        self.train4stack = np.zeros(len(trainX))
        self.test4stack = np.zeros(len(testX))

        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=44)

        for train_index, test_index in skf.split(trainX, trainy):
            X_train, X_test = trainX[train_index], trainX[test_index]
            y_train, y_test = trainy[train_index], trainy[test_index]

            self.model.fit(X_train, y_train)
            y_pred = self.model.predict_proba(X_test)[:,1]
            self.train4stack[test_index] = y_pred
            self.test4stack += self.model.predict_proba(testX)[:,1]
        
        self.test4stack /= self.n_splits
            
    def output(self,train_file_name='train4stack.csv',
                    test_file_name='test4stack.csv',
                    col_name='F4stack'):

        pd.DataFrame({col_name:self.train4stack}).to_csv(train_file_name,index=False) 
        pd.DataFrame({col_name:self.test4stack}).to_csv(test_file_name,index=False)