In [157]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from xgboost import XGBClassifier
from xgboost import XGBRegressor
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier

# Regularized Greedy Forest
from rgf.sklearn import RGFClassifier     # https://github.com/fukatani/rgf_python


train = pd.read_csv('./input_original_data/porto_train.csv')
test = pd.read_csv('./input_original_data/porto_test.csv')

#train=train[:3000]
#test=test[:3000]

# Preprocessing 
id_test = test['id'].values
target_train = train['target'].values

train = train.drop(['target','id'], axis = 1)
test = test.drop(['id'], axis = 1)


col_to_drop = train.columns[train.columns.str.startswith('ps_calc_')]
train = train.drop(col_to_drop, axis=1)  
test = test.drop(col_to_drop, axis=1)  


train = train.replace(-1, np.nan)
test = test.replace(-1, np.nan)


cat_features = [a for a in train.columns if a.endswith('cat')]

for column in cat_features:
    temp = pd.get_dummies(pd.Series(train[column]))
    train = pd.concat([train,temp],axis=1)
    train = train.drop([column],axis=1)
    
for column in cat_features:
    temp = pd.get_dummies(pd.Series(test[column]))
    test = pd.concat([test,temp],axis=1)
    test = test.drop([column],axis=1)


print(train.values.shape, test.values.shape)


def eval_gini(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini


class Ensemble(object):
    def __init__(self, n_splits, base_models):
        self.n_splits = n_splits
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=2016).split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):

            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
#                y_holdout = y[test_idx]

                print ("Fit %s fold %d" % (str(clf).split('(')[0], j+1))
                clf.fit(X_train, y_train)
#                cross_score = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')
#                print("    cross_score: %.5f" % (cross_score.mean()))
                y_pred = clf.predict_proba(X_holdout)[:,1]                

                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict_proba(T)[:,1]

            S_test[:, i] = S_test_i.mean(axis=1)
            print( "\nGini for full training set:" )
            print(eval_gini(y,S_train[:,i]))
            
            
        S_train = pd.DataFrame(S_train)
        S_test = pd.DataFrame(S_test)
            
        S_train.columns = self.base_models
        S_test.columns = self.base_models

        return S_train, S_test

(595212, 198) (892816, 198)


In [158]:

        
# LightGBM params
lgb_params = {}
lgb_params['learning_rate'] = 0.02
lgb_params['n_estimators'] = 650
lgb_params['max_bin'] = 10
lgb_params['subsample'] = 0.8
lgb_params['subsample_freq'] = 10
lgb_params['colsample_bytree'] = 0.8   
lgb_params['min_child_samples'] = 500
lgb_params['seed'] = 99


lgb_params2 = {}
lgb_params2['n_estimators'] = 1090
lgb_params2['learning_rate'] = 0.02
lgb_params2['colsample_bytree'] = 0.3   
lgb_params2['subsample'] = 0.7
lgb_params2['subsample_freq'] = 2
lgb_params2['num_leaves'] = 16
lgb_params2['seed'] = 99


lgb_params3 = {}
lgb_params3['n_estimators'] = 1100
lgb_params3['max_depth'] = 4
lgb_params3['learning_rate'] = 0.02
lgb_params3['seed'] = 99


# RandomForest params
#rf_params = {}
#rf_params['n_estimators'] = 200
#rf_params['max_depth'] = 6
#rf_params['min_samples_split'] = 70
#rf_params['min_samples_leaf'] = 30


# ExtraTrees params
#et_params = {}
#et_params['n_estimators'] = 155
#et_params['max_features'] = 0.3
#et_params['max_depth'] = 6
#et_params['min_samples_split'] = 40
#et_params['min_samples_leaf'] = 18


# XGBoost params
#xgb_params = {}
#xgb_params['objective'] = 'binary:logistic'
#xgb_params['learning_rate'] = 0.04
#xgb_params['n_estimators'] = 490
#xgb_params['max_depth'] = 4
#xgb_params['subsample'] = 0.9
#xgb_params['colsample_bytree'] = 0.9  
#xgb_params['min_child_weight'] = 10


# CatBoost params
#cat_params = {}
#cat_params['iterations'] = 900
#cat_params['depth'] = 8
#cat_params['rsm'] = 0.95
#cat_params['learning_rate'] = 0.03
#cat_params['l2_leaf_reg'] = 3.5  
#cat_params['border_count'] = 8
#cat_params['gradient_iterations'] = 4


# Regularized Greedy Forest params
#rgf_params = {}
#rgf_params['max_leaf'] = 2000
#rgf_params['learning_rate'] = 0.5
#rgf_params['algorithm'] = "RGF_Sib"
#rgf_params['test_interval'] = 100
#rgf_params['min_samples_leaf'] = 3 
#rgf_params['reg_depth'] = 1.0
#rgf_params['l2'] = 0.5  
#rgf_params['sl2'] = 0.005




In [159]:

lgb_model = LGBMClassifier(**lgb_params)
lgb_model2 = LGBMClassifier(**lgb_params2)
lgb_model3 = LGBMClassifier(**lgb_params3)

#rf_model = RandomForestClassifier(**rf_params)

#et_model = ExtraTreesClassifier(**et_params)
        
#xgb_model = XGBClassifier(**xgb_params)

#cat_model = CatBoostClassifier(**cat_params)

#rgf_model = RGFClassifier(**rgf_params) 

#gb_model = GradientBoostingClassifier(max_depth=5)

#ada_model = AdaBoostClassifier()

log_model1 = LogisticRegression()
log_model2 = XGBClassifier()



#Lv1ensemble     
stack1 = Ensemble(n_splits=3,
                 base_models = (lgb_model, lgb_model2))              
s1_train,s1_test = stack1.fit_predict(train, target_train, test)        



Fit LGBMClassifier fold 1




Fit LGBMClassifier fold 2




Fit LGBMClassifier fold 3

Gini for full training set:
0.282893853809




Fit LGBMClassifier fold 1




Fit LGBMClassifier fold 2




Fit LGBMClassifier fold 3

Gini for full training set:
0.282469202091


In [160]:
#Lv2ensemble
stack2 = Ensemble(n_splits=3,
                 base_models = (log_model1,log_model2))              
s2_train,s2_test = stack2.fit_predict(s1_train, target_train, s1_test)    

Fit LogisticRegression fold 1
Fit LogisticRegression fold 2
Fit LogisticRegression fold 3

Gini for full training set:
0.284484436342
Fit XGBClassifier fold 1
Fit XGBClassifier fold 2
Fit XGBClassifier fold 3

Gini for full training set:
0.28312770299


In [161]:
def weight_ensemble(test,weight):
    weighted_test = []
    weight_list=np.zeros([test.shape[0],test.shape[1]])   
    for i,w in enumerate(weight):
        weight_list[:,i]= w/sum(weight)

    weighted_test = (test * weight_list).sum(axis=1)
    return weighted_test

weight_ensemble(s2_test,[0.2,0.8])


0         0.028959
1         0.027455
2         0.025260
3         0.015649
4         0.035432
5         0.047277
6         0.018505
7         0.036855
8         0.050238
9         0.053874
10        0.029480
11        0.023089
12        0.039857
13        0.046668
14        0.046542
15        0.024077
16        0.025719
17        0.050464
18        0.014914
19        0.055201
20        0.035974
21        0.049541
22        0.055733
23        0.015890
24        0.025344
25        0.026566
26        0.083873
27        0.043104
28        0.027693
29        0.016815
            ...   
892786    0.015736
892787    0.034619
892788    0.033705
892789    0.032678
892790    0.033499
892791    0.025594
892792    0.027698
892793    0.038002
892794    0.023505
892795    0.032331
892796    0.059384
892797    0.067647
892798    0.040592
892799    0.081336
892800    0.025653
892801    0.029072
892802    0.028188
892803    0.032094
892804    0.047403
892805    0.035946
892806    0.024496
892807    0.

In [147]:
x=np.array([[1,2],[1,2],[1,2]])
y=np.zeros([x.shape[0],x.shape[1]])
y[:,0]= 0.4
y[:,1]= 0.6

x*y

array([[ 0.4,  1.2],
       [ 0.4,  1.2],
       [ 0.4,  1.2]])

In [5]:
sub = pd.DataFrame()
sub['id'] = id_test
sub['target'] = y_pred_mean
sub.to_csv('stacked_LV2.csv', index=False)

NameError: name 'y_pred_mean' is not defined

In [None]:

S2_test = np.zeros((T.shape[0], len(self.stackers)))
        
        for k,stacker in enumerate(self.stackers):

            results = cross_val_score(stacker, S_train, y, cv=3, scoring='roc_auc')
            print("Stacker score: %.5f" % (results.mean()))
        
            stacker.fit(S_train, y)
            S2_test[:, k] = stacker.predict_proba(S_test)[:,1]
            
        return S2_test