In [1]:
## build a stack regression model for loan data

In [2]:
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import warnings 
warnings.filterwarnings('ignore')
from mypipes import *

  import pandas.util.testing as tm


In [3]:
file=r'../data/census_income.csv'

cd= pd.read_csv(file)

In [4]:
cd.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,Y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
cd['Y']=(cd['Y']==' >50K').astype(int)

In [6]:
cd['Y'].value_counts()

0    24720
1     7841
Name: Y, dtype: int64

In [7]:
cd_train,cd_test=train_test_split(cd,test_size=0.2,random_state=2)

In [8]:
cd_train.reset_index(drop=True,inplace=True)
cd_test.reset_index(drop=True,inplace=True)

In [9]:
cat_vars=list(cd_train.select_dtypes(include=['object']).columns)

In [10]:
cat_vars=[_ for _ in cat_vars if _ not in ['Y','education']]

In [11]:
num_vars=list(cd_train.select_dtypes(exclude=['object']).columns)

In [12]:
p1=pdPipeline([
    ('cat_select',VarSelector(cat_vars)),
    ('missing_trt',DataFrameImputer()),
    ('create_dummies',get_dummies_Pipe(300))
])

p2=pdPipeline([
    ('num_select',VarSelector(num_vars)),
    ('missing_trt',DataFrameImputer())
])

data_pipe=FeatureUnion([
    ('cat_pipe',p1),
    ('num_pipe',p2)
])

In [13]:
data_pipe.fit(cd_train)

FeatureUnion(transformer_list=[('cat_pipe',
                                pdPipeline(steps=[('cat_select',
                                                   VarSelector(feature_names=['workclass',
                                                                              'marital.status',
                                                                              'occupation',
                                                                              'relationship',
                                                                              'race',
                                                                              'sex',
                                                                              'native.country'])),
                                                  ('missing_trt',
                                                   DataFrameImputer()),
                                                  ('create_dummies',
                                         

In [14]:
x_train=pd.DataFrame(data=data_pipe.transform(cd_train),
                     columns=data_pipe.get_feature_names())
x_test=pd.DataFrame(data=data_pipe.transform(cd_test),
                     columns=data_pipe.get_feature_names())
y_train=cd_train['Y']
y_test=cd_test['Y']

In [33]:
x_train

Unnamed: 0,cat_pipe__workclass_ Private,cat_pipe__workclass_ Self-emp-not-inc,cat_pipe__workclass_ Local-gov,cat_pipe__workclass_ ?,cat_pipe__workclass_ State-gov,cat_pipe__workclass_ Self-emp-inc,cat_pipe__workclass_ Federal-gov,cat_pipe__marital.status_ Married-civ-spouse,cat_pipe__marital.status_ Never-married,cat_pipe__marital.status_ Divorced,...,cat_pipe__native.country_ United-States,cat_pipe__native.country_ Mexico,cat_pipe__native.country_ ?,num_pipe__age,num_pipe__fnlwgt,num_pipe__education.num,num_pipe__capital.gain,num_pipe__capital.loss,num_pipe__hours.per.week,num_pipe__Y
0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,31,247444,11,0,0,40,0
1,0,0,0,0,1,0,0,0,0,0,...,1,0,0,37,34996,9,0,0,40,0
2,1,0,0,0,0,0,0,0,1,0,...,1,0,0,18,194059,8,0,0,40,0
3,1,0,0,0,0,0,0,1,0,0,...,1,0,0,38,224566,11,0,0,50,0
4,1,0,0,0,0,0,0,0,1,0,...,1,0,0,30,363296,9,0,0,72,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26043,0,0,0,0,0,1,0,1,0,0,...,1,0,0,59,223215,13,0,1977,50,1
26044,1,0,0,0,0,0,0,1,0,0,...,1,0,0,25,169124,9,0,0,45,0
26045,0,0,0,0,0,0,1,0,1,0,...,1,0,0,46,43206,16,0,1564,50,1
26046,1,0,0,0,0,0,0,0,1,0,...,1,0,0,19,446219,6,0,0,40,0


In [35]:
clf1=KNeighborsClassifier(n_neighbors=50)
clf2=RandomForestClassifier(class_weight='balanced',n_estimators=200)
clf3=RandomForestClassifier(class_weight=None,n_estimators=100)
clf4=RandomForestClassifier(n_estimators=50)
clf5=RandomForestClassifier(n_estimators=10)

Algos=[clf1,clf2,clf3,clf4,clf5]

In [36]:
rows=x_train.shape[0]

In [37]:
rows

26048

In [38]:
layer1=pd.DataFrame({'clf'+str(i):np.zeros(rows) for i in range(1,len(Algos)+1)})

In [39]:
layer1.shape

(26048, 5)

In [40]:
layer1

Unnamed: 0,clf1,clf2,clf3,clf4,clf5
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
26043,0.0,0.0,0.0,0.0,0.0
26044,0.0,0.0,0.0,0.0,0.0
26045,0.0,0.0,0.0,0.0,0.0
26046,0.0,0.0,0.0,0.0,0.0


In [41]:
kf=KFold(n_splits=10)

In [43]:
fold=1
for train,left_out_chunk in kf.split(x_train):
    print('fold number : ', fold)
    
    for i,clf in enumerate(Algos):
        print('Algo number :',i+1)
        
        x_train_train=x_train.loc[train]
        y_train_train=y_train[train]
        
        x_train_left_out_chunk=x_train.loc[left_out_chunk]
        
        
        clf.fit(x_train_train,y_train_train)
        p=clf.predict_proba(x_train_left_out_chunk)[:,1]
        
        layer1.iloc[left_out_chunk,i]=p
        
    fold+=1
    print("*"*50)
    print()
    

fold number :  1
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
**************************************************

fold number :  2
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
**************************************************

fold number :  3
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
**************************************************

fold number :  4
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
**************************************************

fold number :  5
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
**************************************************

fold number :  6
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
**************************************************

fold number :  7
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
*********

In [22]:
layer1

Unnamed: 0,clf1,clf2,clf3,clf4,clf5
0,0.16,0.000,0.01,0.00,0.0
1,0.12,0.000,0.00,0.00,0.0
2,0.10,0.000,0.00,0.00,0.0
3,0.18,0.005,0.01,0.00,0.1
4,0.20,0.000,0.00,0.00,0.0
...,...,...,...,...,...
26043,0.14,0.990,0.98,0.98,1.0
26044,0.12,0.010,0.01,0.00,0.0
26045,0.32,0.955,0.93,0.96,1.0
26046,0.14,0.000,0.00,0.00,0.0


In [44]:
rows=x_test.shape[0]
layer2_test=pd.DataFrame({'clf'+str(i):np.zeros(rows) for i in range(1,len(Algos)+1)})

In [45]:
layer2_test.shape

(6513, 5)

In [46]:
for i,clf in enumerate(Algos):
    
    print( 'Algo number',i+1)
    clf.fit(x_train,y_train)
    p=clf.predict_proba(x_test)[:,1]
    
    layer2_test.iloc[:,i]=p


Algo number 1
Algo number 2
Algo number 3
Algo number 4
Algo number 5


In [26]:
# layer2_test

In [47]:
# second layer linear model 
logr=LogisticRegression(class_weight='balanced')


In [48]:
logr.fit(layer1,y_train)

LogisticRegression(class_weight='balanced')

In [49]:
roc_auc_score(y_test,logr.predict_proba(layer2_test)[:,1])

1.0

In [50]:
xgb2=XGBClassifier(objective='binary:logistic',n_estimators=100,
                   max_depth=3,learning_rate=.1,scale_pos_weight=3)

In [51]:
xgb2.fit(layer1,y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=3, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

In [57]:
roc_auc_score(y_test,xgb2.predict_proba(layer2_test)[:,1])

1.0

In [56]:
xgb2.predict(layer2_test)

array([1, 0, 0, ..., 1, 0, 0])