In [None]:
import pandas as pd
import numpy as np
import ml_utils as mt
from sklearn.ensemble import GradientBoostingRegressor,GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from xgboost.sklearn import XGBClassifier,XGBRegressor
from catboost import CatBoostRegressor

In [None]:
# Loan data for the regression examples; 'Interest.Rate' from this frame is the target.
ld_train=pd.read_csv('loan_data_train.csv')

In [None]:
def dtr(orig_col):
    """Turn a column of percentage strings (e.g. '14.3%') into numbers.

    Every '%' character is removed and the remainder is parsed with
    ``pd.to_numeric``; entries that still cannot be parsed become NaN.
    """
    without_pct = orig_col.str.replace('%', '')
    return pd.to_numeric(without_pct, errors='coerce')

def fico(orig_col):
    """Replace a 'low-high' range string with the midpoint of the two bounds.

    The string is split on '-'; each side is parsed with ``pd.to_numeric``
    (unparseable pieces become NaN, which makes the midpoint NaN as well).
    """
    bounds = orig_col.str.split('-', expand=True)
    lower = pd.to_numeric(bounds[0], errors='coerce')
    upper = pd.to_numeric(bounds[1], errors='coerce')
    return 0.5 * (lower + upper)

def el(orig_col):
    """Convert employment-length labels to a number of years.

    '10+ years' becomes 10 and '< 1 year' becomes 0; for everything else the
    words 'years' / 'year' are dropped before numeric parsing. Labels that
    are still not numeric afterwards become NaN.
    """
    cleaned = (
        orig_col
        .str.replace('10+ years', '10', regex=False)  # literal match: '+' must not act as a regex quantifier
        .str.replace('< 1 year', '0', regex=False)
        .str.replace('years', '')
        .str.replace('year', '')
    )
    return pd.to_numeric(cleaned, errors='coerce')

# Column groups for the loan data; each list is passed to the DataPipe
# keyword argument of the matching name.
cat_to_dummies_cols = ['Loan.Length', 'Loan.Purpose', 'State', 'Home.Ownership']
cat_to_num_cols = ['Amount.Requested', 'Open.CREDIT.Lines', 'Revolving.CREDIT.Balance']
simple_num_cols = ['Monthly.Income', 'Inquiries.in.the.Last.6.Months']
# Columns needing bespoke parsing, mapped to the converter functions defined above.
custom_func_dict_cols = {
    'Debt.To.Income.Ratio': dtr,
    'FICO.Range': fico,
    'Employment.Length': el,
}

data_pipe = mt.DataPipe(
    cat_to_dummies=cat_to_dummies_cols,
    cat_to_num=cat_to_num_cols,
    simple_num=simple_num_cols,
    custom_func_dict=custom_func_dict_cols,
)

data_pipe.fit(ld_train)

# Feature matrix and target for the regression models.
x_train_reg = data_pipe.transform(ld_train)
# Target: interest rate with '%' removed, cast to float (raises on non-numeric text).
y_train_reg = (
    ld_train['Interest.Rate']
    .str.replace('%', '')
    .astype(float)
)

In [None]:
# Shape of the transformed regression training data.
x_train_reg.shape

In [None]:
# Candidate values sampled by the randomized search for the GBM regressor.
params_gbm = dict(
    n_estimators=[50, 100, 150, 200, 250],
    learning_rate=[.01, .05, 0.1, 0.4, 0.8],  # keep 0 < learning_rate < 1
    max_depth=[1, 2, 3, 4, 5, 6],
    # min_samples_split / min_samples_leaf are deliberately not tuned:
    # with the depth kept this low they might not matter.
    subsample=[0.5, 0.6, 0.7, 0.8, 0.9, 1],
    # NOTE(review): 48 presumably equals the column count of x_train_reg
    # (see the shape cell above) - confirm if the pipeline changes.
    max_features=[5, 10, 15, 20, 30, 40, 48],
)

In [None]:
# Base estimator for the search. The seed is fixed because the search tries
# subsample < 1 and max_features < n_features, both of which make fitting
# stochastic; 42 matches the seed used for the CatBoost model in this notebook.
reg_gbm=GradientBoostingRegressor(random_state=42)

In [None]:
# Randomized search over params_gbm: 50 sampled settings, each scored with
# 10-fold CV on negated mean absolute error (the search maximises the score).
# random_state pins which settings are sampled so reruns are comparable.
rs_gbm=RandomizedSearchCV(reg_gbm,
                         scoring='neg_mean_absolute_error',
                         param_distributions=params_gbm,
                         cv=10,
                         n_iter=50,
                         n_jobs=-1,
                         random_state=42)

In [None]:
# Run the search configured on rs_gbm (n_iter=50 settings, cv=10 folds each).
rs_gbm.fit(x_train_reg,y_train_reg)

In [None]:
# Summarise the search results via ml_utils.report; the 5 is presumably the
# number of top-ranked settings to show - confirm against ml_utils.
mt.report(rs_gbm.cv_results_,5)

In [None]:
# These parameters were the best in the author's quick run of the search;
# your own run may rank a different combination first.
# random_state is fixed because subsample=0.5 makes the fit stochastic.
reg_gbm_final=GradientBoostingRegressor(subsample=0.5,
                                        n_estimators=150,
                                        max_features=48,
                                        max_depth=4,
                                        learning_rate=0.05,
                                        random_state=42)

In [None]:
# Fit the chosen configuration on the full training data.
reg_gbm_final.fit(x_train_reg,y_train_reg)

In [None]:
# In-sample predictions: these are the same rows the model was fitted on.
reg_gbm_final.predict(x_train_reg)

# GBM for classification

In [None]:
# Data for the classification examples; the target is derived from 'Revenue.Grid'.
bd_train=pd.read_csv('bd_train.csv')

In [None]:
def children_to_num(col):
    """Convert the children-count labels to numbers.

    'Zero' is rewritten to '0' and '4+' to '4' before numeric parsing;
    any label that is still not numeric becomes NaN.
    """
    cleaned = (
        col
        .str.replace('Zero', '0')
        .str.replace('4+', '4', regex=False)  # literal '+', not a regex quantifier
    )
    return pd.to_numeric(cleaned, errors='coerce')

def ab_to_num(col):
    """Replace an age band such as '18-21' with the midpoint of its bounds.

    The open-ended band '71+' is first rewritten to '71-71', so it maps to 71.
    Bands whose pieces cannot be parsed as numbers yield NaN.
    """
    bands = col.str.replace('71+', '71-71', regex=False)
    bounds = bands.str.split('-', expand=True)
    lower = pd.to_numeric(bounds[0], errors='coerce')
    upper = pd.to_numeric(bounds[1], errors='coerce')
    return 0.5 * (lower + upper)

def fi_to_num(col):
    """Map family-income band labels to a single representative number.

    Two-sided bands map to the midpoint of their bounds; the one-sided bands
    '< 4,000' and '>=35,000' map to the bound itself. Labels not listed here
    go through ``pd.to_numeric`` and become NaN when they are not numeric.
    """
    band_values = {
        '<10,000, >= 8,000': 9000,
        '>=35,000': 35000,
        '<25,000, >=22,500': 23750,
        '<20,000, >=17,500': 18750,
        '<12,500, >=10,000': 11250,
        '<30,000, >=27,500': 28750,
        '<27,500, >=25,000': 26250,
        '<17,500, >=15,000': 16250,
        '<15,000, >=12,500': 13750,
        '<22,500, >=20,000': 21250,
        '< 4,000': 4000,
        '< 8,000, >= 4,000': 6000,
    }
    return pd.to_numeric(col.replace(band_values), errors='coerce')

# Columns passed to DataPipe as simple_num.
simple_numeric_cols = [
    'year_last_moved',
    'Average.Credit.Card.Transaction',
    'Balance.Transfer',
    'Term.Deposit',
    'Life.Insurance',
    'Medical.Insurance',
    'Average.A.C.Balance',
    'Personal.Loan',
    'Investment.in.Mutual.Fund',
    'Investment.Tax.Saving.Bond',
    'Home.Loan',
    'Online.Purchase.Amount',
    'Investment.in.Commudity',
    'Investment.in.Equity',
    'Investment.in.Derivative',
    'Portfolio.Balance',
]

# NOTE: this reassigns cat_to_dummies_cols (previously the loan-data columns).
cat_to_dummies_cols = [
    'status',
    'occupation',
    'occupation_partner',
    'home_status',
    'self_employed',
    'self_employed_partner',
    'TVarea',
    'gender',
    'region',
]

# Columns needing bespoke parsing, mapped to the converter functions defined above.
custom_function_cols = {
    'children': children_to_num,
    'age_band': ab_to_num,
    'family_income': fi_to_num,
}

# NOTE: data_pipe is rebound here; the loan-data pipeline object is no longer reachable by this name.
data_pipe = mt.DataPipe(
    simple_num=simple_numeric_cols,
    cat_to_dummies=cat_to_dummies_cols,
    custom_func_dict=custom_function_cols,
)

data_pipe.fit(bd_train)

x_train_cls = data_pipe.transform(bd_train)

# Binary target: 1 where 'Revenue.Grid' equals 1, else 0.
y_train_cls = bd_train['Revenue.Grid'].eq(1).astype(int)

In [None]:
# Shape of the transformed classification training data.
x_train_cls.shape

In [None]:
# Candidate values for the GBM classifier search.
# NOTE: this rebinds params_gbm, replacing the regression grid defined earlier.
params_gbm = {
    'n_estimators': [50, 100, 150, 200, 250, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.4, 0.8],  # keep 0 < learning_rate < 1
    'max_depth': list(range(1, 7)),
    # min_samples_split / min_samples_leaf are deliberately not tuned:
    # with the depth kept this low they might not matter.
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    # NOTE(review): 71 presumably equals the column count of x_train_cls
    # (see the shape cell above) - confirm if the pipeline changes.
    'max_features': [5, 10, 15, 20, 30, 40, 50, 60, 71],
}

In [None]:
# Base classifier for the search. The seed is fixed because the search tries
# subsample < 1 and max_features < n_features, both of which make fitting
# stochastic; 42 matches the seed used for the CatBoost model in this notebook.
cls_gbm=GradientBoostingClassifier(random_state=42)

In [None]:
# Randomized search over params_gbm for the classifier: 10 sampled settings,
# each scored with 10-fold CV on ROC AUC.
# random_state pins which settings are sampled so reruns are comparable.
rs_gbm_cl=RandomizedSearchCV(cls_gbm,
                            scoring='roc_auc',
                            param_distributions=params_gbm,
                            cv=10,
                            n_iter=10,
                            n_jobs=-1,
                            random_state=42)

In [None]:
# Run the classifier search configured on rs_gbm_cl (n_iter=10 settings, cv=10 folds each).
rs_gbm_cl.fit(x_train_cls,y_train_cls)

In [None]:
# Summarise the classifier search via ml_utils.report; the 5 is presumably the
# number of top-ranked settings to show - confirm against ml_utils.
mt.report(rs_gbm_cl.cv_results_,5)

In [None]:
# Training the final model (as done for the regressor above) is left as an exercise.

# XGB for Regression

In [None]:
# Candidate values sampled by the randomized search for the XGBoost regressor.
params_xgb = {
    # In a first run do not go beyond 0.5; widen later if needed.
    'learning_rate': [0.01, 0.05, 0.1, 0.3, 0.4, 0.5],
    'n_estimators': [50, 100, 150, 200, 300, 400],

    # Minimum loss reduction required to split a leaf further (any value >= 0);
    # larger values make the trees more conservative.
    'gamma': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    # Start with smaller depths.
    'max_depth': [2, 3, 4, 5, 6, 7, 8],
    # Minimum sum of instance weights needed in a child; plays a role similar
    # to the minimum number of observations in a node for GBM.
    'min_child_weight': [1, 2, 5, 10],

    # Fraction of observations sampled for each tree.
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    # Fraction of features sampled for each tree.
    'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    # Fraction of features sampled at each depth level; applied on top of the
    # features already selected for the tree.
    'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],

    # L2 and L1 regularisation on the leaf weights.
    'reg_lambda': [1e-5, .001, 0.1, 1, 10, 100],
    'reg_alpha': [1e-5, .001, 0.1, 1, 10, 100],
    # Cap on the size of each leaf's output (weight update) per tree.
    'max_delta_step': [0.1, 0.5, 1, 2, 5],
}

In [None]:
# Base XGBoost regressor for the search. The seed is fixed because the search
# tries row and column subsampling (subsample / colsample_* < 1), which makes
# fitting stochastic; 42 matches the seed used for the CatBoost model below.
reg_xgb=XGBRegressor(objective='reg:squarederror',random_state=42)

In [None]:
# Randomized search over params_xgb: 50 sampled settings, each scored with
# 10-fold CV on negated mean absolute error (the search maximises the score).
# random_state pins which settings are sampled so reruns are comparable.
rs_xgb=RandomizedSearchCV(reg_xgb,
                         scoring='neg_mean_absolute_error',
                         param_distributions=params_xgb,
                         cv=10,
                         n_iter=50,
                         n_jobs=-1,
                         random_state=42)

In [None]:
# Run the XGBoost search on the same loan-data features and target used for the GBM regressor.
rs_xgb.fit(x_train_reg,y_train_reg)

In [None]:
# Summarise the XGBoost search via ml_utils.report; the 5 is presumably the
# number of top-ranked settings to show - confirm against ml_utils.
mt.report(rs_xgb.cv_results_,5)

# Catboost Regressor

In [None]:
# For CatBoost the categorical columns are left as they are; CatBoost handles
# them internally (hence for_catboost=True below).
# The column lists are re-declared because cat_to_dummies_cols was reassigned
# to the bd-data columns in the classification section.
cat_to_dummies_cols = ['Loan.Length', 'Loan.Purpose', 'State', 'Home.Ownership']
cat_to_num_cols = ['Amount.Requested', 'Open.CREDIT.Lines', 'Revolving.CREDIT.Balance']
simple_num_cols = ['Monthly.Income', 'Inquiries.in.the.Last.6.Months']
custom_func_dict_cols = {
    'Debt.To.Income.Ratio': dtr,
    'FICO.Range': fico,
    'Employment.Length': el,
}

cat_pipe = mt.DataPipe(
    cat_to_dummies=cat_to_dummies_cols,
    cat_to_num=cat_to_num_cols,
    simple_num=simple_num_cols,
    custom_func_dict=custom_func_dict_cols,
    for_catboost=True,
)

In [None]:
# Fit the CatBoost-flavoured pipeline on the loan data.
cat_pipe.fit(ld_train)

In [None]:
# Features for CatBoost; the target stays y_train_reg from the regression section.
x_train_reg_cb=cat_pipe.transform(ld_train)

In [None]:
# Candidate values sampled by the randomized search for the CatBoost regressor.
params_cat_reg = dict(
    learning_rate=[0.01, 0.05, 0.1, 0.3, 0.4, 0.5],
    depth=[2, 3, 4, 5, 6, 7, 8],
    l2_leaf_reg=[1e-5, 0.001, 0.1, 1, 10, 100],
    iterations=[50, 100, 150, 200, 300, 400],
    bagging_temperature=[0, 0.25, 0.5, 1, 2, 5],
    random_strength=[1e-9, 1e-5, 0.001, 0.1, 1, 10],
    rsm=[0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    # NOTE(review): CatBoost documents min_data_in_leaf for the Depthwise and
    # Lossguide grow policies only; no grow_policy is set in this notebook -
    # confirm this parameter has any effect here.
    min_data_in_leaf=[1, 2, 5, 10],
    max_bin=[128, 254, 512],
)

In [None]:
# CatBoost regressor optimising MAE directly, silent, with a fixed seed.
reg_cat = CatBoostRegressor(
    loss_function='MAE',
    verbose=0,
    random_state=42
)

# Randomized search over params_cat_reg: 50 sampled settings, each scored with
# 10-fold CV on negated mean absolute error. random_state pins which settings
# are sampled, matching the seed already fixed on the estimator.
rs_cat = RandomizedSearchCV(
    estimator=reg_cat,
    param_distributions=params_cat_reg,
    scoring='neg_mean_absolute_error',
    cv=10,
    n_iter=50,
    n_jobs=-1,
    random_state=42
)

# cat_features is forwarded by the search to CatBoostRegressor.fit so the listed
# columns are treated as categorical. This relies on cat_to_dummies_cols holding
# the loan-data columns re-declared in the CatBoost pipeline cell (the name is
# also used for the bd-data columns earlier in the notebook).
rs_cat.fit(x_train_reg_cb, y_train_reg, cat_features=cat_to_dummies_cols)

In [None]:
# Summarise the CatBoost search via ml_utils.report; the 5 is presumably the
# number of top-ranked settings to show - confirm against ml_utils.
mt.report(rs_cat.cv_results_,5)

In [None]:
# Similar mechanics can be followed for classification problems as well; that is left as an exercise for students.