# Final Kaggle Models

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import mean_squared_error,r2_score,roc_curve,auc
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.ensemble import BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier,AdaBoostRegressor,AdaBoostClassifier, StackingRegressor, StackingClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
import itertools as it
from sklearn.neighbors import KNeighborsRegressor


#Libraries for visualizing trees
from sklearn.tree import export_graphviz 
from six import StringIO
from IPython.display import Image  
import pydotplus

# Linear Regression Libraries
import statsmodels.formula.api as smf
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# MARS
from pyearth import Earth
import time as time

import xgboost as xgb
from sklearn.ensemble import VotingRegressor, VotingClassifier, StackingRegressor, StackingClassifier, GradientBoostingRegressor,GradientBoostingClassifier, BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier,AdaBoostRegressor,AdaBoostClassifier


# Silence library warnings for the whole notebook. This goes through the
# standard-library module directly: `np.warnings` was never a public NumPy
# attribute and is no longer available in recent NumPy releases.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# original data: read the train and test CSVs exactly as provided
data, validation_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# data cleaning
# Row count per state, as a frame with columns `state` and `count`.
# `rename_axis` + `reset_index(name=...)` fixes both column names explicitly,
# so the result does not depend on how the installed pandas version names the
# output of `value_counts()` (the naming changed in pandas 2.0).
state_freqs = data['addr_state'].value_counts().rename_axis('state').reset_index(name='count')
# States that account for fewer than 1% of the rows in `data`.
other_state = state_freqs.loc[state_freqs['count'] < data.shape[0] * .01, 'state'].tolist()

def clean(data, rare_states=None):
    """Return a cleaned copy of a raw loan frame; the input is not modified.

    Steps, in order:
      * `addr_state` values listed in `rare_states` are replaced by 'Other'.
      * `earliest_cr_line` and `last_credit_pull_d` are split on '-' into a
        `<column>_month` column (text before the first '-') and an integer
        `<column>_year` column (text after it).
      * `sub_grade` is reduced to its first run of capital letters in
        `sub_grade_letter`, with 'F' and 'G' merged into 'Other'.
      * `earliest_cr_line`, `last_credit_pull_d`, `sub_grade` and `id` are
        dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns `id`, `addr_state`, `earliest_cr_line`,
        `last_credit_pull_d` and `sub_grade`.
    rare_states : iterable of str, optional
        State codes to collapse into 'Other'. When omitted, the module-level
        `other_state` list is used, which keeps `clean(data)` calls working
        as before.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as `data`; the remaining original
        columns come first, followed by the five derived columns.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If a year part cannot be converted to an integer.
    """
    if rare_states is None:
        rare_states = other_state

    df = data.copy()

    # addr_state: 'other' category. A boolean mask handles any index labels
    # (the frame does not need a 0..n-1 RangeIndex) and avoids a Python-level
    # loop over every row.
    is_rare = df['addr_state'].isin(list(rare_states))
    df.loc[is_rare, 'addr_state'] = 'Other'

    # earliest_cr_line / last_credit_pull_d: split month, year.
    # Each column is split once and both parts are reused.
    for date_col in ('earliest_cr_line', 'last_credit_pull_d'):
        parts = df[date_col].str.split('-', expand=True)
        df[date_col + '_month'] = parts[0]
        df[date_col + '_year'] = parts[1].astype(int)

    # sub_grade: remove numeric ratings, group F and G into 'other'
    letters = df['sub_grade'].str.extract('([A-Z]+)', expand=False)
    df['sub_grade_letter'] = letters.mask(letters.isin(['F', 'G']), 'Other')

    # Drop the raw source columns and the 'id' column in a single pass.
    return df.drop(columns=['earliest_cr_line', 'last_credit_pull_d', 'sub_grade', 'id'])
    


In [4]:
# Run the same cleaning on both raw frames:
# `train` comes from train.csv, `test` is the Kaggle file.
train, test = (clean(raw_frame) for raw_frame in (data, validation_data))

In [5]:
# Response column, and every remaining column as a predictor.
y = train['money_made_inv']
X = train.drop(columns='money_made_inv')

# Hold-out split: 70% of the rows (given as an absolute row count) form the
# new training set, the remaining 30% the new test set.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    train_size=int(0.7 * len(train)),
    random_state=1,
)


In [6]:
# numeric predictors
Xtrain_num = Xtrain.select_dtypes(include=np.number)
Xtest_num = Xtest.select_dtypes(include=np.number)

# categorical predictors (the test split reuses the train column list so the
# two selections cannot drift apart)
Xtrain_cat = Xtrain[['sub_grade_letter', 'term', 'initial_list_status', 'application_type']]
Xtest_cat = Xtest[Xtrain_cat.columns]

# numeric and categorical predictors
Xtrain_num_cat = pd.concat([Xtrain_num, pd.get_dummies(Xtrain_cat)], axis=1)
Xtest_num_cat = pd.concat([Xtest_num, pd.get_dummies(Xtest_cat)], axis=1)

# The dummies are built separately per split, so a category level that is
# absent from one split would leave the two matrices with different columns.
# Align the test matrix to the training columns: levels unseen in the test
# split become all-zero columns, levels unseen in training are dropped, and
# the column order matches what the models are fitted on.
Xtest_num_cat = Xtest_num_cat.reindex(columns=Xtrain_num_cat.columns, fill_value=0)


## XGBoost

In [16]:
# Exhaustive grid search with 5-fold cross validation over the XGBoost
# hyper-parameters below; the cell also reports its own wall-clock time.

start_time = time.time()

# 3 * 4 * 3 * 5 = 180 candidate combinations
param_grid = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    reg_lambda=[0, 0.01, 0.001],
    n_estimators=[150, 175, 250, 500, 1000],
)

# Shuffled folds with a fixed seed so every candidate sees the same splits.
cv = KFold(n_splits=5, shuffle=True, random_state=1)

# `fit` returns the fitted search object itself, so it can be chained.
optimal_params = GridSearchCV(
    estimator=xgb.XGBRegressor(random_state=1),
    param_grid=param_grid,
    cv=cv,
    n_jobs=-1,
    verbose=1,
).fit(Xtrain_num_cat, ytrain)

print("Optimal parameter values =", optimal_params.best_params_)
print("Optimal cross validation R-squared = ",optimal_params.best_score_)
print("Time taken = ", (time.time()-start_time)/60, " minutes")




Fitting 5 folds for each of 180 candidates, totalling 900 fits
Optimal parameter values = {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 1000, 'reg_lambda': 0.001}
Optimal cross validation R-squared =  0.9910258596256222
Time taken =  57.10975794792175  minutes


In [20]:
# Tabulate every grid-search candidate, then show the five with the highest
# mean cross-validated score.
cv_results = pd.DataFrame(optimal_params.cv_results_)
cv_results.sort_values('mean_test_score', ascending=False).iloc[:5]


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,param_reg_lambda,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
119,55.065593,0.175879,0.018372,0.002253,0.1,4,1000,0.001,"{'learning_rate': 0.1, 'max_depth': 4, 'n_esti...",0.993847,0.99549,0.989534,0.991368,0.98489,0.991026,0.003684,1
87,71.154357,0.512047,0.02118,0.004873,0.05,5,1000,0.0,"{'learning_rate': 0.05, 'max_depth': 5, 'n_est...",0.992719,0.995576,0.989321,0.991544,0.985781,0.990988,0.003294,2
132,67.723359,0.3198,0.020447,0.004498,0.1,5,1000,0.0,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.993041,0.995519,0.989604,0.991143,0.985379,0.990937,0.00341,3
89,67.5428,0.377078,0.019593,0.002623,0.05,5,1000,0.001,"{'learning_rate': 0.05, 'max_depth': 5, 'n_est...",0.992829,0.995603,0.989247,0.991781,0.985169,0.990926,0.003527,4
129,33.079133,0.797477,0.016345,0.003737,0.1,5,500,0.0,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.993032,0.995495,0.989526,0.99111,0.985353,0.990903,0.003417,5


Note on the number of trees: the results flatten out beyond 100 trees.

In [18]:
# Refit on the training split with the winner of the hour-long grid search:
# learning_rate=0.1, max_depth=4, n_estimators=1000, reg_lambda=0.001
model_xgb = xgb.XGBRegressor(
    learning_rate=0.1,
    max_depth=4,
    n_estimators=1000,
    reg_lambda=0.001,
    random_state=1,
)

model_xgb.fit(Xtrain_num_cat, ytrain)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=1, reg_alpha=0,
             reg_lambda=0.001, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [19]:
# Hold-out RMSE of the model fitted above (originally annotated "632??").
pred_xgb = model_xgb.predict(Xtest_num_cat)
np.sqrt(mean_squared_error(y_true=ytest, y_pred=pred_xgb))

632.7679326915154

In [23]:
# Alternative candidate: deeper trees with a smaller learning rate
# (learning_rate=0.05, max_depth=5, n_estimators=1000, reg_lambda=0.001).
model_xgb = xgb.XGBRegressor(
    learning_rate=0.05,
    max_depth=5,
    n_estimators=1000,
    reg_lambda=0.001,
    random_state=1,
)

model_xgb.fit(Xtrain_num_cat, ytrain)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.05, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=1, reg_alpha=0,
             reg_lambda=0.001, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [24]:
# Hold-out RMSE of the most recently fitted `model_xgb`.
pred_xgb = model_xgb.predict(Xtest_num_cat)
np.sqrt(mean_squared_error(y_true=ytest, y_pred=pred_xgb))

673.0934735658578

In [12]:
# Candidate outside the searched grid: max_depth=6 with only 250 trees,
# learning_rate=0.05 and no L2 penalty (reg_lambda=0).
model_xgb = xgb.XGBRegressor(
    learning_rate=0.05,
    max_depth=6,
    n_estimators=250,
    reg_lambda=0,
    random_state=1,
)

model_xgb.fit(Xtrain_num_cat, ytrain)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.05, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=250, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=1, reg_alpha=0,
             reg_lambda=0, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [13]:
# Hold-out RMSE of the most recently fitted `model_xgb`.
pred_xgb = model_xgb.predict(Xtest_num_cat)
np.sqrt(mean_squared_error(y_true=ytest, y_pred=pred_xgb))

750.2334694688906