# Regression Models

In [34]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import mean_squared_error,r2_score,roc_curve,auc
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.ensemble import BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier,AdaBoostRegressor,AdaBoostClassifier, StackingRegressor, StackingClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
import itertools as it
from sklearn.neighbors import KNeighborsRegressor


#Libraries for visualizing trees
from sklearn.tree import export_graphviz 
from six import StringIO
from IPython.display import Image  
import pydotplus

# Linear Regression Libraries
import statsmodels.formula.api as smf
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# MARS
from pyearth import Earth
import time as time

import xgboost as xgb
from sklearn.ensemble import VotingRegressor, VotingClassifier, StackingRegressor, StackingClassifier, GradientBoostingRegressor,GradientBoostingClassifier, BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier,AdaBoostRegressor,AdaBoostClassifier


np.warnings.filterwarnings('ignore')

In [35]:
# original data: train.csv is cleaned and split for modelling below,
# test.csv is the set the Kaggle predictions are produced for
data, validation_data = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [36]:
# data cleaning
# Frequency table of addr_state in the training data (columns: 'state', 'count').
# The column names are set explicitly via rename_axis / reset_index(name=...)
# rather than by renaming whatever reset_index() happens to produce, because
# those default names differ between pandas 1.x and pandas 2.x.
state_freqs = (
    data['addr_state']
    .value_counts()
    .rename_axis('state')
    .reset_index(name='count')
)

# States that each make up fewer than 1% of the training rows; clean() folds
# them into a single 'Other' level.
other_state = state_freqs.loc[state_freqs['count'] < data.shape[0] * .01, 'state'].tolist()

def clean(data, rare_states=None):
    """Return a cleaned copy of a raw loan DataFrame; the input is left untouched.

    Steps applied to the copy:
      * addr_state values listed in ``rare_states`` are replaced by 'Other';
      * earliest_cr_line and last_credit_pull_d are split on '-' into a
        ``<col>_month`` column (first piece, kept as text) and a
        ``<col>_year`` column (second piece, cast to int); the source
        columns are dropped;
      * sub_grade is reduced to its leading capital letter(s) in
        sub_grade_letter, with 'F' and 'G' merged into 'Other'; sub_grade
        is dropped;
      * the id column is dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns addr_state, earliest_cr_line,
        last_credit_pull_d, sub_grade and id.
    rare_states : iterable of str, optional
        addr_state values to collapse into 'Other'. When omitted, the
        notebook-level ``other_state`` list is used.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy, with the same row index as ``data``.
    """
    if rare_states is None:
        # Default to the list derived from the training data at notebook level.
        rare_states = other_state

    df = data.copy()

    # addr_state: 'other' category.
    # A boolean mask works for any row index, unlike looking rows up by the
    # positions 0..n-1, and avoids a Python-level loop over every row.
    is_rare = df['addr_state'].isin(list(rare_states))
    df.loc[is_rare, 'addr_state'] = 'Other'

    # earliest_cr_line, last_credit_pull_d: split month, year (one split per column)
    for date_col in ('earliest_cr_line', 'last_credit_pull_d'):
        pieces = df[date_col].str.split('-', expand=True)
        df[date_col + '_month'] = pieces[0]
        df[date_col + '_year'] = pieces[1].astype(int)

    # sub_grade: remove numeric ratings, group F and G into 'other'
    letters = df['sub_grade'].str.extract('([A-Z]+)', expand=False)
    df['sub_grade_letter'] = letters.mask(letters.isin(['F', 'G']), 'Other')

    # drop the columns that were replaced above, plus 'id'
    df = df.drop(columns=['earliest_cr_line', 'last_credit_pull_d', 'sub_grade', 'id'])

    return df
    


In [37]:
# Run the same cleaning over both raw frames:
# `train` comes from train.csv, `test` is the Kaggle set from test.csv
train, test = (clean(raw_frame) for raw_frame in (data, validation_data))

In [38]:
# response: the column being predicted
y = train['money_made_inv']
# predictors: every remaining column
X = train.drop(columns=['money_made_inv'])

# 70% of the cleaned training rows (rounded down) form the new train set;
# the remaining rows become the held-out test set
n_train_rows = int(train.shape[0] * .7)
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    train_size=n_train_rows,
    random_state=1,
)


In [39]:
# numeric predictors
Xtrain_num = Xtrain.select_dtypes(include=np.number)
Xtest_num = Xtest.select_dtypes(include=np.number)

# categorical predictors that get one-hot encoded
categorical_cols = ['sub_grade_letter', 'term', 'initial_list_status', 'application_type']
Xtrain_cat = Xtrain[categorical_cols]
Xtest_cat = Xtest[categorical_cols]

# numeric and categorical predictors
Xtrain_num_cat = pd.concat([Xtrain_num, pd.get_dummies(Xtrain_cat)], axis=1)
Xtest_num_cat = pd.concat([Xtest_num, pd.get_dummies(Xtest_cat)], axis=1)

# get_dummies only creates columns for the levels present in the frame it is
# given, so the two frames are encoded independently. Force the held-out frame
# onto the training layout: levels absent from it become all-zero columns,
# levels unseen in training are dropped, and the column order matches.
Xtest_num_cat = Xtest_num_cat.reindex(columns=Xtrain_num_cat.columns, fill_value=0)


## Base Models

### 1a) MARS

In [40]:
# MARS base learner (py-earth) configured with max_degree=3 and max_terms=500,
# fitted on the encoded training predictors
model_mars = Earth(max_degree=3, max_terms=500)
model_mars = model_mars.fit(Xtrain_num_cat, ytrain)

# text table of the basis functions and their coefficients
print(model_mars.summary())

Earth Model
-------------------------------------------------------------------------------------
Basis Function                                                 Pruned  Coefficient   
-------------------------------------------------------------------------------------
(Intercept)                                                    No      126530        
h(out_prncp_inv-13815.4)                                       No      9.73563       
h(13815.4-out_prncp_inv)                                       No      -9.15835      
term_36 months                                                 No      -100820       
h(out_prncp_inv-13516.5)*term_36 months                        No      -7.90078      
h(13516.5-out_prncp_inv)*term_36 months                        No      7.459         
h(out_prncp_inv-419.63)                                        No      -10.941       
h(419.63-out_prncp_inv)                                        Yes     None          
loan_amnt*h(out_prncp_inv-419.63)         

In [41]:
# rmse - 590.964
pred_mars = model_mars.predict(Xtest_num_cat)
# Held-out RMSE. Arguments follow scikit-learn's (y_true, y_pred) order; MSE is
# symmetric so the value is unchanged, but the call is now safe to adapt to
# metrics where the order matters.
np.sqrt(mean_squared_error(ytest, pred_mars))

590.9646712607772

### 1b) Random Forest

In [42]:
# Random forest base learner: 450 trees, max_depth 13, max_features 28,
# seeded for reproducibility and using all available cores (n_jobs=-1)
rf_model = RandomForestRegressor(
    n_estimators=450,
    max_depth=13,
    max_features=28,
    random_state=1,
    n_jobs=-1,
)
rf_model = rf_model.fit(Xtrain_num_cat, ytrain)


In [43]:
pred_rf = rf_model.predict(Xtest_num_cat)
# Held-out RMSE of the random forest; arguments in scikit-learn's
# (y_true, y_pred) order (same value, conventional call).
np.sqrt(mean_squared_error(ytest, pred_rf))

857.9485511112829

### 1c) XGBoost

In [44]:
# XGBoost base learner: 1000 boosting rounds of depth-4 trees, learning rate 0.1,
# reg_lambda set to 0.001, fixed seed
model_xgb = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=4,
    reg_lambda=0.001,
    random_state=1,
)

# last expression of the cell, so the fitted estimator's repr is displayed
model_xgb.fit(Xtrain_num_cat, ytrain)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=1, reg_alpha=0,
             reg_lambda=0.001, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [45]:
pred_xgb = model_xgb.predict(Xtest_num_cat)
# Held-out RMSE of the XGBoost model; arguments in scikit-learn's
# (y_true, y_pred) order (same value, conventional call).
np.sqrt(mean_squared_error(ytest, pred_xgb))

632.7679326915154

### Model #1: Stacking Ensemble

In [46]:
# mars, xgb, rf = 497.28828741081094

# Stack the three base learners under a degree-1 MARS meta-learner.
# `cv` is a seeded, shuffled 5-fold splitter so the run is repeatable.
base_learners = [
    ('mars', model_mars),
    ('xgb', model_xgb),
    ('rf', rf_model),
]
en = StackingRegressor(
    estimators=base_learners,
    final_estimator=Earth(max_degree=1, max_terms=500),
    cv=KFold(n_splits=5, shuffle=True, random_state=1),
)

en.fit(Xtrain_num_cat, ytrain)

# Held-out RMSE; metric arguments in scikit-learn's (y_true, y_pred) order
# (MSE is symmetric, so the printed value is unchanged).
print("Ensemble model RMSE = ", np.sqrt(mean_squared_error(ytest, en.predict(Xtest_num_cat))))



Ensemble model RMSE =  497.28828741081094


In [47]:
# Encode the Kaggle set the same way as the training predictors
test_num = test.select_dtypes(include=np.number)

test_cat = test[['sub_grade_letter', 'term',
                 'initial_list_status', 'application_type']]

validation_df = pd.concat([test_num, pd.get_dummies(test_cat)], axis=1)

# get_dummies only emits columns for levels present in this frame, so pin the
# layout to the columns the ensemble was fitted on: missing dummies become 0,
# unseen ones are dropped, and the order matches the training frame.
validation_df = validation_df.reindex(columns=Xtrain_num_cat.columns, fill_value=0)

predictions = pd.DataFrame(en.predict(validation_df), columns=['Predicted'])

# ids come from the raw file because clean() drops the 'id' column
ids = validation_data['id'].to_frame()

# side-by-side join on the row index: id | Predicted
kaggle_predictions = pd.concat([ids, predictions], axis=1)

kaggle_predictions.head()


Unnamed: 0,id,Predicted
0,1,-18.536869
1,2,-5.656887
2,3,-4464.192702
3,4,-75.245004
4,5,-10.361914


In [48]:
# Save the stacking-ensemble submission; index=False keeps the row index out of the file
kaggle_predictions.to_csv(path_or_buf='stacking-pred.csv', index=False)

### Model #2: Weighted Average Ensemble

In [49]:
# Refit the three base learners on the training split, in the order
# MARS -> XGBoost -> random forest; m1/m2/m3 hold what each .fit() call returns
m1, m2, m3 = (
    base_model.fit(Xtrain_num_cat, ytrain)
    for base_model in (model_mars, model_xgb, rf_model)
)


In [50]:
def weight_preds(m1, m2, m3, testing_preds, weights=(.5, .4, .1)):
    """Blend the predictions of three fitted models with fixed weights.

    Parameters
    ----------
    m1, m2, m3 : objects exposing ``predict(X)``
        The models to combine.
    testing_preds : array-like or DataFrame
        Predictor matrix handed unchanged to each model's ``predict``.
    weights : sequence of three numbers, optional
        Multipliers applied to the predictions of ``m1``, ``m2`` and ``m3``
        respectively. Defaults to (0.5, 0.4, 0.1). The weights are used as
        given; they are not normalised.

    Returns
    -------
    The weighted sum ``w1*m1.predict(X) + w2*m2.predict(X) + w3*m3.predict(X)``.

    Raises
    ------
    ValueError
        If ``weights`` does not contain exactly three values.
    """
    if len(weights) != 3:
        raise ValueError("weights must contain exactly three values, one per model")
    w1, w2, w3 = weights
    pred = w1*m1.predict(testing_preds) + w2*m2.predict(testing_preds) + w3*m3.predict(testing_preds)
    return pred


In [51]:
# Weighted-average predictions for the held-out split
pred = weight_preds(m1=m1, m2=m2, m3=m3, testing_preds=Xtest_num_cat)

In [52]:
# weighted average: held-out RMSE, with the metric arguments in scikit-learn's
# (y_true, y_pred) order (same value, conventional call)
np.sqrt(mean_squared_error(ytest, pred))

547.9553200921371

In [53]:
# Encode the Kaggle set the same way as the training predictors
test_num = test.select_dtypes(include=np.number)

test_cat = test[['sub_grade_letter', 'term',
                 'initial_list_status', 'application_type']]

validation_df = pd.concat([test_num, pd.get_dummies(test_cat)], axis=1)

# get_dummies only emits columns for levels present in this frame, so pin the
# layout to the columns the base models were fitted on: missing dummies become
# 0, unseen ones are dropped, and the order matches the training frame.
validation_df = validation_df.reindex(columns=Xtrain_num_cat.columns, fill_value=0)

weighted_preds = weight_preds(m1, m2, m3, validation_df)

In [54]:
# weighted_preds was computed in the previous cell. The copy-pasted rebuild of
# test_num / test_cat / validation_df that used to sit here was never used in
# this cell, so it only packages the predictions.
predictions = pd.DataFrame(weighted_preds, columns = ['Predicted'])

# ids come from the raw file because clean() drops the 'id' column
ids = validation_data['id'].to_frame()

# side-by-side join on the row index: id | Predicted
kaggle_predictions_wa = pd.concat([ids, predictions], axis = 1)

kaggle_predictions_wa



Unnamed: 0,id,Predicted
0,1,-0.837488
1,2,6.298033
2,3,-4206.155375
3,4,-32.640288
4,5,3.720953
...,...,...
3813,3814,-33599.427298
3814,3815,-29.276021
3815,3816,-8301.479755
3816,3817,-3746.519640


In [55]:
# Save the weighted-average submission; index=False keeps the row index out of the file
kaggle_predictions_wa.to_csv(path_or_buf='weighted-pred-1.csv', index=False)