In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import mean_squared_error,r2_score,roc_curve,auc
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.ensemble import BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier,AdaBoostRegressor,AdaBoostClassifier, StackingRegressor, StackingClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
import itertools as it
from sklearn.neighbors import KNeighborsRegressor


#Libraries for visualizing trees
from sklearn.tree import export_graphviz 
from six import StringIO
from IPython.display import Image  
import pydotplus

# Linear Regression Libraries
import statsmodels.formula.api as smf
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# MARS
from pyearth import Earth
import time as time

import xgboost as xgb
from sklearn.ensemble import VotingRegressor, VotingClassifier, StackingRegressor, StackingClassifier, GradientBoostingRegressor,GradientBoostingClassifier, BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier,AdaBoostRegressor,AdaBoostClassifier


# Silence library warnings for the whole notebook.
# `np.warnings` was only an accidental alias of the standard-library module
# and is not available in current NumPy releases, so use `warnings` directly.
import warnings

warnings.filterwarnings('ignore')

In [3]:
# Raw inputs, read from the working directory.
# `data` is the labelled frame used for modelling; `validation_data` is the
# second file (tagged "kaggle" where it is cleaned further down).
TRAIN_CSV, TEST_CSV = 'train.csv', 'test.csv'
data = pd.read_csv(TRAIN_CSV)
validation_data = pd.read_csv(TEST_CSV)

In [4]:
# data cleaning
# Frequency table of `addr_state` with explicit column names ('state', 'count').
# Naming the index and the value column explicitly avoids depending on what
# `value_counts().reset_index()` calls them, which differs between pandas
# versions.
state_freqs = (
    data['addr_state']
    .value_counts()
    .rename_axis('state')
    .reset_index(name='count')
)
# States that make up less than 1% of the rows; `clean` folds these into 'Other'.
other_state = state_freqs.loc[state_freqs['count'] < data.shape[0] * .01, 'state'].tolist()

def clean(data, rare_states=None):
    """Return a cleaned copy of a raw loan frame; the input is not modified.

    Steps applied:
      * `addr_state` values listed in `rare_states` are replaced by 'Other'.
      * `earliest_cr_line` and `last_credit_pull_d` are split on '-' into a
        `<col>_month` column (first fragment, kept as text) and a `<col>_year`
        column (second fragment, cast to int); the original columns are dropped.
      * `sub_grade` is reduced to its leading capital letter(s) in
        `sub_grade_letter`, with 'F' and 'G' merged into 'Other'; `sub_grade`
        is dropped.
      * `id` is dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain `addr_state`, `earliest_cr_line`, `last_credit_pull_d`,
        `sub_grade` and `id`. Any index is accepted.
    rare_states : list-like of str, optional
        State codes to collapse into 'Other'. When omitted, the module-level
        `other_state` list is used.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index as `data`.

    Notes
    -----
    The year cast uses `astype(int)`, so a missing or non-numeric year
    fragment raises instead of being silently coerced.
    """
    if rare_states is None:
        rare_states = other_state

    df = data.copy()

    # addr_state: 'other' category. A boolean mask works for any index,
    # unlike positional row-by-row lookups.
    df.loc[df['addr_state'].isin(rare_states), 'addr_state'] = 'Other'

    # earliest_cr_line / last_credit_pull_d: split month, year (one split each)
    date_cols = ['earliest_cr_line', 'last_credit_pull_d']
    for col in date_cols:
        parts = df[col].str.split('-', expand=True)
        df[col + '_month'] = parts[0]
        df[col + '_year'] = parts[1].astype(int)

    # sub_grade: remove numeric ratings, group F and G into 'other'
    letters = df['sub_grade'].str.extract('([A-Z]+)', expand=False)
    df['sub_grade_letter'] = letters.mask(letters.isin(['F', 'G']), 'Other')

    # drop the consumed source columns and the 'id' column
    return df.drop(columns=date_cols + ['sub_grade', 'id'])
    


In [5]:
# Run both raw frames through the same cleaning function.
train = data.pipe(clean)             # labelled rows used for modelling
test = validation_data.pipe(clean)   # kaggle file

In [6]:
# Response column, and every other cleaned column as a candidate predictor.
target_col = 'money_made_inv'
y = train[target_col]
X = train.drop(columns=[target_col])

# Hold out 30% of the labelled rows: the training size is passed as an
# absolute row count (70% of the rows, truncated to an int), with a fixed
# seed so the split is reproducible.
n_train_rows = int(train.shape[0] * .7)
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, train_size=n_train_rows, random_state=1
)


In [7]:
# numeric predictors
Xtrain_num = Xtrain.select_dtypes(include=np.number)
Xtest_num = Xtest.select_dtypes(include=np.number)

# categorical predictors (only these four are one-hot encoded)
cat_cols = ['sub_grade_letter', 'term', 'initial_list_status', 'application_type']
Xtrain_cat = Xtrain[cat_cols]
Xtest_cat = Xtest[cat_cols]

# numeric and categorical predictors
Xtrain_num_cat = pd.concat([Xtrain_num, pd.get_dummies(Xtrain_cat)], axis=1)
# The two splits are encoded independently, so a category level that is absent
# from one of them would give the matrices different columns. Reindexing to
# the training columns adds any missing dummy as all-zero, drops levels the
# model never saw, and fixes the column order; it is a no-op when the columns
# already agree.
Xtest_num_cat = (
    pd.concat([Xtest_num, pd.get_dummies(Xtest_cat)], axis=1)
    .reindex(columns=Xtrain_num_cat.columns, fill_value=0)
)


# AdaBoost

In [11]:
# Exhaustive search over ensemble size, learning rate and base-tree depth
# for an AdaBoost regressor (3 x 5 x 3 = 45 candidate settings).
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator`; confirm the keyword against the installed version.
model = AdaBoostRegressor(random_state=1)
grid = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
    'base_estimator': [DecisionTreeRegressor(max_depth=depth) for depth in (5, 10, 15)],
}

# 5-fold shuffled CV with a fixed seed; candidates are scored by negative MSE,
# so values closer to zero are better.
cv = KFold(n_splits=5, shuffle=True, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
)

# Run the search on the training design matrix.
grid_result = grid_search.fit(Xtrain_num_cat, ytrain)

# Report the winning configuration and its mean CV score.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Per-candidate mean score, score spread and parameter dict, kept for inspection.
means, stds, params = (
    grid_result.cv_results_[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)

Best: -668132.332214 using {'base_estimator': DecisionTreeRegressor(max_depth=15), 'learning_rate': 1.0, 'n_estimators': 500}


In [14]:
# Tabulate every grid-search candidate, best (least negative MSE) first.
# NOTE(review): the saved search output reports max_depth=15, learning_rate=1.0,
# n_estimators=500 as the best configuration, whereas the earlier note in this
# cell named learning_rate=.01, n_estimators=500, max_depth=10 as the chosen
# settings -- confirm which one is intended.
cv_results = pd.DataFrame(grid_result.cv_results_)
cv_results_ranked = cv_results.sort_values('mean_test_score', ascending=False)
cv_results_ranked

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_base_estimator,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
44,35.284385,1.40932,0.404194,0.016356,DecisionTreeRegressor(max_depth=15),1.0,500,{'base_estimator': DecisionTreeRegressor(max_d...,-522747.5,-336900.4,-700613.5,-792247.5,-988152.8,-668132.3,223483.340215,1
42,9.984785,0.654362,0.118767,0.014342,DecisionTreeRegressor(max_depth=15),1.0,100,{'base_estimator': DecisionTreeRegressor(max_d...,-547609.0,-324029.9,-737783.5,-819089.9,-974739.3,-680650.3,225298.076586,2
43,18.527259,0.28458,0.216166,0.027287,DecisionTreeRegressor(max_depth=15),1.0,200,{'base_estimator': DecisionTreeRegressor(max_d...,-533248.0,-346153.8,-722352.8,-804566.2,-1002178.0,-681699.8,225425.902902,3
29,42.134973,0.183349,0.606737,0.060186,DecisionTreeRegressor(max_depth=10),1.0,500,{'base_estimator': DecisionTreeRegressor(max_d...,-533477.0,-356923.1,-691383.4,-787024.7,-1046737.0,-683109.1,233186.285698,4
28,17.519178,0.523208,0.205984,0.021753,DecisionTreeRegressor(max_depth=10),1.0,200,{'base_estimator': DecisionTreeRegressor(max_d...,-533623.5,-352672.3,-675192.1,-808892.8,-1047892.0,-683654.5,236847.51039,5
25,22.323429,1.346329,0.273777,0.078706,DecisionTreeRegressor(max_depth=10),0.1,200,{'base_estimator': DecisionTreeRegressor(max_d...,-546493.4,-422544.0,-721668.8,-785018.6,-950902.0,-685325.4,184455.364434,6
38,72.967605,0.141594,0.486264,0.024437,DecisionTreeRegressor(max_depth=15),0.01,500,{'base_estimator': DecisionTreeRegressor(max_d...,-524343.0,-422554.3,-662487.1,-904809.4,-947860.3,-692410.8,206073.931429,7
26,52.207444,0.443498,0.59804,0.050157,DecisionTreeRegressor(max_depth=10),0.1,500,{'base_estimator': DecisionTreeRegressor(max_d...,-559457.2,-442174.4,-724462.3,-734114.1,-1011281.0,-694297.7,192209.053361,8
24,10.928781,0.281483,0.097102,0.007662,DecisionTreeRegressor(max_depth=10),0.1,100,{'base_estimator': DecisionTreeRegressor(max_d...,-558331.2,-439846.6,-697470.2,-890495.5,-930405.7,-703309.8,188189.686995,9
27,9.105467,0.948967,0.095963,0.008856,DecisionTreeRegressor(max_depth=10),1.0,100,{'base_estimator': DecisionTreeRegressor(max_d...,-519441.0,-349596.6,-769244.1,-863559.2,-1024845.0,-705337.2,241790.209089,10


In [15]:
# Candidate 1: depth-10 trees, 500 estimators, learning rate 0.01,
# fitted on the training design matrix.
ada_settings = dict(
    base_estimator=DecisionTreeRegressor(max_depth=10),
    n_estimators=500,
    learning_rate=.01,
    random_state=1,
)
model_ada = AdaBoostRegressor(**ada_settings).fit(Xtrain_num_cat, ytrain)

In [16]:
# RMSE of the current `model_ada` on the held-out split.
# Arguments follow scikit-learn's (y_true, y_pred) order; squared error is
# symmetric, so the value is the same either way.
pred_ada = model_ada.predict(Xtest_num_cat)
rmse_ada = np.sqrt(mean_squared_error(ytest, pred_ada))
rmse_ada


895.938134651224

In [18]:
# Candidate 2: depth-15 trees, 200 estimators, learning rate 0.0001.
# Rebinds `model_ada`, replacing the previously fitted model.
ada_settings = dict(
    base_estimator=DecisionTreeRegressor(max_depth=15),
    n_estimators=200,
    learning_rate=.0001,
    random_state=1,
)
model_ada = AdaBoostRegressor(**ada_settings).fit(Xtrain_num_cat, ytrain)


In [19]:
# RMSE of the current `model_ada` on the held-out split.
# Arguments follow scikit-learn's (y_true, y_pred) order; squared error is
# symmetric, so the value is the same either way.
pred_ada = model_ada.predict(Xtest_num_cat)
rmse_ada = np.sqrt(mean_squared_error(ytest, pred_ada))
rmse_ada


933.1985968718362

In [20]:
# Candidate 3: depth-15 trees, 500 estimators, learning rate 1 -- the
# configuration the grid search reported as best.
# Rebinds `model_ada`, replacing the previously fitted model.
ada_settings = dict(
    base_estimator=DecisionTreeRegressor(max_depth=15),
    n_estimators=500,
    learning_rate=1,
    random_state=1,
)
model_ada = AdaBoostRegressor(**ada_settings).fit(Xtrain_num_cat, ytrain)



In [21]:
# RMSE of the current `model_ada` on the held-out split.
# Arguments follow scikit-learn's (y_true, y_pred) order; squared error is
# symmetric, so the value is the same either way.
pred_ada = model_ada.predict(Xtest_num_cat)
rmse_ada = np.sqrt(mean_squared_error(ytest, pred_ada))
rmse_ada


871.623204011823