import dependencies

In [33]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import r2_score
import time
import os
import pickle 
import json

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR

In [2]:
primary_key = "Customer ID"
timestr = time.strftime("%Y%m%d-%H%M%S")
data_path = "../data"
processed_data_path = os.path.join(data_path, "processed_data")
output_path = os.path.join("../output")
model_path = os.path.join(output_path, "model", "artifact")
metrics_path = os.path.join(output_path, "model", "metrics")
predict_path = os.path.join(output_path, "predictions")
if not os.path.exists(model_path):
    os.makedirs(model_path)  
if not os.path.exists(predict_path):
    os.makedirs(predict_path)  
if not os.path.exists(metrics_path):
    os.makedirs(metrics_path) 
    
def save_pickle_model(model_parameters, 
                      model_path, 
                      file_opts):
    # Save the model to disk
    with open(model_path, 
              file_opts) as pickle_out:
        pickle.dump(model_parameters, 
                    pickle_out)

def load_pickle_model(model_path, 
                      file_opts):
    # Load the model from disk
    with open(model_path, 
              file_opts) as pickle_in:
        return pickle.load(pickle_in)
    
def save_dict_to_json(dictionary, path, file_opts):
    # save model metrics
    with open(path, file_opts) as outfile:
        json.dump(dictionary, outfile) 

read data


In [3]:
x_train = pd.read_csv(os.path.join(processed_data_path, 
                                   "x_train.csv"))
x_val = pd.read_csv(os.path.join(processed_data_path, 
                                 "x_val.csv"))
y_train = pd.read_csv(os.path.join(processed_data_path, 
                                   "y_train.csv"))
y_val = pd.read_csv(os.path.join(processed_data_path, 
                                 "y_val.csv"))
x_test = pd.read_csv(os.path.join(processed_data_path, 
                                  "x_test.csv"))
del x_train[primary_key]
del x_val[primary_key]

### RandomForest regressor


In [4]:
# model development

params = {'n_estimators':[100, 500, 1000],
          'min_samples_split':[2, 5, 10],
          'min_samples_leaf':[1,10,100]}

model = RandomForestRegressor()
rf_model = GridSearchCV(model,
                         cv=10,
                         param_grid=params,
                         n_jobs=-1,
                         verbose=2,
                         scoring='neg_mean_absolute_error')

rf_model.fit(x_train,
             y_train.values.ravel())

Fitting 10 folds for each of 27 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 15.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 61.1min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 92.5min finished


GridSearchCV(cv=10, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'min_samples_leaf': [1, 10, 100],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [100, 500, 1000]},
             scoring='neg_mean_absolute_error', verbose=2)

In [5]:
rf_model.fit(x_train,
             y_train.values.ravel())

Fitting 10 folds for each of 27 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 15.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 60.8min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 91.9min finished


GridSearchCV(cv=10, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'min_samples_leaf': [1, 10, 100],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [100, 500, 1000]},
             scoring='neg_mean_absolute_error', verbose=2)

In [6]:
rf_model.best_params_

{'min_samples_leaf': 10, 'min_samples_split': 5, 'n_estimators': 1000}

In [7]:
rf_model.best_estimator_

RandomForestRegressor(min_samples_leaf=10, min_samples_split=5,
                      n_estimators=1000)

In [8]:
# saving model to disk

save_pickle_model(rf_model.best_estimator_, os.path.join(
    model_path, "rf_model.pickle"), "wb") 

In [9]:
# predictions and scoring

rf_train_predicted = rf_model.predict(x_train)
rf_score_train = max(0, 100)*r2_score(y_train.values.ravel(), 
                                      rf_train_predicted)
rf_val_predicted = rf_model.predict(x_val)
rf_score_val = max(0, 100)*r2_score(y_val, 
                                    rf_val_predicted)
rf_metric = {}
rf_metric["train"] = rf_score_train
rf_metric["val"] = rf_score_val
save_dict_to_json(rf_metric, os.path.join(
    metrics_path, 'rf_metric.json'), 'w')
rf_metric

{'train': 99.23613729959098, 'val': 98.95729783354649}

### GradientBoosting Regressor


In [10]:
# model development

params = {'n_estimators':[100, 500],
        'learning_rate':[0.01, 0.001],
        'min_samples_leaf':[1,10,100],
        'min_samples_split': [5, 10],
         'tol': [0.001, 0.0001]}


model = GradientBoostingRegressor()
gb_model = GridSearchCV(model,
                         cv=10,
                         param_grid=params,
                         n_jobs=-1,
                         verbose=5,
                         scoring='neg_mean_absolute_error')
gb_model.fit(x_train,
             y_train.values.ravel())

Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 15.0min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 27.4min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 43.2min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 47.2min finished


GridSearchCV(cv=10, estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.001],
                         'min_samples_leaf': [1, 10, 100],
                         'min_samples_split': [5, 10],
                         'n_estimators': [100, 500], 'tol': [0.001, 0.0001]},
             scoring='neg_mean_absolute_error', verbose=5)

In [11]:
gb_model.best_estimator_

GradientBoostingRegressor(learning_rate=0.01, min_samples_leaf=10,
                          min_samples_split=5, n_estimators=500, tol=0.001)

In [12]:
# saving model to disk

save_pickle_model(gb_model.best_estimator_, os.path.join(
    model_path, "gb_model.pickle"), "wb") 

In [13]:
# predictions and scoring

gbm_train_predicted = gb_model.predict(x_train)
gbm_score_train = max(0, 100)*r2_score(y_train.values.ravel(), 
                                       gbm_train_predicted)
gbm_val_predicted = gb_model.predict(x_val)
gbm_score_val = max(0, 100)*r2_score(y_val, 
                                     gbm_val_predicted)

gbm_metric = {}
gbm_metric["train"] = gbm_score_train
gbm_metric["val"] = gbm_score_val
save_dict_to_json(gbm_metric, os.path.join(
    metrics_path, 'gbm_metric.json'), 'w')
gbm_metric

{'train': 98.98195864725801, 'val': 98.99583106559759}

##### scaling data

<br> Scaling is done to Normalize data so that priority is not given to a particular feature. 
<br> Role of Scaling is mostly important in algorithms that are distance based and require Euclidean Distance. 
<br> Random Forest is a tree-based model and hence does not require feature scaling.

In [14]:
# Normalize x data 

scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_val_scaled = scaler.transform(x_val)
# x_test_scaled = scaler.transform(x_test)

### SGD Regressor

In [15]:
# model development

params = {'penalty':['l1','l2'],
          'tol':[0.001, 0.0001],
          'alpha': [0.0001, 0.00001],
         'max_iter': [500, 1000, 1500]}

model = SGDRegressor(loss='squared_loss')

ln_model = GridSearchCV(model,
                         cv=10,
                         param_grid=params,
                         n_jobs=-1,
                         verbose=5,
                         scoring='neg_mean_absolute_error')
ln_model.fit(x_train_scaled, 
             y_train.values.ravel())

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   11.6s finished


GridSearchCV(cv=10, estimator=SGDRegressor(), n_jobs=-1,
             param_grid={'alpha': [0.0001, 1e-05],
                         'max_iter': [500, 1000, 1500], 'penalty': ['l1', 'l2'],
                         'tol': [0.001, 0.0001]},
             scoring='neg_mean_absolute_error', verbose=5)

In [16]:
ln_model.best_estimator_

SGDRegressor(alpha=1e-05, max_iter=500)

In [17]:
# saving model to disk

save_pickle_model(ln_model.best_estimator_, os.path.join(
    model_path, "ln_model.pickle"), "wb") 

In [18]:
# predictions and scoring

linear_train_predicted = ln_model.predict(x_train)
linear_score_train = max(0, 100)*r2_score(y_train.values.ravel(), 
                                          linear_train_predicted)
linear_val_predicted = ln_model.predict(x_val)
linear_score_val = max(0, 100)*r2_score(y_val, 
                                        linear_val_predicted)

linear_metric = {}
linear_metric["train"] = linear_score_train
linear_metric["val"] = linear_score_val
save_dict_to_json(linear_metric, os.path.join(
    metrics_path, 'linear_metric.json'), 'w')
linear_metric

{'train': -1009967691358.4381, 'val': -1039974674170.0958}

### SVM SVR

In [19]:
# model development

params = {'kernel':["rbf", "linear", "poly", "sigmoid"],
          'gamma': [1e-4,1e-2,0.0001]}

model = SVR()

svm_svr_model = GridSearchCV(model,
                             cv=10,
                            param_grid=params,
                            n_jobs=-1,
                            verbose=1,
                            scoring='neg_mean_absolute_error')
svm_svr_model.fit(x_train_scaled, 
                  y_train.values.ravel())

Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 16.7min finished


GridSearchCV(cv=10, estimator=SVR(), n_jobs=-1,
             param_grid={'gamma': [0.0001, 0.01, 0.0001],
                         'kernel': ['rbf', 'linear', 'poly', 'sigmoid']},
             scoring='neg_mean_absolute_error', verbose=1)

In [20]:
svm_svr_model.best_score_

-12876.509709732687

In [21]:
svm_svr_model.best_estimator_

SVR(gamma=0.0001, kernel='linear')

In [22]:
# saving model to disk

save_pickle_model(svm_svr_model.best_estimator_, os.path.join(
    model_path, "svm_svr_model.pickle"), "wb") 

In [24]:
# predictions and scoring

svm_svr_model_train_predicted = svm_svr_model.predict(x_train)
svm_svr_model_score_train = max(0, 100)*r2_score(y_train.values.ravel(), 
                                       svm_svr_model_train_predicted)
svm_svr_val_predicted = svm_svr_model.predict(x_val)
svm_svr_score_val = max(0, 100)*r2_score(y_val, 
                                     svm_svr_val_predicted)

svm_svr_metric = {}
svm_svr_metric["train"] = svm_svr_model_score_train
svm_svr_metric["val"] = svm_svr_score_val
save_dict_to_json(svm_svr_metric, os.path.join(
    metrics_path, 'svm_svr_metric.json'), 'w')
svm_svr_metric

{'train': -406862972368.9935, 'val': -420750488725.3195}

### Bayesian Ridge

In [34]:
# model development

params = {'tol':[0.001, 0.0001]}

model = BayesianRidge()

bayesian_ridge_model = GridSearchCV(model,
                                    cv=10,
                                    param_grid=params,
                                    n_jobs=-1,
                                    verbose=1,
                                    scoring='neg_mean_absolute_error')
bayesian_ridge_model.fit(x_train_scaled,
                         y_train.values.ravel())

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   10.3s finished


GridSearchCV(cv=10, estimator=BayesianRidge(), n_jobs=-1,
             param_grid={'tol': [0.001, 0.0001]},
             scoring='neg_mean_absolute_error', verbose=1)

In [35]:
bayesian_ridge_model.best_score_

-3383.3782974799046

In [36]:
bayesian_ridge_model.best_estimator_

BayesianRidge()

In [37]:
# saving model to disk

save_pickle_model(bayesian_ridge_model.best_estimator_, os.path.join(
    model_path, "bayesian_ridge_model.pickle"), "wb") 

In [38]:
# predictions and scoring

bayesian_ridge_model_train_predicted = bayesian_ridge_model.predict(x_train)
bayesian_ridge_model_score_train = max(0, 100)*r2_score(y_train.values.ravel(),
                                                        bayesian_ridge_model_train_predicted)
bayesian_ridge_val_predicted = bayesian_ridge_model.predict(x_val)
bayesian_ridge_score_val = max(0, 100)*r2_score(y_val, 
                                                bayesian_ridge_val_predicted)

bayesian_ridge_metric = {}
bayesian_ridge_metric["train"] = bayesian_ridge_model_score_train
bayesian_ridge_metric["val"] = bayesian_ridge_score_val
save_dict_to_json(svm_svr_metric, os.path.join(
    metrics_path, 'bayesian_ridge_metric.json'), 'w')
bayesian_ridge_metric

{'train': -1010137017869.9077, 'val': -1040150035264.8959}

### Stacking Regressor


In [27]:
gbm_train_predicted_df = pd.DataFrame(gbm_train_predicted, columns = ["gbm_train_predicted"])
rf_train_predicted_df = pd.DataFrame(rf_train_predicted, columns = ["rf_train_predicted"])

In [28]:
grm_rf_stacking_dataframe = pd.merge(gbm_train_predicted_df,  
                                     rf_train_predicted_df, 
                                     how="left",     
                                     left_index=True,   
                                     right_index=True)
stacking_dataframe = pd.merge(grm_rf_stacking_dataframe,  
                             y_train,
                             how="left",     
                             left_index=True,   
                             right_index=True)

In [29]:
stacking_dataframe

Unnamed: 0,gbm_train_predicted,rf_train_predicted,Loan Sanction Amount (USD)
0,17948.606230,17905.083778,22782.33
1,134599.781150,133789.877655,137757.78
2,46224.807679,44950.061005,42838.32
3,20434.014102,20411.445913,21479.30
4,14624.020327,13903.643379,14398.42
...,...,...,...
15612,20712.721359,20742.940205,19224.43
15613,67602.608663,68703.424283,71887.46
15614,67482.518874,67473.035684,67670.32
15615,63445.225563,62010.818955,63758.68


In [30]:
stacking_model = rf_model.fit(x_train,
                              y_train.values.ravel())

Fitting 10 folds for each of 27 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 15.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 61.6min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 92.9min finished


In [31]:
# saving model to disk

save_pickle_model(stacking_model.best_estimator_, os.path.join(
    model_path, "stacking_model.pickle"), "wb") 

In [32]:
# predictions and scoring

stacking_train_predicted = stacking_model.predict(x_train)
stacking_score_train = max(0, 100)*r2_score(y_train.values.ravel(), 
                                            stacking_train_predicted)
stacking_val_predicted = stacking_model.predict(x_val)
stacking_score_val = max(0, 100)*r2_score(y_val, 
                                          stacking_val_predicted)
stacking_metric = {}
stacking_metric["train"] = stacking_score_train
stacking_metric["val"] = stacking_score_val
save_dict_to_json(stacking_metric, os.path.join(
    metrics_path, 'stacking_metric.json'), 'w')
stacking_metric

{'train': 99.23631255341981, 'val': 98.95770161015884}

### Test Predictions

In [48]:
x_pk_predict = x_test[primary_key]
x_predict = x_test.drop([primary_key],1)

predicted = stacking_model.predict(x_predict)
predict_df = pd.DataFrame(predicted, 
                          columns = ["Loan Sanction Amount (USD)"])

output_dataframe = pd.merge(x_pk_predict, 
                            predict_df, 
                            how="left", 
                            left_index=True, 
                            right_index=True)

In [49]:
output_dataframe.head()

Unnamed: 0,Customer ID,Loan Sanction Amount (USD)
0,C-26247,102334.092611
1,C-35067,78116.368269
2,C-34590,119994.046524
3,C-16668,71817.563691
4,C-12196,76484.641321


In [50]:
output_dataframe.to_csv(os.path.join(predict_path, 
                                     "stacking_model__%s.csv"%timestr),
                        index=False)

In [None]:
# load models
            
rf_model = load_pickle_model(os.path.join(
    model_path, "rf_model.pickle"), "rb")

gb_model = load_pickle_model(os.path.join(
    model_path, "gb_model.pickle"), "rb")

save_pickle_model(ln_model.best_estimator_, os.path.join(
    model_path, "ln_model.pickle"), "wb") 

rf_train_predicted = rf_model.predict(x_train)
rf_score_train = max(0, 100)*r2_score(y_train.values.ravel(), 
                                      rf_train_predicted)
rf_val_predicted = rf_model.predict(x_val)
rf_score_val = max(0, 100)*r2_score(y_val, 
                                    rf_val_predicted)

gbm_train_predicted = gb_model.predict(x_train)
gbm_score_train = max(0, 100)*r2_score(y_train.values.ravel(), 
                                       gbm_train_predicted)
gbm_val_predicted = gb_model.predict(x_val)
gbm_score_val = max(0, 100)*r2_score(y_val, 
                                     gbm_val_predicted)

In [None]:
gbm_metric = {}
gbm_metric["train"] = gbm_score_train
gbm_metric["val"] = gbm_score_val
save_dict_to_json(gbm_metric, os.path.join(
    metrics_path, 'gbm_metric.json'), 'w')
gbm_metric

rf_metric = {}
rf_metric["train"] = rf_score_train
rf_metric["val"] = rf_score_val
save_dict_to_json(rf_metric, os.path.join(
    metrics_path, 'rf_metric.json'), 'w')
rf_metric