import dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import r2_score
import time
import os
import pickle 
import json

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR

In [69]:
primary_key = "Customer ID"
timestr = time.strftime("%Y%m%d-%H%M%S")
data_path = "../data"
processed_data_path = os.path.join(data_path, "processed_data")
output_path = os.path.join("../output")
model_path = os.path.join(output_path, "model", "artifact")
metrics_path = os.path.join(output_path, "model", "metrics")
predict_path = os.path.join(output_path, "predictions")
if not os.path.exists(model_path):
    os.makedirs(model_path)  
if not os.path.exists(predict_path):
    os.makedirs(predict_path)  
if not os.path.exists(metrics_path):
    os.makedirs(metrics_path) 
    
def save_pickle_model(model_parameters, 
                      model_path, 
                      file_opts):
    # Save the model to disk
    with open(model_path, 
              file_opts) as pickle_out:
        pickle.dump(model_parameters, 
                    pickle_out)

def load_pickle_model(model_path, 
                      file_opts):
    # Load the model from disk
    with open(model_path, 
              file_opts) as pickle_in:
        return pickle.load(pickle_in)
    
def save_dict_to_json(dictionary, path, file_opts):
    # save model metrics
    with open(path, file_opts) as outfile:
        json.dump(dictionary, outfile) 

read data


In [70]:
x_train = pd.read_csv(os.path.join(processed_data_path, 
                                   "x_train.csv"))
x_val = pd.read_csv(os.path.join(processed_data_path, 
                                 "x_val.csv"))
y_train = pd.read_csv(os.path.join(processed_data_path, 
                                   "y_train.csv"))
y_val = pd.read_csv(os.path.join(processed_data_path, 
                                 "y_val.csv"))
x_test = pd.read_csv(os.path.join(processed_data_path, 
                                  "x_test.csv"))
del x_train[primary_key]
del x_val[primary_key]

### RandomForest regressor


In [4]:
# model development

params = {'n_estimators':[100, 500, 1000],
          'min_samples_split':[2, 5, 10],
          'min_samples_leaf':[1,10,100]}

model = RandomForestRegressor()
rf_model = GridSearchCV(model,
                         cv=10,
                         param_grid=params,
                         n_jobs=-1,
                         verbose=2,
                         scoring='neg_mean_absolute_error')

rf_model.fit(x_train,
             y_train.values.ravel())

Fitting 10 folds for each of 27 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 23.6min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 105.0min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 158.8min finished


GridSearchCV(cv=10, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'min_samples_leaf': [1, 10, 100],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [100, 500, 1000]},
             scoring='neg_mean_absolute_error', verbose=2)

In [45]:
%matplotlib inline

feats = {} 
for feature, importance in zip(x_train.columns, rf_model.feature_importances_):
    feats[feature] = importance 

# importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
importances.sort_values(by='Gini-importance')

Unnamed: 0,Gini-importance
Dependents_0,8e-05
Type of Employment_Drivers,9.8e-05
Type of Employment_Managers,0.000103
Type of Employment_Accountants,0.000149
Type of Employment_Core staff,0.000192
Type of Employment_Sales staff,0.000222
Dependents_4.0,0.000301
Location_Rural,0.000358
No. of Defaults,0.000458
Type of Employment_Laborers,0.000489


In [None]:
save_dict_to_json(feats, os.path.join(
    metrics_path, 'feature_importances.json'), 'w')
feature_importances

In [7]:
rf_model.best_params_

{'min_samples_leaf': 10, 'min_samples_split': 5, 'n_estimators': 1000}

In [8]:
rf_model.best_estimator_

RandomForestRegressor(min_samples_leaf=10, min_samples_split=5,
                      n_estimators=1000)

In [9]:
# saving model to disk

save_pickle_model(rf_model.best_estimator_, os.path.join(
    model_path, "rf_model.pickle"), "wb") 

In [10]:
# predictions and scoring

rf_train_predicted = rf_model.predict(x_train)
rf_score_train = max(0, 100)*r2_score(y_train.values.ravel(), 
                                      rf_train_predicted)
rf_val_predicted = rf_model.predict(x_val)
rf_score_val = max(0, 100)*r2_score(y_val, 
                                    rf_val_predicted)
rf_metric = {}
rf_metric["train"] = rf_score_train
rf_metric["val"] = rf_score_val
save_dict_to_json(rf_metric, os.path.join(
    metrics_path, 'rf_metric.json'), 'w')
rf_metric

{'train': 85.4665563620116, 'val': 79.71147529172381}

In [71]:
keep_features_list = ["Loan Amount Request (USD)", "Credit Score", "Co-Applicant", "Profession_Pensioner", "Property Age"]

In [72]:
x_train_imp = x_train[keep_features_list]
x_val_imp  = x_train[keep_features_list]

In [73]:
# model development

rf_model_imp = RandomForestRegressor(min_samples_leaf= 10, min_samples_split= 5, n_estimators= 1000)
rf_model_imp.fit(x_train,
             y_train.values.ravel())

RandomForestRegressor(min_samples_leaf=10, min_samples_split=5,
                      n_estimators=1000)

In [74]:
# saving model to disk

save_pickle_model(rf_model_imp, os.path.join(
    model_path, "rf_model_imp.pickle"), "wb") 

In [75]:
# predictions and scoring

rf_imp_train_predicted = rf_model_imp.predict(x_train)
rf_imp_score_train = max(0, 100)*r2_score(y_train.values.ravel(), 
                                      rf_imp_train_predicted)
rf_imp_val_predicted = rf_model_imp.predict(x_val)
rf_imp_score_val = max(0, 100)*r2_score(y_val, 
                                    rf_imp_val_predicted)
rf_imp_metric = {}
rf_imp_metric["train"] = rf_imp_score_train
rf_imp_metric["val"] = rf_imp_score_val
save_dict_to_json(rf_imp_metric, os.path.join(
    metrics_path, 'rf_imp_metric.json'), 'w')
rf_imp_metric

{'train': 85.47045966986609, 'val': 79.69241991509338}

### GradientBoosting Regressor


In [11]:
# model development

params = {'n_estimators':[100, 500],
        'learning_rate':[0.01, 0.001],
        'min_samples_leaf':[1,10,100],
        'min_samples_split': [5, 10],
         'tol': [0.001, 0.0001]}


model = GradientBoostingRegressor()
gb_model = GridSearchCV(model,
                         cv=10,
                         param_grid=params,
                         n_jobs=-1,
                         verbose=5,
                         scoring='neg_mean_absolute_error')
gb_model.fit(x_train,
             y_train.values.ravel())

Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   41.4s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 22.0min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 40.5min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 64.0min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 69.8min finished


GridSearchCV(cv=10, estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.001],
                         'min_samples_leaf': [1, 10, 100],
                         'min_samples_split': [5, 10],
                         'n_estimators': [100, 500], 'tol': [0.001, 0.0001]},
             scoring='neg_mean_absolute_error', verbose=5)

In [12]:
gb_model.best_estimator_

GradientBoostingRegressor(learning_rate=0.01, min_samples_leaf=10,
                          min_samples_split=5, n_estimators=500, tol=0.001)

In [13]:
# saving model to disk

save_pickle_model(gb_model.best_estimator_, os.path.join(
    model_path, "gb_model.pickle"), "wb") 

In [14]:
# predictions and scoring

gbm_train_predicted = gb_model.predict(x_train)
gbm_score_train = max(0, 100)*r2_score(y_train.values.ravel(), 
                                       gbm_train_predicted)
gbm_val_predicted = gb_model.predict(x_val)
gbm_score_val = max(0, 100)*r2_score(y_val, 
                                     gbm_val_predicted)

gbm_metric = {}
gbm_metric["train"] = gbm_score_train
gbm_metric["val"] = gbm_score_val
save_dict_to_json(gbm_metric, os.path.join(
    metrics_path, 'gbm_metric.json'), 'w')
gbm_metric

{'train': 78.17347013415583, 'val': 78.2515115460802}

##### scaling data

<br> Scaling is done to Normalize data so that priority is not given to a particular feature. 
<br> Role of Scaling is mostly important in algorithms that are distance based and require Euclidean Distance. 
<br> Random Forest is a tree-based model and hence does not require feature scaling.

In [15]:
# Normalize x data 

scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_val_scaled = scaler.transform(x_val)
# x_test_scaled = scaler.transform(x_test)

### SGD Regressor

In [16]:
# model development

params = {'penalty':['l1','l2'],
          'tol':[0.001, 0.0001],
          'alpha': [0.0001, 0.00001],
         'max_iter': [500, 1000, 1500]}

model = SGDRegressor(loss='squared_loss')

ln_model = GridSearchCV(model,
                         cv=10,
                         param_grid=params,
                         n_jobs=-1,
                         verbose=5,
                         scoring='neg_mean_absolute_error')
ln_model.fit(x_train_scaled, 
             y_train.values.ravel())

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   15.1s finished


GridSearchCV(cv=10, estimator=SGDRegressor(), n_jobs=-1,
             param_grid={'alpha': [0.0001, 1e-05],
                         'max_iter': [500, 1000, 1500], 'penalty': ['l1', 'l2'],
                         'tol': [0.001, 0.0001]},
             scoring='neg_mean_absolute_error', verbose=5)

In [17]:
ln_model.best_estimator_

SGDRegressor(alpha=1e-05, max_iter=500, penalty='l1')

In [18]:
# saving model to disk

save_pickle_model(ln_model.best_estimator_, os.path.join(
    model_path, "ln_model.pickle"), "wb") 

In [19]:
# predictions and scoring

linear_train_predicted = ln_model.predict(x_train)
linear_score_train = max(0, 100)*r2_score(y_train.values.ravel(), 
                                          linear_train_predicted)
linear_val_predicted = ln_model.predict(x_val)
linear_score_val = max(0, 100)*r2_score(y_val, 
                                        linear_val_predicted)

linear_metric = {}
linear_metric["train"] = linear_score_train
linear_metric["val"] = linear_score_val
save_dict_to_json(linear_metric, os.path.join(
    metrics_path, 'linear_metric.json'), 'w')
linear_metric

{'train': -478464113507.5909, 'val': -480778696440.7903}

### Bayesian Ridge

In [20]:
# model development

params = {'tol':[0.001, 0.0001]}

model = BayesianRidge()

bayesian_ridge_model = GridSearchCV(model,
                                    cv=10,
                                    param_grid=params,
                                    n_jobs=-1,
                                    verbose=1,
                                    scoring='neg_mean_absolute_error')
bayesian_ridge_model.fit(x_train_scaled,
                         y_train.values.ravel())

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.0s finished


GridSearchCV(cv=10, estimator=BayesianRidge(), n_jobs=-1,
             param_grid={'tol': [0.001, 0.0001]},
             scoring='neg_mean_absolute_error', verbose=1)

In [21]:
bayesian_ridge_model.best_score_

-17937.035053770243

In [22]:
bayesian_ridge_model.best_estimator_

BayesianRidge(tol=0.0001)

In [23]:
# saving model to disk

save_pickle_model(bayesian_ridge_model.best_estimator_, os.path.join(
    model_path, "bayesian_ridge_model.pickle"), "wb") 

In [25]:
# predictions and scoring

bayesian_ridge_model_train_predicted = bayesian_ridge_model.predict(x_train)
bayesian_ridge_model_score_train = max(0, 100)*r2_score(y_train.values.ravel(),
                                                        bayesian_ridge_model_train_predicted)
bayesian_ridge_val_predicted = bayesian_ridge_model.predict(x_val)
bayesian_ridge_score_val = max(0, 100)*r2_score(y_val, 
                                                bayesian_ridge_val_predicted)

bayesian_ridge_metric = {}
bayesian_ridge_metric["train"] = bayesian_ridge_model_score_train
bayesian_ridge_metric["val"] = bayesian_ridge_score_val
save_dict_to_json(bayesian_ridge_metric, os.path.join(
    metrics_path, 'bayesian_ridge_metric.json'), 'w')
bayesian_ridge_metric

{'train': -476641246925.34735, 'val': -478941853894.1226}

### Stacking Regressor


In [26]:
gbm_train_predicted_df = pd.DataFrame(gbm_train_predicted, columns = ["gbm_train_predicted"])
rf_train_predicted_df = pd.DataFrame(rf_train_predicted, columns = ["rf_train_predicted"])

In [27]:
grm_rf_stacking_dataframe = pd.merge(gbm_train_predicted_df,  
                                     rf_train_predicted_df, 
                                     how="left",     
                                     left_index=True,   
                                     right_index=True)
stacking_dataframe = pd.merge(grm_rf_stacking_dataframe,  
                             y_train,
                             how="left",     
                             left_index=True,   
                             right_index=True)

In [28]:
stacking_dataframe

Unnamed: 0,gbm_train_predicted,rf_train_predicted,Loan Sanction Amount (USD)
0,17759.598989,12671.998824,0.00
1,110886.654357,111344.440697,132201.10
2,76471.153147,69199.566150,0.00
3,159598.188635,153349.831000,180702.22
4,20169.904298,16082.726252,16574.49
...,...,...,...
21459,98759.112017,105094.890264,109830.53
21460,45101.792184,44946.765169,61641.22
21461,25660.246981,25535.930691,28492.34
21462,30641.222172,29300.998380,35506.31


In [30]:
rf_model = load_pickle_model(os.path.join(
    model_path, "rf_model.pickle"), "rb")

In [31]:
stacking_model = rf_model.fit(x_train,
                              y_train.values.ravel())

In [33]:
# saving model to disk

save_pickle_model(stacking_model, os.path.join(
    model_path, "stacking_model.pickle"), "wb") 

In [34]:
# predictions and scoring

stacking_train_predicted = stacking_model.predict(x_train)
stacking_score_train = max(0, 100)*r2_score(y_train.values.ravel(), 
                                            stacking_train_predicted)
stacking_val_predicted = stacking_model.predict(x_val)
stacking_score_val = max(0, 100)*r2_score(y_val, 
                                          stacking_val_predicted)
stacking_metric = {}
stacking_metric["train"] = stacking_score_train
stacking_metric["val"] = stacking_score_val
save_dict_to_json(stacking_metric, os.path.join(
    metrics_path, 'stacking_metric.json'), 'w')
stacking_metric

{'train': 85.47872234095217, 'val': 79.70085794976289}

In [None]:
ln_model = load_pickle_model(os.path.join(
    model_path, "ln_model.pickle"), "rb")

In [None]:
stacking_model2 = ln_model.fit(x_train,
                              y_train.values.ravel())

In [None]:
# saving model to disk

save_pickle_model(stacking_model2, os.path.join(
    model_path, "stacking_model2.pickle"), "wb")

In [None]:
# predictions and scoring

stacking_train_predicted = stacking_model2.predict(x_train)
stacking_score_train = max(0, 100)*r2_score(y_train.values.ravel(), 
                                            stacking_train_predicted)
stacking_val_predicted = stacking_model2.predict(x_val)
stacking_score_val = max(0, 100)*r2_score(y_val, 
                                          stacking_val_predicted)
stacking_metric = {}
stacking_metric["train"] = stacking_score_train
stacking_metric["val"] = stacking_score_val
save_dict_to_json(stacking_metric2, os.path.join(
    metrics_path, 'stacking_metric2.json'), 'w')
stacking_metric

### Test Predictions

In [76]:
x_pk_predict = x_test[primary_key]
x_predict = x_test.drop([primary_key],1)

predicted = rf_model_imp.predict(x_predict)
predict_df = pd.DataFrame(predicted, 
                          columns = ["Loan Sanction Amount (USD)"])

output_dataframe = pd.merge(x_pk_predict, 
                            predict_df, 
                            how="left", 
                            left_index=True, 
                            right_index=True)

In [77]:
output_dataframe.head()

Unnamed: 0,Customer ID,Loan Sanction Amount (USD)
0,C-26247,85209.643363
1,C-35067,61898.173221
2,C-34590,2723.485216
3,C-16668,54318.096853
4,C-12196,72719.366483


In [78]:
output_dataframe.to_csv(os.path.join(predict_path, 
                                     "rf_model_imp__%s.csv"%timestr),
                        index=False)

In [None]:
# load models
            
rf_model = load_pickle_model(os.path.join(
    model_path, "rf_model.pickle"), "rb")

gb_model = load_pickle_model(os.path.join(
    model_path, "gb_model.pickle"), "rb")

save_pickle_model(ln_model.best_estimator_, os.path.join(
    model_path, "ln_model.pickle"), "wb") 

rf_train_predicted = rf_model.predict(x_train)
rf_score_train = max(0, 100)*r2_score(y_train.values.ravel(), 
                                      rf_train_predicted)
rf_val_predicted = rf_model.predict(x_val)
rf_score_val = max(0, 100)*r2_score(y_val, 
                                    rf_val_predicted)

gbm_train_predicted = gb_model.predict(x_train)
gbm_score_train = max(0, 100)*r2_score(y_train.values.ravel(), 
                                       gbm_train_predicted)
gbm_val_predicted = gb_model.predict(x_val)
gbm_score_val = max(0, 100)*r2_score(y_val, 
                                     gbm_val_predicted)

In [None]:
gbm_metric = {}
gbm_metric["train"] = gbm_score_train
gbm_metric["val"] = gbm_score_val
save_dict_to_json(gbm_metric, os.path.join(
    metrics_path, 'gbm_metric.json'), 'w')
gbm_metric

rf_metric = {}
rf_metric["train"] = rf_score_train
rf_metric["val"] = rf_score_val
save_dict_to_json(rf_metric, os.path.join(
    metrics_path, 'rf_metric.json'), 'w')
rf_metric