import dependencies

In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import r2_score
import time
import os
import pickle 
import json

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error

from sklearn.svm import SVR

In [3]:
primary_key = "agent_id"
timestr = time.strftime("%Y%m%d-%H%M%S")
data_path = "../data"
processed_data_path = os.path.join(data_path, "processed_data")
output_path = os.path.join("../output")
model_path = os.path.join(output_path, "model", "artifact")
metrics_path = os.path.join(output_path, "model", "metrics")
feature_importance_path = os.path.join(output_path, "model", "feature_importance")
predict_path = os.path.join(output_path, "predictions")
if not os.path.exists(model_path):
    os.makedirs(model_path)  
if not os.path.exists(predict_path):
    os.makedirs(predict_path)  
if not os.path.exists(metrics_path):
    os.makedirs(metrics_path) 
if not os.path.exists(feature_importance_path):
    os.makedirs(feature_importance_path)
    
def save_pickle_model(model_parameters, 
                      model_path, 
                      file_opts):
    # Save the model to disk
    with open(model_path, 
              file_opts) as pickle_out:
        pickle.dump(model_parameters, 
                    pickle_out)

def load_pickle_model(model_path, 
                      file_opts):
    # Load the model from disk
    with open(model_path, 
              file_opts) as pickle_in:
        return pickle.load(pickle_in)
    
def save_dict_to_json(dictionary, path, file_opts):
    # save model metrics
    with open(path, file_opts) as outfile:
        json.dump(dictionary, outfile) 

read data


In [4]:
x_train = pd.read_csv(os.path.join(processed_data_path, 
                                   "x_train.csv"))
x_val = pd.read_csv(os.path.join(processed_data_path, 
                                 "x_val.csv"))
y_train = pd.read_csv(os.path.join(processed_data_path, 
                                   "y_train.csv"))
y_val = pd.read_csv(os.path.join(processed_data_path, 
                                 "y_val.csv"))
x_test = pd.read_csv(os.path.join(processed_data_path, 
                                  "x_test.csv"))
del x_train[primary_key]
del x_val[primary_key]

### RandomForest regressor


In [5]:
# model development

rf_model = RandomForestRegressor()
rf_model.fit(x_train,
             y_train.values.ravel())

RandomForestRegressor()

In [7]:
# rf_model.best_estimator_

In [10]:
# saving model to disk

save_pickle_model(rf_model, os.path.join(
    model_path, "rf_model.pickle"), "wb") 

In [24]:
# predictions and scoring

rf_model = load_pickle_model(os.path.join(
    model_path, "rf_model.pickle"), "rb")

# r2_score
rf_train_predicted = rf_model.predict(x_train)
rf_score_train = round(max(0, 100)*r2_score(y_train.values.ravel(), 
                                      rf_train_predicted),2)
rf_val_predicted = rf_model.predict(x_val)
rf_score_val = round(max(0, 100)*r2_score(y_val, 
                                    rf_val_predicted),2)
# rmse
rmse_train =  round(mean_squared_error(y_train, rf_train_predicted),2)
rmse_val = round(mean_squared_error(y_val, rf_val_predicted),2)

rf_metric = {}
rf_metric["train"]={}
rf_metric["val"]={}
rf_metric["train"]["r2_score"] = rf_score_train
rf_metric["val"]["r2_score"] = rf_score_val
rf_metric["train"]["rmse_train"] = rmse_train
rf_metric["val"]["rmse_val"] = rmse_val

save_dict_to_json(rf_metric, os.path.join(
    metrics_path, 'rf_metric.json'), 'w')
rf_metric

{'train': {'r2_score': 98.1, 'rmse_train': 0.54},
 'val': {'r2_score': 86.55, 'rmse_val': 3.74}}

### Feature Importance

In [83]:
feats = {} 
for feature, importance in zip(x_train.columns, rf_model.feature_importances_):
    feats[feature] = importance 

# importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
feature_importances = importances.sort_values(by='Gini-importance')

feature_importances.to_csv(os.path.join(feature_importance_path, 
                                     "feature_importances__%s.csv"%timestr),
                        index=False)

#feature_importances

### Random Forest: Using features with more than 1% Importance

In [73]:
keep_features_list = ["Loan Amount Request (USD)", "Credit Score", "Co-Applicant", "Profession_Pensioner", "Property Age"]
x_train_imp = x_train[keep_features_list]
x_val_imp  = x_train[keep_features_list]

# model development

rf_model_imp = RandomForestRegressor(min_samples_leaf= 10, min_samples_split= 5, n_estimators= 1000)
rf_model_imp.fit(x_train,
             y_train.values.ravel())

RandomForestRegressor(min_samples_leaf=10, min_samples_split=5,
                      n_estimators=1000)

In [74]:
# saving model to disk

save_pickle_model(rf_model_imp, os.path.join(
    model_path, "rf_model_imp.pickle"), "wb") 

In [75]:
# predictions and scoring

rf_model_imp = load_pickle_model(os.path.join(
    model_path, "rf_model_imp.pickle"), "rb")

rf_imp_train_predicted = rf_model_imp.predict(x_train)
rf_imp_score_train = max(0, 100)*r2_score(y_train.values.ravel(), 
                                      rf_imp_train_predicted)
rf_imp_val_predicted = rf_model_imp.predict(x_val)
rf_imp_score_val = max(0, 100)*r2_score(y_val, 
                                    rf_imp_val_predicted)
rf_imp_metric = {}
rf_imp_metric["train"] = rf_imp_score_train
rf_imp_metric["val"] = rf_imp_score_val
save_dict_to_json(rf_imp_metric, os.path.join(
    metrics_path, 'rf_imp_metric.json'), 'w')
rf_imp_metric

{'train': 85.47045966986609, 'val': 79.69241991509338}

### Test Predictions

In [76]:
x_pk_predict = x_test[primary_key]
x_predict = x_test.drop([primary_key],1)

predicted = rf_model_imp.predict(x_predict)
predict_df = pd.DataFrame(predicted, 
                          columns = ["Loan Sanction Amount (USD)"])

output_dataframe = pd.merge(x_pk_predict, 
                            predict_df, 
                            how="left", 
                            left_index=True, 
                            right_index=True)

In [78]:
output_dataframe.to_csv(os.path.join(predict_path, 
                                     "rf_model_imp__%s.csv"%timestr),
                        index=False)