In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import r2_score
import time
import os
import pickle 
import json

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Column that identifies a row; it is removed from the feature matrices
# before fitting and re-attached to the test predictions.
primary_key = "agent_id"
# Timestamp captured once per run and embedded in output file names so that
# successive runs do not overwrite each other's CSVs.
timestr = time.strftime("%Y%m%d-%H%M%S")

# Input locations (relative to the notebook's working directory).
data_path = "../data"
processed_data_path = os.path.join(data_path, "processed_data")

# Output locations.
output_path = "../output"
model_path = os.path.join(output_path, "model", "artifact")
metrics_path = os.path.join(output_path, "model", "metrics")
feature_importance_path = os.path.join(output_path, "model", "feature_importance")
predict_path = os.path.join(output_path, "predictions")

# exist_ok=True makes the cell idempotent and avoids the check-then-create
# race of `if not os.path.exists(...): os.makedirs(...)`.
for _directory in (model_path, predict_path, metrics_path, feature_importance_path):
    os.makedirs(_directory, exist_ok=True)
    
def save_pickle_model(model_parameters, model_path, file_opts):
    """Pickle an object to a file.

    Parameters
    ----------
    model_parameters : object
        The object to serialise (in this notebook, a fitted estimator).
    model_path : str
        Destination file path.
    file_opts : str
        Mode string handed to ``open`` (callers here pass ``"wb"``).
    """
    with open(model_path, file_opts) as sink:
        pickle.dump(model_parameters, sink)

def load_pickle_model(model_path, file_opts):
    """Read back a pickled object from disk and return it.

    Parameters
    ----------
    model_path : str
        Path of the pickle file to read.
    file_opts : str
        Mode string handed to ``open`` (callers here pass ``"rb"``).

    Returns
    -------
    object
        Whatever object was stored in the file.

    Notes
    -----
    Unpickling executes code embedded in the file; only load files you trust.
    """
    with open(model_path, file_opts) as source:
        restored = pickle.load(source)
    return restored
    
def save_dict_to_json(dictionary, path, file_opts):
    """Serialise ``dictionary`` as JSON into the file at ``path``.

    Parameters
    ----------
    dictionary : dict
        JSON-serialisable mapping (used in this notebook for model metrics).
    path : str
        Destination file path.
    file_opts : str
        Mode string handed to ``open`` (callers here pass ``'w'``).
    """
    with open(path, file_opts) as sink:
        json.dump(dictionary, sink)

### Read data


In [3]:
# Load the pre-split feature/target CSVs from the processed-data folder,
# in the same order as before: train/val features, train/val targets, test features.
x_train, x_val, y_train, y_val, x_test = (
    pd.read_csv(os.path.join(processed_data_path, csv_name))
    for csv_name in ("x_train.csv", "x_val.csv", "y_train.csv", "y_val.csv", "x_test.csv")
)

# The primary key is an identifier, not a feature: remove it from the
# train/validation matrices. x_test keeps it so that predictions can be
# matched back to their rows later on.
x_train = x_train.drop(columns=[primary_key])
x_val = x_val.drop(columns=[primary_key])

In [4]:
# Sanity check: (rows, columns) of the training features once the primary key column has been removed.
x_train.shape

(35692, 92)

### RandomForest regressor


In [5]:
%%time
params = {'n_estimators':[100, 500],
          'min_samples_split':[5, 10, 15],
          'min_samples_leaf':[5, 10, 15],
         'max_depth': [5, 10, 15],
         'max_features': ['auto', 'sqrt']}

model = RandomForestRegressor()
rf_model = GridSearchCV(model,
                         cv=3,
                         param_grid=params,
                         n_jobs=-1,
                         verbose=2,
                         scoring='neg_mean_squared_error')

rf_model.fit(x_train,
             y_train.values.ravel())

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 41.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 178.6min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed: 359.6min finished


Wall time: 6h 19min 52s


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': [5, 10, 15],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [5, 10, 15],
                         'min_samples_split': [5, 10, 15],
                         'n_estimators': [100, 500]},
             scoring='neg_mean_squared_error', verbose=2)

In [6]:
# Persist the best estimator found by the grid search (the refitted
# regressor, not the whole GridSearchCV object).
save_pickle_model(
    model_parameters=rf_model.best_estimator_,
    model_path=os.path.join(model_path, "rf_tuned_model.pickle"),
    file_opts="wb",
)

In [7]:
%%time
# predictions and scoring

rf_model = load_pickle_model(os.path.join(
    model_path, "rf_tuned_model.pickle"), "rb")

# r2_score
rf_train_predicted = rf_model.predict(x_train)
rf_score_train = round(max(0, 100)*r2_score(y_train.values.ravel(), 
                                      rf_train_predicted),2)
rf_val_predicted = rf_model.predict(x_val)
rf_score_val = round(max(0, 100)*r2_score(y_val, 
                                    rf_val_predicted),2)
# rmse
rmse_train =  round(mean_squared_error(y_train, rf_train_predicted),2)
rmse_val = round(mean_squared_error(y_val, rf_val_predicted),2)

rf_metric = {}
rf_metric["train"]={}
rf_metric["val"]={}
rf_metric["train"]["r2_score"] = rf_score_train
rf_metric["val"]["r2_score"] = rf_score_val
rf_metric["train"]["rmse_train"] = rmse_train
rf_metric["val"]["rmse_val"] = rmse_val

save_dict_to_json(rf_metric, os.path.join(
    metrics_path, 'rf_metric.json'), 'w')
rf_metric

Wall time: 8.32 s


{'train': {'r2_score': 92.17, 'rmse_train': 2.22},
 'val': {'r2_score': 85.96, 'rmse_val': 3.91}}

### Feature Importance

In [8]:
# Pair every training column with the importance the fitted forest reports for it.
feats = dict(zip(x_train.columns, rf_model.feature_importances_))

# One row per feature, most important first; reset_index() turns the feature
# names into a regular column.
# NOTE(review): the column is labelled 'Gini-importance' although the model
# is a regressor - confirm the label is intended.
importances = (
    pd.DataFrame.from_dict(feats, orient='index')
    .rename(columns={0: 'Gini-importance'})
)
feature_importances = (
    importances
    .sort_values(by='Gini-importance', ascending=False)
    .reset_index()
)

# The run timestamp in the file name keeps results of different runs apart.
feature_importances.to_csv(
    os.path.join(feature_importance_path, "feature_importances__%s.csv" % timestr),
    index=False,
)

#feature_importances.head(20)

### Test Predictions

In [9]:
# Keep the identifiers aside and predict on the remaining feature columns.
x_pk_predict = x_test[primary_key]
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# pandas deprecated in 1.x and rejects with a TypeError from 2.0 onwards.
x_predict = x_test.drop(columns=[primary_key])

predicted = rf_model.predict(x_predict)
# Reuse x_test's index so the index-based merge below lines up row by row
# even if x_test does not carry a default 0..n-1 index.
predict_df = pd.DataFrame(predicted,
                          columns=["business_risk"],
                          index=x_test.index)

# Re-attach each prediction to its primary key.
output_dataframe = pd.merge(x_pk_predict,
                            predict_df,
                            how="left",
                            left_index=True,
                            right_index=True)

In [10]:
# Write the primary key / predicted business_risk pairs to a CSV whose name
# carries the run timestamp.
output_dataframe.to_csv(
    path_or_buf=os.path.join(predict_path, "rf_model_%s.csv" % timestr),
    index=False,
)