import dependencies

In [1]:
import json
import os
import pickle
import time

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR

In [2]:
# Run-wide configuration: key column, run timestamp and input/output locations.
primary_key = "agent_id"
# Captured once so every artifact written by this run carries the same suffix.
timestr = time.strftime("%Y%m%d-%H%M%S")
data_path = "../data"
processed_data_path = os.path.join(data_path, "processed_data")
output_path = os.path.join("../output")
model_path = os.path.join(output_path, "model", "artifact")
metrics_path = os.path.join(output_path, "model", "metrics")
feature_importance_path = os.path.join(output_path, "model", "feature_importance")
predict_path = os.path.join(output_path, "predictions")

# exist_ok=True keeps this cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
for output_dir in (model_path, predict_path, metrics_path, feature_importance_path):
    os.makedirs(output_dir, exist_ok=True)
    
def save_pickle_model(model_parameters, model_path, file_opts):
    """Pickle ``model_parameters`` into the file at ``model_path``.

    ``file_opts`` is the mode string passed straight to ``open``; the calls
    in this notebook use ``"wb"``.
    """
    with open(model_path, file_opts) as sink:
        pickle.Pickler(sink).dump(model_parameters)

def load_pickle_model(model_path, file_opts):
    """Return the object unpickled from the file at ``model_path``.

    ``file_opts`` is the mode string passed straight to ``open``; the calls
    in this notebook use ``"rb"``.
    """
    # Unpickling can execute arbitrary code: only load files this project wrote.
    with open(model_path, file_opts) as source:
        restored = pickle.Unpickler(source).load()
    return restored
    
def save_dict_to_json(dictionary, path, file_opts):
    """Serialise ``dictionary`` as JSON into the file at ``path``.

    ``file_opts`` is the mode string passed straight to ``open``; the calls
    in this notebook use ``'w'`` to store model metrics.
    """
    with open(path, file_opts) as sink:
        json.dump(obj=dictionary, fp=sink)

read data


In [3]:
# Load the five pre-split CSV tables from the processed-data folder.
x_train, x_val, y_train, y_val, x_test = [
    pd.read_csv(os.path.join(processed_data_path, csv_name))
    for csv_name in ("x_train.csv", "x_val.csv", "y_train.csv",
                     "y_val.csv", "x_test.csv")
]

# The key column is removed from the train/validation features only;
# x_test keeps it because the prediction cell reads it back out.
x_train = x_train.drop(columns=[primary_key])
x_val = x_val.drop(columns=[primary_key])

In [5]:
# Sanity check: (rows, columns) of the training features after the key column was dropped.
x_train.shape

(35692, 92)

In [None]:
# Hyper-parameter grid for the random-forest search
# (2 * 2 * 2 * 3 * 1 = 24 candidates, each fitted on 3 CV folds).
params = {'n_estimators': [500, 1000],
          'min_samples_split': [2, 5],
          'min_samples_leaf': [2, 5],
          'max_depth': [15, 20, 25],
          # 1.0 = consider every feature at each split. That is what 'auto'
          # meant for regressors; 'auto' was deprecated in scikit-learn 1.1
          # and removed in 1.3, while a float works on old and new versions.
          'max_features': [1.0]}

# Fixed seed so the search and the refit best estimator are reproducible.
model = RandomForestRegressor(random_state=42)
rf_model = GridSearchCV(model,
                        cv=3,
                        param_grid=params,
                        n_jobs=-1,
                        verbose=2,
                        scoring='neg_mean_squared_error')

# ravel() turns the single-column target frame into the 1-D array sklearn expects.
rf_model.fit(x_train,
             y_train.values.ravel())

In [None]:
# Parameter combination selected by the grid search above.
rf_model.best_params_

In [None]:
# The estimator refit by GridSearchCV with the selected parameters.
rf_model.best_estimator_

In [None]:
# Persist only the refit best estimator, not the whole grid-search object.
save_pickle_model(
    model_parameters=rf_model.best_estimator_,
    model_path=os.path.join(model_path, "rf_tuned_model.pickle"),
    file_opts="wb",
)

In [None]:
# predictions and scoring

# Reload the persisted estimator so the scores describe what was saved.
rf_model = load_pickle_model(os.path.join(
    model_path, "rf_tuned_model.pickle"), "rb")

# Score = 100 * R^2, floored at 0. The floor has to wrap the product:
# `max(0, 100) * r2` is just `100 * r2` and never clamps anything.
rf_train_predicted = rf_model.predict(x_train)
rf_score_train = round(max(0, 100 * r2_score(y_train.values.ravel(),
                                             rf_train_predicted)), 2)
rf_val_predicted = rf_model.predict(x_val)
rf_score_val = round(max(0, 100 * r2_score(y_val,
                                           rf_val_predicted)), 2)

# RMSE is the square root of the mean squared error. float() keeps the
# value a plain Python number for JSON serialisation.
rmse_train = round(float(np.sqrt(mean_squared_error(y_train, rf_train_predicted))), 2)
rmse_val = round(float(np.sqrt(mean_squared_error(y_val, rf_val_predicted))), 2)

rf_metric = {
    "train": {"r2_score": rf_score_train, "rmse_train": rmse_train},
    "val": {"r2_score": rf_score_val, "rmse_val": rmse_val},
}

# Stored under its own name: the baseline random-forest cell writes
# 'rf_metric.json' and would otherwise overwrite the tuned model's metrics.
save_dict_to_json(rf_metric, os.path.join(
    metrics_path, 'rf_tuned_metric.json'), 'w')
rf_metric

### RandomForest regressor


In [None]:
# model development

# Baseline random forest with default hyper-parameters. The fixed seed makes
# the bootstrap sampling, and therefore every reported metric, reproducible.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(x_train,
             y_train.values.ravel())

In [None]:
# rf_model.best_estimator_

In [None]:
# Write the fitted baseline random forest to the model artifact folder.
save_pickle_model(
    model_parameters=rf_model,
    model_path=os.path.join(model_path, "rf_model.pickle"),
    file_opts="wb",
)

In [None]:
# predictions and scoring

# Reload the persisted estimator so the scores describe what was saved.
rf_model = load_pickle_model(os.path.join(
    model_path, "rf_model.pickle"), "rb")

# Score = 100 * R^2, floored at 0. The floor has to wrap the product:
# `max(0, 100) * r2` is just `100 * r2` and never clamps anything.
rf_train_predicted = rf_model.predict(x_train)
rf_score_train = round(max(0, 100 * r2_score(y_train.values.ravel(),
                                             rf_train_predicted)), 2)
rf_val_predicted = rf_model.predict(x_val)
rf_score_val = round(max(0, 100 * r2_score(y_val,
                                           rf_val_predicted)), 2)

# RMSE is the square root of the mean squared error. float() keeps the
# value a plain Python number for JSON serialisation.
rmse_train = round(float(np.sqrt(mean_squared_error(y_train, rf_train_predicted))), 2)
rmse_val = round(float(np.sqrt(mean_squared_error(y_val, rf_val_predicted))), 2)

rf_metric = {
    "train": {"r2_score": rf_score_train, "rmse_train": rmse_train},
    "val": {"r2_score": rf_score_val, "rmse_val": rmse_val},
}

save_dict_to_json(rf_metric, os.path.join(
    metrics_path, 'rf_metric.json'), 'w')
rf_metric

### Feature Importance

In [None]:
# Map every training column to the importance reported by the fitted forest.
feats = dict(zip(x_train.columns, rf_model.feature_importances_))

# One row per feature, highest importance first. reset_index() moves the
# feature names into a column called "index".
importances = pd.DataFrame.from_dict(feats, orient='index')
importances = importances.rename(columns={0: 'Gini-importance'})
feature_importances = (
    importances
    .sort_values(by='Gini-importance', ascending=False)
    .reset_index()
)

importance_csv = os.path.join(feature_importance_path,
                              "feature_importances__%s.csv" % timestr)
feature_importances.to_csv(importance_csv, index=False)

#feature_importances.head(20)

### Random Forest: Using the 10 most important features

In [None]:
# Names of the 10 highest-ranked features in the importance table.
keep_features_list = feature_importances["index"].head(10)
x_train_imp = x_train[keep_features_list]
# Validation rows must come from x_val (they were sliced from x_train before).
x_val_imp = x_val[keep_features_list]

# model development

# The column selection is part of the model itself, so the forest really is
# trained on the reduced feature set while predict() still accepts the full
# feature frames that the scoring and test-prediction cells pass in.
rf_model_imp = make_pipeline(
    ColumnTransformer([("keep", "passthrough", list(keep_features_list))]),
    RandomForestRegressor(random_state=42),
)
rf_model_imp.fit(x_train,
                 y_train.values.ravel())

In [None]:
# Write the important-features random forest to the model artifact folder.
save_pickle_model(
    model_parameters=rf_model_imp,
    model_path=os.path.join(model_path, "rf_model_imp.pickle"),
    file_opts="wb",
)

In [None]:
# predictions and scoring

# Reload the persisted estimator so the scores describe what was saved.
rf_model_imp = load_pickle_model(os.path.join(
    model_path, "rf_model_imp.pickle"), "rb")

# Score = 100 * R^2, floored at 0 and rounded like the other model cells.
# The floor has to wrap the product: `max(0, 100) * r2` never clamps.
rf_imp_train_predicted = rf_model_imp.predict(x_train)
rf_imp_score_train = round(max(0, 100 * r2_score(y_train.values.ravel(),
                                                 rf_imp_train_predicted)), 2)
rf_imp_val_predicted = rf_model_imp.predict(x_val)
rf_imp_score_val = round(max(0, 100 * r2_score(y_val,
                                               rf_imp_val_predicted)), 2)

# RMSE is the square root of the mean squared error. float() keeps the
# value a plain Python number for JSON serialisation.
rf_imp_rmse_train = round(float(np.sqrt(mean_squared_error(y_train, rf_imp_train_predicted))), 2)
rf_imp_rmse_val = round(float(np.sqrt(mean_squared_error(y_val, rf_imp_val_predicted))), 2)

rf_imp_metric = {
    "train": {"r2_score": rf_imp_score_train, "rmse_train": rf_imp_rmse_train},
    "val": {"r2_score": rf_imp_score_val, "rmse_val": rf_imp_rmse_val},
}

save_dict_to_json(rf_imp_metric, os.path.join(
    metrics_path, 'rf_imp_metric.json'), 'w')
rf_imp_metric

### RF_PCA

In [None]:
# Dimensions of the training features before the PCA step that follows.
x_train.shape

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=20)
x_train = pca.fit_transform(x_train)
x_val = pca.transform(x_val)
test_pk = x_test[primary_key]
del x_test[primary_key]
x_test = pca.transform(x_test)

In [None]:
# np.save("../data/processed_data/x_train.npy", x_train)
# np.save("../data/processed_data/x_test.npy", x_test)
# np.save("../data/processed_data/x_val.npy", x_val)
# np.save("../data/processed_data/y_train.npy", y_train)
# np.save("../data/processed_data/y_test.npy", y_test)
# np.save("../data/processed_data/y_val.npy", y_val)

In [None]:
# x_train = np.load("../data/processed_data/x_train.npy")
# x_val = np.load("../data/processed_data/x_val.npy")
# y_train = np.load("../data/processed_data/y_train.npy")
# y_val = np.load("../data/processed_data/y_val.npy")
# x_test = np.load("../data/processed_data/x_test.npy")
# y_test = np.load("../data/processed_data/y_test.npy")

In [None]:
rf_pca_model = RandomForestRegressor()
rf_pca_model.fit(x_train,
             y_train.values.ravel())

In [None]:
save_pickle_model(rf_pca_model, os.path.join(
    model_path, "rf_pca_model.pickle"), "wb") 

In [None]:
# predictions and scoring

rf_pca_model = load_pickle_model(os.path.join(
    model_path, "rf_pca_model.pickle"), "rb")

# r2_score
rf_pca_train_predicted = rf_pca_model.predict(x_train)
rf_pca_score_train = round(max(0, 100)*r2_score(y_train.values.ravel(), 
                                      rf_pca_train_predicted),2)
rf_pca_val_predicted = rf_pca_model.predict(x_val)
rf_pca_score_val = round(max(0, 100)*r2_score(y_val, 
                                    rf_pca_val_predicted),2)
# rmse
rf_pca_rmse_train =  round(mean_squared_error(y_train, rf_pca_train_predicted),2)
rf_pca_rmse_val = round(mean_squared_error(y_val, rf_pca_val_predicted),2)

rf_pca_metric = {}
rf_pca_metric["train"]={}
rf_pca_metric["val"]={}
rf_pca_metric["train"]["r2_score"] = rf_pca_score_train
rf_pca_metric["val"]["r2_score"] = rf_pca_score_val
rf_pca_metric["train"]["rmse_train"] = rf_pca_rmse_train
rf_pca_metric["val"]["rmse_val"] = rf_pca_rmse_val

save_dict_to_json(rf_pca_metric, os.path.join(
    metrics_path, 'rf_pca_metric.json'), 'w')
rf_metric

### GradientBoosting Regressor


In [None]:
# model development

# params = {'n_estimators':[100, 500],
#         'learning_rate':[0.01, 0.001],
#         'min_samples_leaf':[1,10,100],
#         'min_samples_split': [5, 10],
#          'tol': [0.001, 0.0001]}


# Gradient boosting with default hyper-parameters; the fixed seed makes
# repeated runs produce the same model and metrics.
gb_model = GradientBoostingRegressor(random_state=42)
# gb_model = GridSearchCV(model,
#                          cv=10,
#                          param_grid=params,
#                          n_jobs=-1,
#                          verbose=5,
#                          scoring='neg_mean_absolute_error')
gb_model.fit(x_train,
             y_train.values.ravel())
# gb_model.best_estimator_

In [None]:
# Write the fitted gradient-boosting model to the model artifact folder.
save_pickle_model(
    model_parameters=gb_model,
    model_path=os.path.join(model_path, "gb_model.pickle"),
    file_opts="wb",
)

In [None]:
# predictions and scoring

# Score = 100 * R^2, floored at 0 and rounded like the other model cells.
# The floor has to wrap the product: `max(0, 100) * r2` never clamps.
gbm_train_predicted = gb_model.predict(x_train)
gbm_score_train = round(max(0, 100 * r2_score(y_train.values.ravel(),
                                              gbm_train_predicted)), 2)
gbm_val_predicted = gb_model.predict(x_val)
gbm_score_val = round(max(0, 100 * r2_score(y_val,
                                            gbm_val_predicted)), 2)

# RMSE is the square root of the mean squared error. float() keeps the
# value a plain Python number for JSON serialisation.
gbm_rmse_train = round(float(np.sqrt(mean_squared_error(y_train, gbm_train_predicted))), 2)
gbm_rmse_val = round(float(np.sqrt(mean_squared_error(y_val, gbm_val_predicted))), 2)

gbm_metric = {
    "train": {"r2_score": gbm_score_train, "rmse_train": gbm_rmse_train},
    "val": {"r2_score": gbm_score_val, "rmse_val": gbm_rmse_val},
}
save_dict_to_json(gbm_metric, os.path.join(
    metrics_path, 'gbm_metric.json'), 'w')
gbm_metric

In [None]:
# Map every training column to the importance reported by the fitted
# gradient-boosting model.
feats = dict(zip(x_train.columns, gb_model.feature_importances_))

# One row per feature, highest importance first. reset_index() moves the
# feature names into a column called "index".
importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
feature_importances = importances.sort_values(by='Gini-importance', ascending=False).reset_index()

# Model-specific file name: the random-forest table is written as
# "feature_importances__<timestr>.csv" with the same run timestamp, so
# reusing that name here would silently overwrite it.
feature_importances.to_csv(os.path.join(feature_importance_path,
                                        "gbm_feature_importances__%s.csv" % timestr),
                           index=False)

feature_importances.head(20)

In [None]:
# Names of the 15 highest-ranked features in the importance table.
keep_features_list = feature_importances["index"].head(15)
x_train_imp = x_train[keep_features_list]
# Validation rows must come from x_val (they were sliced from x_train before).
x_val_imp = x_val[keep_features_list]

# model development

# The column selection is part of the model itself, so the booster really is
# trained on the reduced feature set while predict() still accepts the full
# feature frames that the scoring cell passes in.
gbm_model_imp = make_pipeline(
    ColumnTransformer([("keep", "passthrough", list(keep_features_list))]),
    GradientBoostingRegressor(random_state=42),
)
gbm_model_imp.fit(x_train,
                  y_train.values.ravel())

In [None]:
# Write the important-features gradient-boosting model to the artifact folder.
save_pickle_model(
    model_parameters=gbm_model_imp,
    model_path=os.path.join(model_path, "gbm_model_imp.pickle"),
    file_opts="wb",
)

In [None]:
# predictions and scoring

# Reload the persisted estimator so the scores describe what was saved.
gbm_model_imp = load_pickle_model(os.path.join(
    model_path, "gbm_model_imp.pickle"), "rb")

# Score = 100 * R^2, floored at 0 and rounded like the other model cells.
# The floor has to wrap the product: `max(0, 100) * r2` never clamps.
gbm_imp_train_predicted = gbm_model_imp.predict(x_train)
gbm_imp_score_train = round(max(0, 100 * r2_score(y_train.values.ravel(),
                                                  gbm_imp_train_predicted)), 2)
gbm_imp_val_predicted = gbm_model_imp.predict(x_val)
gbm_imp_score_val = round(max(0, 100 * r2_score(y_val,
                                                gbm_imp_val_predicted)), 2)

# RMSE is the square root of the mean squared error. float() keeps the
# value a plain Python number for JSON serialisation.
gbm_imp_rmse_train = round(float(np.sqrt(mean_squared_error(y_train, gbm_imp_train_predicted))), 2)
gbm_imp_rmse_val = round(float(np.sqrt(mean_squared_error(y_val, gbm_imp_val_predicted))), 2)

gbm_imp_metric = {
    "train": {"r2_score": gbm_imp_score_train, "rmse_train": gbm_imp_rmse_train},
    "val": {"r2_score": gbm_imp_score_val, "rmse_val": gbm_imp_rmse_val},
}

save_dict_to_json(gbm_imp_metric, os.path.join(
    metrics_path, 'gbm_imp_metric.json'), 'w')
# Show this model's metrics (the cell used to display rf_imp_metric instead).
gbm_imp_metric

##### scaling data

<br> Scaling standardises the features so that no single feature dominates merely because of its numeric range.
<br> It matters most for algorithms that are distance-based (for example, those relying on Euclidean distance) or trained by gradient descent, such as the SGD regressor below.
<br> Random Forest is a tree-based model and therefore does not require feature scaling.

In [None]:
# Standardise the features. The scaler's statistics are learned from the
# training split only and then re-applied unchanged to the validation split.
scaler = preprocessing.StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)
# x_test_scaled = scaler.transform(x_test)

In [None]:
# Dimensions of the scaled training matrix.
x_train_scaled.shape

### SGD Regressor

In [None]:
# model development

# params = {'penalty':['l1','l2'],
#           'tol':[0.001, 0.0001],
#           'alpha': [0.0001, 0.00001],
#          'max_iter': [500, 1000, 1500]}

# Squared-error loss is SGDRegressor's default, so it is left implicit: the
# explicit name 'squared_loss' was removed in scikit-learn 1.2 and its
# replacement 'squared_error' does not exist before 1.0.
# The seed fixes the sample shuffling so the fit is reproducible.
ln_model = SGDRegressor(random_state=42)

# ln_model = GridSearchCV(model,
#                          cv=10,
#                          param_grid=params,
#                          n_jobs=-1,
#                          verbose=5,
#                          scoring='neg_mean_absolute_error')
ln_model.fit(x_train_scaled, 
             y_train.values.ravel())

In [None]:
# Write the fitted SGD regressor to the model artifact folder.
save_pickle_model(
    model_parameters=ln_model,
    model_path=os.path.join(model_path, "ln_model.pickle"),
    file_opts="wb",
)

In [None]:
# predictions and scoring

# Score = 100 * R^2, floored at 0. The floor has to wrap the product:
# `max(0, 100) * r2` is just `100 * r2` and never clamps anything.
linear_train_predicted = ln_model.predict(x_train_scaled)
linear_score_train = round(max(0, 100 * r2_score(y_train.values.ravel(),
                                                 linear_train_predicted)), 2)
linear_val_predicted = ln_model.predict(x_val_scaled)
linear_score_val = round(max(0, 100 * r2_score(y_val,
                                               linear_val_predicted)), 2)

# RMSE is the square root of the mean squared error. float() keeps the
# value a plain Python number for JSON serialisation.
linear_rmse_train = round(float(np.sqrt(mean_squared_error(y_train, linear_train_predicted))), 2)
linear_rmse_val = round(float(np.sqrt(mean_squared_error(y_val, linear_val_predicted))), 2)

linear_metric = {
    "train": {"r2_score": linear_score_train, "rmse_train": linear_rmse_train},
    "val": {"r2_score": linear_score_val, "rmse_val": linear_rmse_val},
}
save_dict_to_json(linear_metric, os.path.join(
    metrics_path, 'linear_metric.json'), 'w')
linear_metric

### Drop collinear columns using VIF (variance inflation factor)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor    

def calculate_vif_(X, thresh=5.0):
    """Repeatedly drop the column with the largest VIF until none exceeds ``thresh``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate explanatory variables. The frame itself is not modified.
    thresh : float, default 5.0
        While the largest variance inflation factor is greater than this
        value, the column that produced it is removed.

    Returns
    -------
    pandas.DataFrame
        The columns of ``X`` that were not dropped.

    Each dropped column and the final set of remaining columns are printed.
    """
    variables = list(range(X.shape[1]))
    # With fewer than two columns there is nothing to be collinear with;
    # the guard also keeps max() from being called on an empty list.
    while len(variables) > 1:
        # Slice once per pass instead of once per column.
        candidates = X.iloc[:, variables]
        values = candidates.values
        vif = [variance_inflation_factor(values, ix)
               for ix in range(values.shape[1])]

        worst = max(vif)
        # Written as "not greater" so a NaN maximum stops the loop, exactly
        # as the `> thresh` comparison would.
        if not worst > thresh:
            break

        maxloc = vif.index(worst)
        # str() so non-string column labels can be reported as well.
        print('dropping \'' + str(candidates.columns[maxloc]) +
              '\' at index: ' + str(maxloc))
        del variables[maxloc]

    print('Remaining variables:')
    print(X.columns[variables])
    return X.iloc[:, variables]

In [None]:
# Run the VIF pruning on the training features with a threshold of 5.
# The function prints each dropped column; its return value (the pruned
# frame) is displayed as this cell's output and is not stored.
calculate_vif_(x_train, 5)

In [6]:
# Hard-coded list of feature columns that the next cell keeps in x_train / x_val.
# NOTE(review): presumably the "Remaining variables" printed by calculate_vif_
# above, pasted in to avoid re-running it -- confirm against that output.
vif_keep_col = ['7days_all_gap_days', 'all_gap_7days_last_vs_previous',
       '30days_all_gap_days', 'all_gap_30days_last_vs_previous',
       'mtd_all_gap_days', 'all_gap_mtd_previous_days', 'all_last_day',
       'all_last30_stable', 'all_last30_inc_count', 'all_consistency_index',
       'avg_all_gap_days_d1_10', 'avg_all_gap_days_d11_20',
       'all_gap_days_d1_10_thisvsprev', 'all_gap_days_d11_20_thisvsprev',
       'all_gap_days_d20_31_thisvsprev', 'all_7days_min_thisvs4w',
       'all_7days_trend_vs4weeks', 'all_7days_trend_vs10weeks',
       'all_7days_vslast_month7days', 'all_7days_max_thisvs10w',
       'all_ystrday_vsmin10d', 'all_ystrday_trend_vs10d',
       'all_ystrday_vsdaybfr', 'all_mrr_trend_vs6M', 'all_lst30days_vsprvmnth',
       'all_mtd_vs_min_lst3M', 'all_trend_mtdvs3M_sameday',
       'all_norm_growth_m1', 'all_norm_growth_m2', 'all_norm_growth_m3',
       'all_norm_growth_m4', 'all_norm_growth_m5', 'all_norm_growth_m6',
       'all_norm_growth_index_last', 'all_gtv_last12Months_m12',
       'all_gtv_last10days_d3', 'all_gtv_last10days_d4',
       'all_gtv_last10days_d6', 'all_gtv_last10days_d7',
       'all_gtv_last10days_d8', 'all_gtv_last10days_d9']

In [7]:
# Restrict both feature frames to the columns listed in vif_keep_col.
x_train = x_train.loc[:, vif_keep_col]
x_val = x_val.loc[:, vif_keep_col]

In [None]:
from pandas_profiling import ProfileReport
# Build a profiling report for x_train as it stands at this point and render
# it with the report's notebook-widget view.
profile_report = ProfileReport(x_train, title="Pandas Profiling Report")
profile_report.to_widgets()

In [None]:
data_exploration_path = os.path.join(output_path, "data_exploration")
# Unlike the model/metrics/prediction folders, this directory is not created
# by the setup cell, so make sure it exists before the report is written.
os.makedirs(data_exploration_path, exist_ok=True)

profile_report.to_file(os.path.join(data_exploration_path, "profile_report_2__%s.html"%timestr)) 

### Test Predictions

In [None]:
# Keep the key aside so every prediction can be matched back to its row.
x_pk_predict = x_test[primary_key]
# `columns=` rather than a positional axis argument: passing the axis
# positionally to drop() was deprecated in pandas 1.3 and removed in 2.0.
x_predict = x_test.drop(columns=[primary_key])

predicted = rf_model_imp.predict(x_predict)
predict_df = pd.DataFrame(predicted,
                          columns=["Loan Sanction Amount (USD)"])

# predict_df gets a default 0..n-1 index, which lines up with x_test as
# loaded by read_csv, so an index join pairs each key with its prediction.
output_dataframe = pd.merge(x_pk_predict,
                            predict_df,
                            how="left",
                            left_index=True,
                            right_index=True)

In [None]:
# Write the key/prediction table to the predictions folder, tagged with the run timestamp.
prediction_csv = os.path.join(predict_path, "rf_model_imp__%s.csv" % timestr)
output_dataframe.to_csv(prediction_csv, index=False)