In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
import numpy as np
import xgboost
from sklearn.linear_model import LogisticRegression
import pickle
import warnings
warnings.filterwarnings('ignore')

In [None]:
%run /Users/sahayk/risk_ml_modeling_framework/feature_selection

In [None]:
%run /users/sahayk/risk_ml_modeling_framework/random_search

In [None]:
%run /Users/sahayk/risk_ml_modeling_framework/model_evaluation

In [None]:
def featureImportance(model, X):
  """Build a per-feature importance table for a fitted boosted-tree model.

  Parameters
  ----------
  model : object exposing `feature_importances_` and `get_booster().get_score(...)`.
  X : DataFrame whose columns label the entries of `model.feature_importances_`.

  Returns
  -------
  DataFrame with columns `feature`, `feature_importance`,
  `cumulative_feature_importance` and `gain`, ordered by `gain` descending.
  Features missing from the booster's gain scores keep a NaN gain.
  """
  importance = pd.DataFrame(model.feature_importances_, index = X.columns).reset_index()
  importance.columns = ['feature', 'feature_importance']
  importance = importance.sort_values(by = 'feature_importance', ascending = False)
  # Running total is taken in descending-importance order, before the final re-sort by gain.
  importance['cumulative_feature_importance'] = importance['feature_importance'].cumsum()

  gain_scores = model.get_booster().get_score(importance_type = 'gain')
  gain = pd.DataFrame.from_dict(gain_scores, orient = 'index').reset_index()
  gain.columns = ['feature', 'gain']

  # Left join keeps every feature of X, including those without a gain score.
  merged = importance.merge(gain, on = 'feature', how = 'left')
  return merged.sort_values('gain', ascending = False)

def formatKSReport(ks_report):
  """Rename and rescale a 20-column KS report in place and return it.

  The frame's columns are relabelled positionally, the '<----' marker in
  `max_ks` is shortened to '<', `decile` is shifted by 5, and the
  `cumulative_event`, `cumulative_non_event` and `ks` columns are divided by 100.
  The input frame itself is modified; the same object is returned.
  """
  ks_report.columns = ['decile', 'min', 'max', 'event', 'non_event', 'total', 'event_rate', 'cumulative_event', 'cumulative_non_event', 'ks', 'max_ks', 'l_cumulative_event', 'cumulative_total', 'l_cumulative_total', 'area', 'cumulative_area', 'area_b', 'pi', 'gini', 'auc']

  is_max_ks_row = ks_report['max_ks'] == '<----'
  ks_report['max_ks'] = np.where(is_max_ks_row, '<', ks_report['max_ks'])
  ks_report['decile'] = ks_report['decile'] + 5

  # These three columns are rescaled by a factor of 100.
  for scaled_col in ('cumulative_event', 'cumulative_non_event', 'ks'):
    ks_report[scaled_col] = ks_report[scaled_col]/100
  return ks_report

In [None]:
# Columns of interest in the modelling dataset: candidate predictors plus the
# identifier / target / partition columns ('duns', 'sample_type', 'export',
# 'append_year', 'append_month'). The non-predictor columns are removed again
# when the `predictors` list is derived from this list in the data-prep cell.
input_vars = [
'duns',
'totempl',
'cpoints',
'fspoints',
'hi_cr_tot_amt',
'inds_norm_pydx_scr',
'facy_sq_ft_msmt',
'ucc_flng_3yr_cnt',
'hl_ceo_chg_code',
'fam_mbr_cnt',
'acct_cnt',
'extl_url_on_hmpg_cnt',
'res_pop_code',
'tot_12_mo_acct',
'tot_amt_owg_rcnt',
'hi_cr_3_mo',
'tot_12_mo_pastdue_acc',
'mths_snc_hi_pdue',
'pdx_12mth_curr',
'tot_avg_past30p_12mths',
'tot_avg_past90p_12mths',
'spnd_buyers_12m',
'spnd_min_wsfo_12m',
'spnd_dx13_12m',
'spnd_avapo_r_12m',
'spnd_avapo_f_12m',
'spnd_max_avapo_o_12m',
'spnd_dllrs_o_12m',
'spnd_dllrs_12m',
'spnd_max_avapo_f_12m',
'spnd_max_avapo_r_12m',
'spnd_avapo_12m',
'spnd_dx40_b_12m',
'ba_count_info_src_12m',
'ba_cr_activity_12m',
'ba_cr_cat_insur_12m',
'ba_cr_cat_marketing_12m',
'ba_cr_stability_12m',
'ba_sum_excl_12m',
'ba_sum_info_gov_12m',
'ba_cnt_cr_12m',
'ba_cr_cat_other_12m',
'ba_cr_cat_online_3m',
'drp_npayexp_loc_decile',
'drp_pexp_cr_loc_decile',
'drp_paydex1_loc_decile',
'drp_pslow90p_loc_decile',
'drp_sales_loc_decile',
'drp_tlp_score_loc_decile',
'loc_avg_npayexp',
'loc_avg_dol30pl',
'location_growth_score',
'loc_pct_rent_1',
'loc_avg_sales',
'loc_pct_hqbr_b',
'loc_pct_hqbr_r',
'loc_pct_hqbr_s',
'loc_pct_comptype_g',
'loc_pct_via3_robust',
'chg_phon',
'chg_tot_emp',
'inq_sic50_inq_24m',
'inq_inquirer_sic_6_60m',
'inq_inquirer_sic_0_60m',
'inq_inquirer_sic_9_60m',
'inq_inquirer_sic_5_60m',
'inq_sic47_inq_60m',
'inq_inquiry_duns_48m',
'gctrs_cnt_unq_customer_country',
'gctrs_cnt_unq_yrs',
'gctrs_ttl_signal_3yrs',
'ind_gctrs_3yrs',
'npayexp',
'satis',
'emplhere',
'numpaydex7',
'sales',
'bnkrpt',
'nnetwrthsign',
'ngndr',
'nlocn',
'nimptexpt',
'nbusowrp',
'nrectyp',
'sml_bus_ind',
'public_ind',
'womn_ownd_ind',
'miny_ownd_ind',
'univ_ind',
'pydexvar',
'lien_ind',
'suits_ind',
'judg_ind',
'hca_pct_hcm',
'ncomptype',
'nloc',
'location_cluster_score_k500',
'sic4_score',
'exp_prop_bus_nme_ind',
'foreign_trade_buyer_ind',
'export_job_title_ind',
'sample_type',
'export',
'append_year',
'append_month']

In [None]:
# Every derived-attribute table read in this cell shares this name prefix;
# the split name ('train' / 'val' / 'test') is substituted into each template.
table_prefix = 'workarea.us_export_propensity_analytic_dataset_'

# Location cluster score (k500): selected columns only, pulled straight into pandas.
loc_scr_query = 'select duns, location_cluster_score1 as location_cluster_score_k500, append_year, append_month, export from ' + table_prefix + '{}_location_cluster_score_derived_attribute_k500'
loc_scr_train_df = spark.sql(loc_scr_query.format('train')).toPandas()
loc_scr_val_df = spark.sql(loc_scr_query.format('val')).toPandas()
loc_scr_test_df = spark.sql(loc_scr_query.format('test')).toPandas()

# Importer derived attributes: keep the Spark frames and their pandas copies.
importer_query = 'select * from ' + table_prefix + '{}_importer_derived_attribute'
trade_train = spark.sql(importer_query.format('train'))
val_train = spark.sql(importer_query.format('val'))
test_train = spark.sql(importer_query.format('test'))
trade_train_df, val_train_df, test_train_df = trade_train.toPandas(), val_train.toPandas(), test_train.toPandas()

# Job-title derived attributes: same pattern as the importer tables.
jobtitle_query = 'select * from ' + table_prefix + '{}_job_title_derived_attribute'
train_jobtitle = spark.sql(jobtitle_query.format('train'))
val_jobtitle = spark.sql(jobtitle_query.format('val'))
test_jobtitle = spark.sql(jobtitle_query.format('test'))
train_jobtitle_df, val_jobtitle_df, test_jobtitle_df = train_jobtitle.toPandas(), val_jobtitle.toPandas(), test_jobtitle.toPandas()

In [None]:
# Load the prepared modelling dataset; it carries a `sample_type` column with
# the values 'train' / 'val' / 'test' used to split it below.
df = pd.read_csv('/dbfs/FileStore/sahayk/us_export_propensity/input_data/us_export_propensity_data_prep_and_derivations.csv')

# Collapse the indicator to {0, 1}: every value other than 1 becomes 0
# (missing values included, because NaN != 1 evaluates to True).
df['ind_gctrs_3yrs'] = np.where(df['ind_gctrs_3yrs'] != 1, 0, df['ind_gctrs_3yrs'])

# Every observation gets a unit weight.
df['weight'] = 1

train = df[df['sample_type'] == 'train']
val = df[df['sample_type'] == 'val']
test = df[df['sample_type'] == 'test']

# Left-join the three derived-attribute sources onto each split, in the order
# location cluster score -> importer attributes -> job-title attributes.
join_keys = ['duns', 'append_year', 'append_month', 'export']
derived_attribute_frames = [
  (loc_scr_train_df, loc_scr_val_df, loc_scr_test_df),
  (trade_train_df, val_train_df, test_train_df),
  (train_jobtitle_df, val_jobtitle_df, test_jobtitle_df),
]
for train_attr, val_attr, test_attr in derived_attribute_frames:
  train = pd.merge(train, train_attr, on = join_keys, how = 'left')
  val = pd.merge(val, val_attr, on = join_keys, how = 'left')
  test = pd.merge(test, test_attr, on = join_keys, how = 'left')

non_predictors = ['duns', 'export', 'sample_type', 'append_year', 'append_month', 'weight']
# Keep the order of input_vars (duplicates removed). A plain set difference
# yields a hash-dependent order that changes between kernel restarts, which
# makes the feature order fed to the models non-reproducible.
predictors = [var for var in dict.fromkeys(input_vars) if var not in non_predictors]

train2 = train.copy()
val2 = val.copy()
test2 = test.copy()

# Flag rows where export == 1 but sic4_score is missing; those rows are removed.
train2['drop_ind'] = np.where((train2['export'] == 1) & (train2['sic4_score'].isna()), 1, 0)
val2['drop_ind'] = np.where((val2['export'] == 1) & (val2['sic4_score'].isna()), 1, 0)
test2['drop_ind'] = np.where((test2['export'] == 1) & (test2['sic4_score'].isna()), 1, 0)

# .copy() makes the filtered frames independent, so the prediction columns
# added in later cells are not written into a slice of the unfiltered frame.
train2 = train2[train2['drop_ind'] == 0].copy()
val2 = val2[val2['drop_ind'] == 0].copy()
test2 = test2[test2['drop_ind'] == 0].copy()

In [None]:
# The sampling expressions in `params` draw from NumPy's global RNG, so seed it
# to make the sampled hyper-parameter combinations repeatable across runs.
# NOTE(review): presumably ks_search evaluates `params` once per iteration — confirm in random_search.
np.random.seed(42)

# Search space, kept as a string of np.random expressions.
params = "{'colsample_bylevel':np.random.randint(5,10)/10,'colsample_bytree':np.random.randint(5,10)/10,'learning_rate': np.random.choice([0.1 , 0.01 , 0.05 , 0.075]),'max_depth':np.random.randint(3,6),'min_child_weight':np.random.randint(3,10),'n_estimators': np.random.choice([300,400,500,600]),'subsample':np.random.randint(5,10)/10}"

# 100-iteration search on the full predictor list, with train and validation data and weights.
rs = ks_search(100, params, train2[predictors], train2['export'], train2['weight'], val2[predictors], val2['export'], val2['weight'])

rs.randomsearch_KS()

In [None]:
# Stack the result frames collected by the first random search into one table.
output_full = rs.result_out
hyper_param = pd.concat(output_full)
hyper_param.columns = ['train', 'val', 'diff', 'event_rate', 'min_prob', 'max_prob', 'predicted_event_rate', 'hyper_param', 'iteration', 'best_n_tree']

# Separate the metric columns from the hyper-parameter strings; each string is
# eval'd and expanded into one column per key.
# NOTE(review): eval executes the stored text — presumably generated by ks_search itself; confirm.
search_metrics = hyper_param.drop(columns = ['hyper_param'])
search_params = hyper_param['hyper_param'].map(eval).apply(pd.Series)

random_search_result = pd.concat([search_metrics, search_params], axis = 1).reset_index()
random_search_result.columns = ['metric', 'train', 'val', 'diff', 'event_rate', 'min_prob', 'max_prob', 'predicted_event_rate', 'iteration', 'best_n_tree', 'colsample_bylevel', 'colsample_bytree', 'learning_rate', 'max_depth', 'min_child_weight', 'n_estimator', 'subsample']

In [None]:
# Model 1: XGBoost classifier on the full predictor list with fixed hyper-parameters.
# The class is reached through the `xgboost` module imported in the first cell;
# that cell does not define an `xgb` alias.
model1 = xgboost.XGBClassifier(colsample_bylevel = 0.6, colsample_bytree = 0.6, learning_rate = 0.075, max_depth = 3, min_child_weight = 9, n_estimators = 600, subsample = 0.5)

# eval_set takes (X, y) pairs; validation weights belong in sample_weight_eval_set.
# Validation AUC is monitored, with early stopping after 8 rounds.
model1.fit(
  train2[predictors], train2['export'],
  sample_weight = train2['weight'],
  eval_set = [(val2[predictors], val2['export'])],
  sample_weight_eval_set = [val2['weight']],
  eval_metric = 'auc',
  early_stopping_rounds = 8)

# Score with the booster's own feature order so columns line up with training.
feature_list = list(model1.get_booster().feature_names)

# Probability of the positive class (column 1), limited to the best early-stopped tree count.
train2['predicted_export'] = model1.predict_proba(train2[feature_list], ntree_limit = model1.best_ntree_limit)[:,1]
val2['predicted_export'] = model1.predict_proba(val2[feature_list], ntree_limit = model1.best_ntree_limit)[:,1]
test2['predicted_export'] = model1.predict_proba(test2[feature_list], ntree_limit = model1.best_ntree_limit)[:,1]

feat_imp = featureImportance(model1, train2[feature_list])

In [None]:
# Show the model 1 feature-importance table.
display(feat_imp)

In [None]:
# Build a 10-bin KS report for model 1 on each split from the actual target,
# the predicted probability and the weight, then relabel/rescale with formatKSReport.
# NOTE(review): KS_train is called for val and test too — presumably it only
# reports on whatever data it is given; confirm in model_evaluation.
ks = metrics(None, None, None, None, None, None)
ks_report_train = ks.KS_train(train2['export'], train2['predicted_export'], train2['weight'], bins = 10)
ks_report_val = ks.KS_train(val2['export'], val2['predicted_export'], val2['weight'], bins = 10)
ks_report_test = ks.KS_train(test2['export'], test2['predicted_export'], test2['weight'], bins = 10)

# formatKSReport modifies the frame it receives and returns that same frame.
ks_report_train = formatKSReport(ks_report_train)
ks_report_val = formatKSReport(ks_report_val)
ks_report_test = formatKSReport(ks_report_test)

In [None]:
# Show the model 1 KS report for the train split.
display(ks_report_train)

In [None]:
# Show the model 1 KS report for the validation split.
display(ks_report_val)

In [None]:
# Show the model 1 KS report for the test split.
display(ks_report_test)

In [None]:
# Keep the features whose cumulative importance in model 1 is at most 0.90.
predictors2 = list(feat_imp[feat_imp['cumulative_feature_importance'] <= 0.90]['feature'])

# The sampling expressions in `params` draw from NumPy's global RNG, so seed it
# to make the sampled hyper-parameter combinations repeatable across runs.
# NOTE(review): presumably ks_search evaluates `params` once per iteration — confirm in random_search.
np.random.seed(42)

# Search space for the reduced feature set, kept as a string of np.random expressions.
params = "{'colsample_bylevel':np.random.randint(5,10)/10,'colsample_bytree':np.random.randint(5,10)/10,'learning_rate': np.random.choice([0.1 , 0.01 , 0.05 , 0.075]),'max_depth':np.random.randint(3,5),'min_child_weight':np.random.randint(3,10),'n_estimators': np.random.choice([300,400,500]),'subsample':np.random.randint(5,10)/10}"

# 50-iteration search restricted to predictors2.
rs2 = ks_search(50, params, train2[predictors2], train2['export'], train2['weight'], val2[predictors2], val2['export'], val2['weight'])

rs2.randomsearch_KS()

In [None]:
# Stack the result frames collected by the second random search into one table.
output_full2 = rs2.result_out
hyper_param2 = pd.concat(output_full2)
hyper_param2.columns = ['train', 'val', 'diff', 'event_rate', 'min_prob', 'max_prob', 'predicted_event_rate', 'hyper_param', 'iteration', 'best_n_tree']

# Separate the metric columns from the hyper-parameter strings; each string is
# eval'd and expanded into one column per key.
# NOTE(review): eval executes the stored text — presumably generated by ks_search itself; confirm.
search_metrics2 = hyper_param2.drop(columns = ['hyper_param'])
search_params2 = hyper_param2['hyper_param'].map(eval).apply(pd.Series)

random_search_result2 = pd.concat([search_metrics2, search_params2], axis = 1).reset_index()
random_search_result2.columns = ['metric', 'train', 'val', 'diff', 'event_rate', 'min_prob', 'max_prob', 'predicted_event_rate', 'iteration', 'best_n_tree', 'colsample_bylevel', 'colsample_bytree', 'learning_rate', 'max_depth', 'min_child_weight', 'n_estimator', 'subsample']

In [None]:
# Model 2: XGBoost classifier on the reduced predictors2 list with fixed hyper-parameters.
# The class is reached through the `xgboost` module imported in the first cell;
# that cell does not define an `xgb` alias.
model2 = xgboost.XGBClassifier(colsample_bylevel = 0.7, colsample_bytree = 0.9, learning_rate = 0.05, max_depth = 4, min_child_weight = 9, n_estimators = 300, subsample = 0.7)

# eval_set takes (X, y) pairs; validation weights belong in sample_weight_eval_set.
# Validation AUC is monitored, with early stopping after 8 rounds.
model2.fit(
  train2[predictors2], train2['export'],
  sample_weight = train2['weight'],
  eval_set = [(val2[predictors2], val2['export'])],
  sample_weight_eval_set = [val2['weight']],
  eval_metric = 'auc',
  early_stopping_rounds = 8)

# Score with the booster's own feature order so columns line up with training.
feature_list2 = list(model2.get_booster().feature_names)

# Probability of the positive class (column 1), limited to the best early-stopped tree count.
train2['predicted_export2'] = model2.predict_proba(train2[feature_list2], ntree_limit = model2.best_ntree_limit)[:,1]
val2['predicted_export2'] = model2.predict_proba(val2[feature_list2], ntree_limit = model2.best_ntree_limit)[:,1]
test2['predicted_export2'] = model2.predict_proba(test2[feature_list2], ntree_limit = model2.best_ntree_limit)[:,1]

feat_imp2 = featureImportance(model2, train2[feature_list2])

display(feat_imp2)

In [None]:
# Build a 10-bin KS report for model 2 on each split from the actual target,
# the predicted probability and the weight, then relabel/rescale with formatKSReport.
# NOTE(review): KS_train is called for val and test too — presumably it only
# reports on whatever data it is given; confirm in model_evaluation.
ks2 = metrics(None, None, None, None, None, None)
ks_report_train2 = ks2.KS_train(train2['export'], train2['predicted_export2'], train2['weight'], bins = 10)
ks_report_val2 = ks2.KS_train(val2['export'], val2['predicted_export2'], val2['weight'], bins = 10)
ks_report_test2 = ks2.KS_train(test2['export'], test2['predicted_export2'], test2['weight'], bins = 10)

# formatKSReport modifies the frame it receives and returns that same frame.
ks_report_train2 = formatKSReport(ks_report_train2)
ks_report_val2 = formatKSReport(ks_report_val2)
ks_report_test2 = formatKSReport(ks_report_test2)

In [None]:
# Show the model 2 KS report for the train split.
display(ks_report_train2)

In [None]:
# Show the model 2 KS report for the validation split.
display(ks_report_val2)

In [None]:
# Show the model 2 KS report for the test split.
display(ks_report_test2)

In [None]:
# Narrow each split to identifiers, the model 2 features and the model 2 score.
# .copy() makes these independent frames, so the column assignment below does
# not write into a slice of train2 / val2 / test2.
train_final = train2[non_predictors + feature_list2 + ['predicted_export2']].copy()
val_final = val2[non_predictors + feature_list2 + ['predicted_export2']].copy()
test_final = test2[non_predictors + feature_list2 + ['predicted_export2']].copy()

final_features = list(feat_imp2['feature'])

train_final['weight'] = 1
# NOTE(review): `preprocess` comes from the %run feature_selection notebook; the
# thresholds are presumably cut-offs for missing share and pairwise correlation — confirm.
fs = preprocess(train_final[final_features], train_final['export'],  train_final['weight'])
fs.identify_all({'missing_threshold': 0.95, 'correlation_threshold': 0.90})

# Binning table from the selector, cast to strings and relabelled positionally (15 columns expected).
binning_data = fs.iv_df.astype(str)
binning_data.columns = ['variable_name', 'min_value', 'max_value', 'count', 'dist_count', 'event', 'event_rate', 'non_event', 'non_event_rate',
                       'dist_event', 'dist_non_event', 'woe', 'force', 'iv_bin', 'iv_overall']
binning_data2 = binning_data.drop(columns = 'force')

In [None]:
# Univariate screen for the model 2 predictors: fit a one-variable logistic
# regression per feature and record its KS and Gini on train and validation.
train_ks_list = []
val_ks_list = []
train_gini_list = []
val_gini_list = []
univariate_vars = list(train_final[predictors2].columns)
for var_name in univariate_vars:
  print("[INFO] Variable Name: {}".format(var_name))
  # Rows with a missing value for this variable are excluded; .copy() gives
  # independent frames so the score column is not written into a slice.
  train_tmp = train_final[~train_final[var_name].isna()].copy()
  val_tmp = val_final[~val_final[var_name].isna()].copy()
  # The target is passed as a 1-D Series, the shape scikit-learn expects for y.
  clf = LogisticRegression(random_state=0).fit(train_tmp[[var_name]], train_tmp['export'])
  train_tmp['unv_predicted_export'] = clf.predict_proba(train_tmp[[var_name]])[:,1]
  val_tmp['unv_predicted_export'] = clf.predict_proba(val_tmp[[var_name]])[:,1]
  # Loop-local names (unv_*) keep the notebook-level `ks`, `ks_report_train`
  # and `ks_report_val` from the model 1 cells intact.
  unv_ks = metrics(None, None, None, None, None, None)
  unv_report_train = unv_ks.KS_train(train_tmp['export'], train_tmp['unv_predicted_export'], train_tmp['weight'], bins = 10)
  unv_report_val = unv_ks.KS_train(val_tmp['export'], val_tmp['unv_predicted_export'], val_tmp['weight'], bins = 10)
  unv_report_train = formatKSReport(unv_report_train)
  unv_report_val = formatKSReport(unv_report_val)
  # KS is read from the row flagged '<' in max_ks; Gini is read from the first row.
  train_ks_list.append(unv_report_train[unv_report_train['max_ks'] == '<']['ks'].values[0])
  train_gini_list.append(unv_report_train['gini'].values[0])
  val_ks_list.append(unv_report_val[unv_report_val['max_ks'] == '<']['ks'].values[0])
  val_gini_list.append(unv_report_val['gini'].values[0])

# One row per variable, plus the train-minus-validation gaps.
d = {'variable_name': univariate_vars, 'train_ks': train_ks_list, 'val_ks': val_ks_list, 'train_gini': train_gini_list, 'val_gini': val_gini_list}
ks_gini = pd.DataFrame(data = d)
ks_gini['ks_diff'] = ks_gini['train_ks'] - ks_gini['val_ks']
ks_gini['gini_diff'] = ks_gini['train_gini'] - ks_gini['val_gini']

In [None]:
# Show the univariate KS/Gini table for the model 2 predictors.
display(ks_gini)

In [None]:
# Absolute pairwise correlations of the model 2 predictors, reshaped to long form.
corr_matrix = train_final[predictors2].corr().abs()
corr = corr_matrix.unstack().reset_index()
corr.columns = ['x1', 'x2', 'correlation_coefficient']

# Drop each variable's correlation with itself.
is_off_diagonal = corr['x1'] != corr['x2']
corr = corr[is_off_diagonal]

# Keep pairs with an absolute correlation of at least 0.8; each pair appears in both orders.
is_high = corr['correlation_coefficient'] >= 0.8
hi_corr = corr[is_high]

In [None]:
# Show the highly correlated predictor pairs.
display(hi_corr)

In [None]:
# Hand-specified predictor list used for the third random search and model 3.
# NOTE(review): presumably curated from the model 2 importance, univariate
# KS/Gini and correlation outputs above — the selection rule is not in this notebook; confirm.
predictors3 = [
'gctrs_ttl_signal_3yrs',
'ind_gctrs_3yrs',
'gctrs_cnt_unq_yrs',
'npayexp',
'sic4_score',
'nloc',
'gctrs_cnt_unq_customer_country',
'nrectyp',
'satis',
'sales',
'drp_paydex1_loc_decile',
'location_growth_score',
'ncomptype',
'nimptexpt',
'foreign_trade_buyer_ind',
'location_cluster_score_k500',
'miny_ownd_ind',
'export_job_title_ind',
'ba_sum_excl_12m',
'loc_pct_rent_1',
'sml_bus_ind',
'loc_pct_comptype_g',
'exp_prop_bus_nme_ind',
'ucc_flng_3yr_cnt',
'ba_count_info_src_12m',
'chg_tot_emp',
'inds_norm_pydx_scr',
'drp_sales_loc_decile']

In [None]:
# The sampling expressions in `params` draw from NumPy's global RNG, so seed it
# to make the sampled hyper-parameter combinations repeatable across runs.
# NOTE(review): presumably ks_search evaluates `params` once per iteration — confirm in random_search.
np.random.seed(42)

# Search space for the predictors3 feature set, kept as a string of np.random expressions.
params = "{'colsample_bylevel':np.random.randint(5,10)/10,'colsample_bytree':np.random.randint(5,10)/10,'learning_rate': np.random.choice([0.1 , 0.01 , 0.05 , 0.075]),'max_depth':np.random.randint(3,5),'min_child_weight':np.random.randint(5,10),'n_estimators': np.random.choice([300,400,500]),'subsample':np.random.randint(5,10)/10}"

# 50-iteration search restricted to predictors3.
rs3 = ks_search(50, params, train2[predictors3], train2['export'], train2['weight'], val2[predictors3], val2['export'], val2['weight'])
rs3.randomsearch_KS()

# Stack the search results and expand the hyper-parameter strings into columns.
# NOTE(review): eval executes the stored text — presumably generated by ks_search itself; confirm.
output_full3 = rs3.result_out
hyper_param3 = pd.concat(output_full3)
hyper_param3.columns = ['train', 'val', 'diff', 'event_rate', 'min_prob', 'max_prob', 'predicted_event_rate', 'hyper_param', 'iteration', 'best_n_tree']

random_search_result3 = pd.concat([hyper_param3.drop(['hyper_param'], axis = 1), hyper_param3['hyper_param'].map(eval).apply(pd.Series)], axis = 1).reset_index()
random_search_result3.columns = ['metric', 'train', 'val', 'diff', 'event_rate', 'min_prob', 'max_prob', 'predicted_event_rate', 'iteration', 'best_n_tree', 'colsample_bylevel', 'colsample_bytree', 'learning_rate', 'max_depth', 'min_child_weight', 'n_estimator', 'subsample']

# Model 3: XGBoost classifier on predictors3 with fixed hyper-parameters.
# The class is reached through the `xgboost` module imported in the first cell;
# that cell does not define an `xgb` alias.
model3 = xgboost.XGBClassifier(colsample_bylevel = 0.5, colsample_bytree = 0.6, learning_rate = 0.05, max_depth = 3, min_child_weight = 6, n_estimators = 400, subsample = 0.6)
# eval_set takes (X, y) pairs; validation weights belong in sample_weight_eval_set.
model3.fit(
  train2[predictors3], train2['export'],
  sample_weight = train2['weight'],
  eval_set = [(val2[predictors3], val2['export'])],
  sample_weight_eval_set = [val2['weight']],
  eval_metric = 'auc',
  early_stopping_rounds = 8)
feature_list3 = list(model3.get_booster().feature_names)
# Probability of the positive class (column 1), limited to the best early-stopped tree count.
train2['predicted_export3'] = model3.predict_proba(train2[feature_list3], ntree_limit = model3.best_ntree_limit)[:,1]
val2['predicted_export3'] = model3.predict_proba(val2[feature_list3], ntree_limit = model3.best_ntree_limit)[:,1]
test2['predicted_export3'] = model3.predict_proba(test2[feature_list3], ntree_limit = model3.best_ntree_limit)[:,1]
# Mean predicted probability per split. Printed explicitly: a bare expression
# that is not the last line of a cell is evaluated and then discarded.
print(train2['predicted_export3'].mean(), val2['predicted_export3'].mean(), test2['predicted_export3'].mean())

feat_imp3 = featureImportance(model3, train2[feature_list3])

In [None]:
# Show the model 3 feature-importance table.
display(feat_imp3)

In [None]:
# Build a 10-bin KS report for model 3 on each split from the actual target,
# the predicted probability and the weight, then relabel/rescale with formatKSReport.
# NOTE(review): KS_train is called for val and test too — presumably it only
# reports on whatever data it is given; confirm in model_evaluation.
ks3 = metrics(None, None, None, None, None, None)
ks_report_train3 = ks3.KS_train(train2['export'], train2['predicted_export3'], train2['weight'], bins = 10)
ks_report_val3 = ks3.KS_train(val2['export'], val2['predicted_export3'], val2['weight'], bins = 10)
ks_report_test3 = ks3.KS_train(test2['export'], test2['predicted_export3'], test2['weight'], bins = 10)

# formatKSReport modifies the frame it receives and returns that same frame.
ks_report_train3 = formatKSReport(ks_report_train3)
ks_report_val3 = formatKSReport(ks_report_val3)
ks_report_test3 = formatKSReport(ks_report_test3)

In [None]:
# Show the model 3 KS report for the train split.
display(ks_report_train3)

In [None]:
# Show the model 3 KS report for the validation split.
display(ks_report_val3)

In [None]:
# Show the model 3 KS report for the test split.
display(ks_report_test3)

In [None]:
# Persist the fitted model 3. The context manager closes the file handle as soon
# as the dump finishes, so the pickle is fully flushed to storage.
with open('/dbfs/FileStore/sahayk/us_export_propensity/pickles/us_export_propensity_model_20210912.pkl', 'wb') as model_file:
  pickle.dump(model3, model_file)

In [None]:
# Columns written out: identifiers / target / weight, the model 3 features and the model 3 score.
output_columns = non_predictors + feature_list3 + ['predicted_export3']
train_final, val_final, test_final = (split_df[output_columns] for split_df in (train2, val2, test2))

# Stack the three splits (train, val, test order) with a fresh index and write a single CSV.
model_output = pd.concat([train_final, val_final, test_final], ignore_index = True)
model_output.to_csv('/dbfs/FileStore/sahayk/us_export_propensity/output_data/us_export_propensity_model_output_20210912.csv', index = False)

In [None]:
final_features = list(feat_imp3['feature'])

# assign() returns a new frame with the unit weight column, instead of writing
# into train_final, which was created as a column slice of train2.
train_final = train_final.assign(weight = 1)
# NOTE(review): `preprocess` comes from the %run feature_selection notebook; the
# thresholds are presumably cut-offs for missing share and pairwise correlation — confirm.
fs = preprocess(train_final[final_features], train_final['export'],  train_final['weight'])
fs.identify_all({'missing_threshold': 0.95, 'correlation_threshold': 0.90})

# Binning table from the selector, cast to strings and relabelled positionally (15 columns expected).
binning_data = fs.iv_df.astype(str)
binning_data.columns = ['variable_name', 'min_value', 'max_value', 'count', 'dist_count', 'event', 'event_rate', 'non_event', 'non_event_rate',
                       'dist_event', 'dist_non_event', 'woe', 'force', 'iv_bin', 'iv_overall']
binning_data2 = binning_data.drop(columns = 'force')

In [None]:
# Show the binning table for the model 3 features (without the 'force' column).
display(binning_data2)

In [None]:
# Univariate screen for the model 3 predictors: fit a one-variable logistic
# regression per feature and record its KS and Gini on train and validation.
train_ks_list = []
val_ks_list = []
train_gini_list = []
val_gini_list = []
for var_name in list(train_final[predictors3].columns):
  print("[INFO] Variable Name: {}".format(var_name))
  # Rows with a missing value for this variable are excluded; .copy() gives
  # independent frames so the score column is not written into a slice.
  train_tmp = train_final[~train_final[var_name].isna()].copy()
  val_tmp = val_final[~val_final[var_name].isna()].copy()
  # The target is passed as a 1-D Series, the shape scikit-learn expects for y.
  clf = LogisticRegression(random_state=0).fit(train_tmp[[var_name]], train_tmp['export'])
  train_tmp['unv_predicted_export'] = clf.predict_proba(train_tmp[[var_name]])[:,1]
  val_tmp['unv_predicted_export'] = clf.predict_proba(val_tmp[[var_name]])[:,1]
  # Loop-local names (unv_*) keep the notebook-level `ks`, `ks_report_train`
  # and `ks_report_val` from earlier cells intact.
  unv_ks = metrics(None, None, None, None, None, None)
  unv_report_train = unv_ks.KS_train(train_tmp['export'], train_tmp['unv_predicted_export'], train_tmp['weight'], bins = 10)
  unv_report_val = unv_ks.KS_train(val_tmp['export'], val_tmp['unv_predicted_export'], val_tmp['weight'], bins = 10)
  unv_report_train = formatKSReport(unv_report_train)
  unv_report_val = formatKSReport(unv_report_val)
  # KS is read from the row flagged '<' in max_ks; Gini is read from the first row.
  train_ks_list.append(unv_report_train[unv_report_train['max_ks'] == '<']['ks'].values[0])
  train_gini_list.append(unv_report_train['gini'].values[0])
  val_ks_list.append(unv_report_val[unv_report_val['max_ks'] == '<']['ks'].values[0])
  val_gini_list.append(unv_report_val['gini'].values[0])

In [None]:
# Collect the per-variable univariate metrics for the model 3 predictors into one table.
d = dict(
  variable_name = list(train_final[predictors3].columns),
  train_ks = train_ks_list,
  val_ks = val_ks_list,
  train_gini = train_gini_list,
  val_gini = val_gini_list,
)
ks_gini = pd.DataFrame(data = d)

# Train-minus-validation gaps, appended as the last two columns.
ks_gini = ks_gini.assign(
  ks_diff = ks_gini['train_ks'] - ks_gini['val_ks'],
  gini_diff = ks_gini['train_gini'] - ks_gini['val_gini'],
)

In [None]:
# Show the univariate KS/Gini table for the model 3 predictors.
display(ks_gini)