In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
import numpy as np
import xgboost
import pickle
import warnings
# Silence every Python warning for the rest of the session (deprecation notices included).
warnings.filterwarnings('ignore')
# Seed NumPy's legacy global random number generator.
np.random.seed(32)

In [None]:
%run /users/sahayk/risk_ml_modeling_framework/feature_selection

In [None]:
%run /users/sahayk/risk_ml_modeling_framework/random_search

In [None]:
%run /users/sahayk/risk_ml_modeling_framework/model_evaluation

In [None]:
def featureImportance(model, X):
  """Build a per-feature importance table for a fitted model.

  Combines `model.feature_importances_` (labelled with the columns of `X`)
  with the booster's 'gain' scores from `model.get_booster().get_score`.

  Returns a DataFrame with the columns 'feature', 'feature_importance',
  'cumulative_feature_importance' and 'gain', ordered by 'gain' descending.
  Features with no gain entry keep NaN in 'gain'. The cumulative column is
  accumulated in descending 'feature_importance' order, i.e. before the
  final sort on 'gain'.
  """
  importance_type = 'gain'

  # Importances as reported by the estimator, one row per column of X.
  ranked = pd.DataFrame(model.feature_importances_, index = X.columns).reset_index()
  ranked.columns = ['feature', 'feature_importance']
  ranked = ranked.sort_values(by = 'feature_importance', ascending = False)
  ranked = ranked.assign(cumulative_feature_importance = ranked['feature_importance'].cumsum())

  # Gain scores as reported by the underlying booster.
  raw_scores = model.get_booster().get_score(importance_type = importance_type)
  gain_table = pd.DataFrame.from_dict(raw_scores, orient = 'index').reset_index()
  gain_table.columns = ['feature', 'gain']

  # Left join keeps every feature of X, even those absent from the gain scores.
  combined = ranked.merge(gain_table, on = 'feature', how = 'left')
  return combined.sort_values('gain', ascending = False)

def formatKSReport(ks_report):
  """Relabel and rescale a 20-column KS report in place.

  The columns are renamed positionally, the '<----' marker in 'max_ks' is
  shortened to '<', 'decile' is shifted by 5, and 'cumulative_event',
  'cumulative_non_event' and 'ks' are divided by 100.

  The frame passed in is modified and the same object is returned.
  """
  report_columns = [
    'decile', 'min', 'max', 'event', 'non_event', 'total', 'event_rate',
    'cumulative_event', 'cumulative_non_event', 'ks', 'max_ks',
    'l_cumulative_event', 'cumulative_total', 'l_cumulative_total', 'area',
    'cumulative_area', 'area_b', 'pi', 'gini', 'auc',
  ]
  ks_report.columns = report_columns

  # Shorten the arrow marker; every other value is kept as it is.
  is_arrow = ks_report['max_ks'] == '<----'
  ks_report['max_ks'] = np.where(is_arrow, '<', ks_report['max_ks'])

  ks_report['decile'] = ks_report['decile'] + 5

  # Divide the three columns below by 100.
  for pct_column in ('cumulative_event', 'cumulative_non_event', 'ks'):
    ks_report[pct_column] = ks_report[pct_column] / 100

  return ks_report

In [None]:
# Base modelling dataset, read from DBFS with pandas.
df = pd.read_csv('/dbfs/FileStore/sahayk/us_export_propensity/input_data/us_export_propensity_data_prep_and_derivations.csv')

# Location-cluster score (k500): one table per sample split, pulled straight into pandas.
loc_scr_train_df = spark.sql('select duns, location_cluster_score1 as location_cluster_score_k500, append_year, append_month, export from workarea.us_export_propensity_analytic_dataset_train_location_cluster_score_derived_attribute_k500').toPandas()
loc_scr_val_df = spark.sql('select duns, location_cluster_score1 as location_cluster_score_k500, append_year, append_month, export from workarea.us_export_propensity_analytic_dataset_val_location_cluster_score_derived_attribute_k500').toPandas()
loc_scr_test_df = spark.sql('select duns, location_cluster_score1 as location_cluster_score_k500, append_year, append_month, export from workarea.us_export_propensity_analytic_dataset_test_location_cluster_score_derived_attribute_k500').toPandas()

# Importer-derived attributes: the Spark handle is kept next to its pandas copy.
trade_train = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_train_importer_derived_attribute')
trade_train_df = trade_train.toPandas()
trade_val = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_val_importer_derived_attribute')
trade_val_df = trade_val.toPandas()
trade_test = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_test_importer_derived_attribute')
trade_test_df = trade_test.toPandas()

# Job-title-derived attributes, same pattern as above.
train_jobtitle = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_train_job_title_derived_attribute')
train_jobtitle_df = train_jobtitle.toPandas()
val_jobtitle = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_val_job_title_derived_attribute')
val_jobtitle_df = val_jobtitle.toPandas()
test_jobtitle = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_test_job_title_derived_attribute')
test_jobtitle_df = test_jobtitle.toPandas()

# Every row gets a unit weight.
df['weight'] = 1

# Any value other than 1 (NaN included) is forced to 0.
gctrs_flag = df['ind_gctrs_3yrs']
df['ind_gctrs_3yrs'] = np.where(gctrs_flag != 1, 0, gctrs_flag)

# Partition the base data by its sample_type label.
train = df.loc[df['sample_type'] == 'train']
val = df.loc[df['sample_type'] == 'val']
test = df.loc[df['sample_type'] == 'test']

# Left-join the derived-attribute tables onto each split, in the order
# location score -> importer attributes -> job-title attributes.
join_keys = ['duns', 'append_year', 'append_month', 'export']

for addon in (loc_scr_train_df, trade_train_df, train_jobtitle_df):
  train = pd.merge(train, addon, on = join_keys, how = 'left')

for addon in (loc_scr_val_df, trade_val_df, val_jobtitle_df):
  val = pd.merge(val, addon, on = join_keys, how = 'left')

for addon in (loc_scr_test_df, trade_test_df, test_jobtitle_df):
  test = pd.merge(test, addon, on = join_keys, how = 'left')

# Working copies; the merged frames above are left untouched by later steps.
train2 = train.copy()
val2 = val.copy()
test2 = test.copy()

# Columns carried through to the output files alongside the model features.
non_predictors = ['duns', 'export', 'sample_type', 'append_year', 'append_month', 'weight']

# Model inputs. 'location_cluster_score' and 'export_business_name_ind' are the
# post-rename names produced at the end of this cell.
predictors = [
'gctrs_ttl_signal_3yrs',
'ind_gctrs_3yrs',
'gctrs_cnt_unq_yrs',
'npayexp',
'sic4_score',
'nloc',
'gctrs_cnt_unq_customer_country',
'nrectyp',
'satis',
'sales',
'drp_paydex1_loc_decile',
'location_growth_score',
'ncomptype',
'nimptexpt',
'foreign_trade_buyer_ind',
'location_cluster_score',
'miny_ownd_ind',
'export_job_title_ind',
'ba_sum_excl_12m',
'loc_pct_rent_1',
'sml_bus_ind',
'loc_pct_comptype_g',
'export_business_name_ind',
'ucc_flng_3yr_cnt',
'ba_count_info_src_12m',
'chg_tot_emp',
'inds_norm_pydx_scr',
'drp_sales_loc_decile',
]

def _dropUnscoredExporters(frame):
  """Flag rows with export == 1 and a missing sic4_score, then return only the unflagged rows.

  The 0/1 flag is stored on `frame` as 'drop_ind' and stays (all zeros) in the result.
  """
  unscored_exporter = (frame['export'] == 1) & (frame['sic4_score'].isna())
  frame['drop_ind'] = np.where(unscored_exporter, 1, 0)
  return frame[frame['drop_ind'] == 0]

train2 = _dropUnscoredExporters(train2)
val2 = _dropUnscoredExporters(val2)
test2 = _dropUnscoredExporters(test2)

# A bare expression in the middle of a cell is evaluated and thrown away, so the
# post-filter export rates are printed explicitly.
print('export rate (train, val, test):', (train2['export'].mean(), val2['export'].mean(), test2['export'].mean()))

# Align the merged column names with the names used in `predictors`.
# NOTE(review): if a frame already carries a column with one of the target names,
# this rename produces duplicate labels — confirm against the input schema.
column_renames = {'location_cluster_score_k500': 'location_cluster_score',
                  'exp_prop_bus_nme_ind': 'export_business_name_ind'}
train2 = train2.rename(columns = column_renames)
val2 = val2.rename(columns = column_renames)
test2 = test2.rename(columns = column_renames)

# The notebook imports the package as `xgboost`; no `xgb` alias is defined in this
# file, so the class is referenced through the imported module name.
model = xgboost.XGBClassifier(colsample_bylevel = 0.5, colsample_bytree = 0.6, learning_rate = 0.05, max_depth = 3, min_child_weight = 6, n_estimators = 400, subsample = 0.6)
# Validation AUC drives early stopping (8 rounds without improvement).
model.fit(train2[predictors], train2['export'], sample_weight = train2['weight'], eval_set = [(val2[predictors], val2['export'], val2['weight'])], eval_metric = 'auc', early_stopping_rounds = 8)

model_path = '/dbfs/FileStore/sahayk/us_export_propensity/pickles/us_export_propensity_model_20220601.pkl'

# Context managers close the file handles deterministically.
with open(model_path, 'wb') as model_file:
  pickle.dump(model, model_file)

# Reload the artifact just written so the steps below use the persisted model.
# pickle.load executes code from the file; only load pickles from trusted locations.
with open(model_path, 'rb') as model_file:
  model = pickle.load(model_file)

# Feature names exactly as stored on the fitted booster.
feature_list = list(model.get_booster().feature_names)

# Score each split; column 1 of predict_proba is stored as the predicted export score.
for scored in (train2, val2, test2):
  class_probs = model.predict_proba(scored[feature_list], ntree_limit = model.best_ntree_limit)
  scored['predicted_export'] = class_probs[:, 1]

feat_imp = featureImportance(model, train2[feature_list])

In [None]:
# Render the feature-importance table computed from the training features.
display(feat_imp)

In [None]:
ks = metrics(None, None, None, None, None, None)

# Raw 10-bin KS reports, computed in the order train -> val -> test.
ks_report_train, ks_report_val, ks_report_test = [
  ks.KS_train(split['export'], split['predicted_export'], split['weight'], bins = 10)
  for split in (train2, val2, test2)
]

# Relabel and rescale each report with formatKSReport.
ks_report_train, ks_report_val, ks_report_test = [
  formatKSReport(report)
  for report in (ks_report_train, ks_report_val, ks_report_test)
]

In [None]:
# Formatted KS report for the training split.
display(ks_report_train)

In [None]:
# Formatted KS report for the validation split.
display(ks_report_val)

In [None]:
# Formatted KS report for the test split.
display(ks_report_test)

In [None]:
# Identifier/target columns, then the model features, then the score.
output_columns = non_predictors + feature_list + ['predicted_export']

train_final = train2[output_columns]
val_final = val2[output_columns]
test_final = test2[output_columns]

# Stack the three splits and write them out as a single CSV.
scored_all = pd.concat([train_final, val_final, test_final], ignore_index = True)
scored_all.to_csv('/dbfs/FileStore/sahayk/us_export_propensity/output_data/us_export_propensity_model_output.csv', index = False)

In [None]:
shipment_keys = ['duns', 'append_year', 'append_month', 'export']

# Shipping flag per key, one table per split.
# NOTE(review): dropDuplicates keeps an arbitrary row per key; if non_us_shipment can
# differ within one key the resulting flag is not deterministic — confirm.
shipment_train = (
  spark.sql('select duns, append_year, append_month, export, non_us_shipment as foreign_trade_ind from workarea.us_export_propensity_analytic_dataset_train_shipping_data')
  .dropDuplicates(shipment_keys)
  .toPandas()
)
shipment_val = (
  spark.sql('select duns, append_year, append_month, export, non_us_shipment as foreign_trade_ind from workarea.us_export_propensity_analytic_dataset_val_shipping_data')
  .dropDuplicates(shipment_keys)
  .toPandas()
)
shipment_test = (
  spark.sql('select duns, append_year, append_month, export, non_us_shipment as foreign_trade_ind from workarea.us_export_propensity_analytic_dataset_test_shipping_data')
  .dropDuplicates(shipment_keys)
  .toPandas()
)

# Attach the flag to the scored output of each split.
train_final2 = pd.merge(train_final, shipment_train, on = shipment_keys, how = 'left')
val_final2 = pd.merge(val_final, shipment_val, on = shipment_keys, how = 'left')
test_final2 = pd.merge(test_final, shipment_test, on = shipment_keys, how = 'left')

for overridden in (train_final2, val_final2, test_final2):
  # Rows without a shipping match get flag 0.
  overridden['foreign_trade_ind'] = overridden['foreign_trade_ind'].fillna(0)
  # A flag of 1 overrides the model score with 1; other rows keep their score.
  overridden['predicted_export'] = np.where(overridden['foreign_trade_ind'] == 1, 1, overridden['predicted_export'])

In [None]:
ks2 = metrics(None, None, None, None, None, None)

# Raw 10-bin KS reports on the overridden scores, in the order train -> val -> test.
ks_report_train2, ks_report_val2, ks_report_test2 = [
  ks2.KS_train(split['export'], split['predicted_export'], split['weight'], bins = 10)
  for split in (train_final2, val_final2, test_final2)
]

# Relabel and rescale each report with formatKSReport.
ks_report_train2, ks_report_val2, ks_report_test2 = [
  formatKSReport(report)
  for report in (ks_report_train2, ks_report_val2, ks_report_test2)
]

In [None]:
# Formatted KS report for the training split, computed on the overridden scores.
display(ks_report_train2)

In [None]:
# Formatted KS report for the validation split, computed on the overridden scores.
display(ks_report_val2)

In [None]:
# Formatted KS report for the test split, computed on the overridden scores.
display(ks_report_test2)

In [None]:
# Stack the three overridden splits and write them out as a single CSV.
overridden_all = pd.concat([train_final2, val_final2, test_final2], ignore_index = True)
overridden_all.to_csv('/dbfs/FileStore/sahayk/us_export_propensity/output_data/us_export_propensity_model_output_override.csv', index = False)