In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
import re

In [None]:
%run /users/sahayk/risk_ml_modeling_framework/model_evaluation

In [None]:
def formatKSReport(ks_report):
  """Return a display-ready copy of a raw KS report.

  The input frame is left untouched: all relabelling and rescaling is done on
  a copy, so calling this function never alters the caller's DataFrame.

  Parameters
  ----------
  ks_report : pandas.DataFrame
      Raw report whose 20 columns are in the order of the names assigned
      below. pandas raises ``ValueError`` if the column count differs.

  Returns
  -------
  pandas.DataFrame
      Copy of ``ks_report`` with the standard column names, the ``'<----'``
      marker in ``max_ks`` shortened to ``'<'``, ``decile`` shifted by +5 and
      ``cumulative_event`` / ``cumulative_non_event`` / ``ks`` divided by 100.
  """
  report = ks_report.copy()
  report.columns = ['decile', 'min', 'max', 'event', 'non_event', 'total', 'event_rate', 'cumulative_event', 'cumulative_non_event', 'ks', 'max_ks', 'l_cumulative_event', 'cumulative_total', 'l_cumulative_total', 'area', 'cumulative_area', 'area_b', 'pi', 'gini', 'auc']
  # Shorten the max-KS row marker; every other value in the column is kept as-is.
  report['max_ks'] = report['max_ks'].replace('<----', '<')
  # NOTE(review): the +5 offset on the decile label is carried over unchanged; its rationale is not visible here.
  report['decile'] = report['decile'] + 5
  # Presumably these arrive as percentages and are rescaled to fractions -- confirm against the KS producer.
  for scaled_column in ('cumulative_event', 'cumulative_non_event', 'ks'):
    report[scaled_column] = report[scaled_column] / 100
  return report

In [None]:
# Restrict the FRB Atlanta sample to DUNS that resolve to a global-contact record,
# then persist the result both as parquet and as a metastore table.

# Parquet is self-describing: the CSV-only reader options (header / inferSchema) that
# used to be passed to this call had no effect on a parquet read and were removed.
# NOTE(review): the path uses the '/dbfs/...' form; confirm Spark resolves it to the intended location.
df_input = spark.read.parquet('/dbfs/FileStore/sahayk/frb_atl_sampling0_202203.parquet')

# Distinct DUNS found in the 2022-02-20 global-contact load.
df_gca = spark.sql('select duns_number as duns_gca from workarea.global_contact where load_year = 2022 and load_month = 2 and load_date = 20220220')
df_gca = df_gca.dropDuplicates(['duns_gca'])

# Linkage rows for the February 2022 load, plus two 0/1 indicators derived from stat_code / subs_code.
df_wblinkage = spark.sql('select duns_nbr, hq_par_duns_nbr, stat_code, subs_code from workarea.wblinkage where load_year = 2022 and load_month = 2')
is_branch_location = (col('stat_code') == 2) & (col('subs_code') == 0)
is_single_site_subsidiary = (col('stat_code') == 0) & (col('subs_code') == 3)
df_wblinkage = df_wblinkage.withColumn('branch_location_indicator', when(is_branch_location, 1).otherwise(0))
df_wblinkage = df_wblinkage.withColumn('single_site_subsidiaries_indicator', when(is_single_site_subsidiary, 1).otherwise(0))

# Left join keeps contacts without a linkage row; their indicators are null, so they
# fall through to the `otherwise` branch below and keep their own DUNS.
df_gca2 = df_gca.join(df_wblinkage, col('duns_gca') == col('duns_nbr'), how = 'left')

# Contacts flagged by either indicator are replaced by hq_par_duns_nbr.
rolls_up_to_parent = (col('branch_location_indicator') == 1) | (col('single_site_subsidiaries_indicator') == 1)
df_gca2 = df_gca2.withColumn('duns_gca2', when(rolls_up_to_parent, col('hq_par_duns_nbr')).otherwise(col('duns_gca')))

# One row per resolved DUNS, keeping only that key column for the filter join.
df_gca3 = df_gca2.dropDuplicates(['duns_gca2'])
df_gca3 = df_gca3.select('duns_gca2')

# Inner join acts as a filter: only sample rows whose duns1 appears among the resolved DUNS survive.
df_output = df_input.join(df_gca3, col('duns1') == col('duns_gca2'), how = 'inner')

df_output.write.mode('overwrite').parquet('/dbfs/FileStore/sahayk/mark_seiss_files/frb_atl_sampling0_202203_output.parquet')

# Re-read the file just written so the table below is built from the materialised parquet output.
df_output = spark.read.parquet('/dbfs/FileStore/sahayk/mark_seiss_files/frb_atl_sampling0_202203_output.parquet')

df_output.write.mode('overwrite').saveAsTable('workarea.frb_atl_sampling_202203')

# Stage the March 2020 contact job-title fields as a headerless CSV extract.
gca = spark.sql('select duns_number as duns, jobtitle, gca_jobtitlefunctionnames, gca_jobtitlelevelnames, gca_primaryjobfunctionname from workarea.global_contact where load_year = 2020 and load_month = 3')
gca.write.csv('/dbfs/FileStore/sahayk/us_export_propensity/us_export_propensity_job_title_data.csv', mode = 'overwrite')

# Load the extract back. With no header row Spark names the columns _c0.._c4,
# which are then mapped to readable names one at a time.
# NOTE(review): inferSchema re-derives every column type from the text, so `duns`
# may come back with a different type than it had in the source table -- confirm.
gca = spark.read.csv('/dbfs/FileStore/sahayk/us_export_propensity/us_export_propensity_job_title_data.csv', inferSchema = True, header = False)
for _positional_name, _readable_name in [
    ('_c0', 'duns'),
    ('_c1', 'job_title'),
    ('_c2', 'job_function'),
    ('_c3', 'job_title_level'),
    ('_c4', 'primary_job_function'),
]:
    gca = gca.withColumnRenamed(_positional_name, _readable_name)

# Constant marker; after a left join onto a modelling split it is null for DUNS with no contact row.
gca = gca.withColumn('match_ind', lit(1))

# Substrings searched for in the lower-cased, trimmed job title. A hit on any of
# them marks the contact as export-related.
# NOTE(review): the title is trimmed before matching, so a keyword with a leading
# or trailing space cannot match the first / last word of a title (for example
# 'global sales' does not contain ' global '). Matching is kept exactly as it was;
# confirm whether that is intended.
EXPORT_JOB_TITLE_KEYWORDS = [' carrier', ' distribut', ' countri', ' asia ', ' global ', ' deliveri', ' trade']


def flagExportJobTitles(split_df, contacts_df):
  """Attach contact job titles to a modelling split and flag export-related ones.

  Parameters
  ----------
  split_df : pyspark.sql.DataFrame
      Modelling split containing a ``duns`` column.
  contacts_df : pyspark.sql.DataFrame
      Contact rows with ``duns``, ``job_title`` and ``match_ind`` columns.

  Returns
  -------
  pyspark.sql.DataFrame
      ``split_df`` left-joined to ``contacts_df`` on ``duns`` (one row per
      split row and matching contact) with ``match_ind`` null-filled to 0,
      plus ``job_title_cleaned`` and the 0/1 ``export_job_title_ind``.
  """
  flagged = split_df.join(contacts_df, on = ['duns'], how = 'left')
  flagged = flagged.withColumn('match_ind', when(col('match_ind').isNull(), 0).otherwise(col('match_ind')))
  flagged = flagged.withColumn('job_title_cleaned', ltrim(rtrim(lower(col('job_title')))))

  # OR the per-keyword tests together in list order. A null title makes the
  # whole condition null, which `when` sends to the `otherwise` branch (0).
  any_keyword_hit = None
  for keyword in EXPORT_JOB_TITLE_KEYWORDS:
    keyword_hit = col('job_title_cleaned').contains(keyword)
    any_keyword_hit = keyword_hit if any_keyword_hit is None else (any_keyword_hit | keyword_hit)
  return flagged.withColumn('export_job_title_ind', when(any_keyword_hit, 1).otherwise(0))


def maxExportFlagPerDuns(flagged_df):
  """Collapse contact-level flags to one row per ``duns`` (1 if any contact was flagged).

  ``max`` here is the Spark aggregate brought in by the star import from
  ``pyspark.sql.functions``, not the Python builtin.
  """
  return flagged_df.groupBy('duns').agg(max('export_job_title_ind').alias('export_job_title_ind'))


# Each split goes through the same steps: contact-level flags (<split>2), then the
# per-DUNS flag joined back onto the untouched split (<split>3).
train = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_train')
train2 = flagExportJobTitles(train, gca)
export_job_title = maxExportFlagPerDuns(train2)
train3 = train.join(export_job_title, on = ['duns'], how = 'left')

val = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_val')
val2 = flagExportJobTitles(val, gca)
export_job_title = maxExportFlagPerDuns(val2)
val3 = val.join(export_job_title, on = ['duns'], how = 'left')

test = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_test')
test2 = flagExportJobTitles(test, gca)
export_job_title = maxExportFlagPerDuns(test2)
test3 = test.join(export_job_title, on = ['duns'], how = 'left')

# Persist each derived split, replacing any table left by an earlier run.
for _derived_frame, _target_table in (
    (train3, 'workarea.us_export_propensity_analytic_dataset_train_job_title_derived_attribute'),
    (val3, 'workarea.us_export_propensity_analytic_dataset_val_job_title_derived_attribute'),
    (test3, 'workarea.us_export_propensity_analytic_dataset_test_job_title_derived_attribute'),
):
    _derived_frame.write.saveAsTable(_target_table, mode = 'overwrite')

# Read the tables back so later steps work from the stored copies.
train4 = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_train_job_title_derived_attribute')
val4 = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_val_job_title_derived_attribute')
test4 = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_test_job_title_derived_attribute')

In [None]:
from sklearn.linear_model import LogisticRegression

# Columns used to line the derived flag up with the original split rows.
JOB_TITLE_JOIN_KEYS = ['duns', 'append_year', 'append_month', 'export']
JOB_TITLE_FEATURE = 'export_job_title_ind'


def attachJobTitleFlag(original_df, derived_df):
  """Left-join only the derived job-title flag onto an original split.

  The derived frame is first reduced to the join keys plus the flag and
  de-duplicated. Joining the full derived frame instead would bring back a
  second copy of every non-key column it shares with ``original_df``, and
  repeated key combinations would multiply rows.

  Parameters
  ----------
  original_df : pyspark.sql.DataFrame
      Split containing the ``JOB_TITLE_JOIN_KEYS`` columns.
  derived_df : pyspark.sql.DataFrame
      Frame containing the join keys and ``JOB_TITLE_FEATURE``.

  Returns
  -------
  pyspark.sql.DataFrame
      ``original_df`` with ``JOB_TITLE_FEATURE`` added (null where no key match).
  """
  flag_lookup = derived_df.select(JOB_TITLE_JOIN_KEYS + [JOB_TITLE_FEATURE]).dropDuplicates()
  return original_df.join(flag_lookup, on = JOB_TITLE_JOIN_KEYS, how = 'left')


train_original = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_train')
val_original = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_val')
test_original = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_test')

train_original2 = attachJobTitleFlag(train_original, train4)
val_original2 = attachJobTitleFlag(val_original, val4)
test_original2 = attachJobTitleFlag(test_original, test4)

# Collect each split to the driver as a pandas frame for scikit-learn.
train_original3 = train_original2.toPandas()
val_original3 = val_original2.toPandas()
test_original3 = test_original2.toPandas()

# Keep rows that received a flag. `.copy()` makes each result an independent frame,
# so the column assignments below do not write into a slice of the *_original3 frames.
train_original4 = train_original3[train_original3[JOB_TITLE_FEATURE].notna()].copy()
val_original4 = val_original3[val_original3[JOB_TITLE_FEATURE].notna()].copy()
test_original4 = test_original3[test_original3[JOB_TITLE_FEATURE].notna()].copy()

# Single-feature logistic regression, fitted on the training split only.
# The target is passed as a 1-d Series, the shape scikit-learn expects for y.
clf = LogisticRegression(random_state=0).fit(train_original4[[JOB_TITLE_FEATURE]], train_original4['export'])

# Score every split with the train-fitted model and add a unit weight column.
for _scored_frame in (train_original4, val_original4, test_original4):
  _scored_frame['predicted_export'] = clf.predict_proba(_scored_frame[[JOB_TITLE_FEATURE]])[:,1]
  _scored_frame['weight'] = 1

# `metrics` is not defined in this notebook; presumably it comes from the %run'd
# model_evaluation notebook -- confirm. KS_train is applied to all three splits.
ks = metrics(None, None, None, None, None, None)
ks_report_train = ks.KS_train(train_original4['export'], train_original4['predicted_export'], train_original4['weight'], bins = 10)
ks_report_val = ks.KS_train(val_original4['export'], val_original4['predicted_export'], val_original4['weight'], bins = 10)
ks_report_test = ks.KS_train(test_original4['export'], test_original4['predicted_export'], test_original4['weight'], bins = 10)

ks_report_train = formatKSReport(ks_report_train)
ks_report_val = formatKSReport(ks_report_val)
ks_report_test = formatKSReport(ks_report_test)

In [None]:
# Formatted KS reports: train first, then validation.
for _ks_table in (ks_report_train, ks_report_val):
    display(_ks_table)

# Mean of `export` within each flag value, train then validation.
for _derived_frame in (train4, val4):
    display(_derived_frame.groupBy('export_job_title_ind').agg(mean('export')))

# Row counts within each flag value, train then validation.
for _derived_frame in (train4, val4):
    display(_derived_frame.groupBy('export_job_title_ind').count())