In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from sklearn.linear_model import LogisticRegression

In [None]:
%run /users/sahayk/risk_ml_modeling_framework/model_evaluation

In [None]:
def formatKSReport(ks_report):
  """Return a tidied copy of a raw KS report.

  The input frame is left untouched; all changes are made on a copy so that
  calling this function again on the same raw report gives the same result.

  Args:
    ks_report: pandas DataFrame with exactly 20 columns, in the positional
      order listed in `column_names` below.

  Returns:
    A new DataFrame with the 20 columns renamed, the '<----' marker in
    'max_ks' shortened to '<', 'decile' shifted by 5, and
    'cumulative_event', 'cumulative_non_event' and 'ks' divided by 100.

  Raises:
    ValueError: raised by pandas when the input does not have 20 columns.
  """
  column_names = ['decile', 'min', 'max', 'event', 'non_event', 'total', 'event_rate', 'cumulative_event', 'cumulative_non_event', 'ks', 'max_ks', 'l_cumulative_event', 'cumulative_total', 'l_cumulative_total', 'area', 'cumulative_area', 'area_b', 'pi', 'gini', 'auc']

  # Work on a copy: the original version renamed and rescaled the caller's
  # frame in place, which made repeated calls on one object cumulative.
  report = ks_report.copy()
  report.columns = column_names

  # Shorten the arrow that flags the max-KS row.
  report['max_ks'] = np.where(report['max_ks'] == '<----', '<', report['max_ks'])

  # NOTE(review): the reason for the +5 offset on the decile label is not
  # visible in this notebook -- confirm against the KS_train output.
  report['decile'] = report['decile'] + 5

  # These three columns are divided by 100 (presumably percent -> fraction).
  for scaled_column in ('cumulative_event', 'cumulative_non_event', 'ks'):
    report[scaled_column] = report[scaled_column] / 100

  return report

In [None]:
# SIC lookup: one row per (duns, append_year, append_month), read from the
# September 2016 and 2017 loads of workarea.smad.
smad = spark.sql('select duns_nbr as duns, prim_sic_code as sic4, load_year as append_year, load_month as append_month from workarea.smad where load_year in (2016, 2017) and load_month = 9')
# NOTE(review): when a key has several rows, dropDuplicates keeps one of them;
# which sic4 survives is not controlled here.
smad2 = smad.select('duns', 'sic4', 'append_year', 'append_month').dropDuplicates(['duns', 'append_year', 'append_month'])


def _with_sic4(split_df):
  """Inner-join a modelling split to the SIC lookup `smad2`.

  Returns a pair: the joined frame, and the joined frame restricted to rows
  where sic4 > 0.
  """
  joined = split_df.join(smad2, on = ['duns', 'append_year', 'append_month'], how = 'inner')
  return joined, joined.where('sic4 > 0')


# The same load -> join -> filter sequence is applied to every split.
train = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_train')
train2, train3 = _with_sic4(train)

val = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_val')
val2, val3 = _with_sic4(val)

test = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_test')
test2, test3 = _with_sic4(test)

# (exclusive lower bound, inclusive upper bound, division code) on the 2-digit
# SIC. The ranges do not overlap, so one chained when() gives the same result
# as the former sequence of overwrites.
_SIC_DIVISIONS = [
  (0, 9, 1),    # Agriculture, Forestry, Fishing
  (9, 14, 2),   # Mining
  (14, 17, 3),  # Construction
  (19, 39, 4),  # Manufacturing
  (39, 49, 5),  # Transportation & Public Utilities
  (49, 51, 6),  # Wholesale Trade
  (51, 59, 7),  # Retail Trade
  (59, 67, 8),  # Finance, Insurance, Real Estate
  (69, 89, 9),  # Services
  (90, 97, 0),  # Public Administration
]
# Code given to every sic2 outside the ranges above (18-19, 68-69, 90, 98+)
# and to rows where sic2 is null.
_SIC_DIVISION_OTHER = 11


def _add_sic_division(split_df):
  """Add the 2-digit SIC ('sic2') and its division code ('sic2d') to a split.

  Args:
    split_df: Spark DataFrame that has a 'sic4' column.

  Returns:
    A new DataFrame with two extra integer columns:
      sic2  -- sic4 cast to int, divided by 100 and truncated to int.
      sic2d -- division code looked up from _SIC_DIVISIONS, or
               _SIC_DIVISION_OTHER when no range matches.
  """
  with_sic2 = split_df.withColumn('sic2', (split_df.sic4.cast(IntegerType()) / 100).cast('int'))

  # Build a single CASE WHEN expression instead of one withColumn per range;
  # every withColumn call adds a projection to the query plan.
  division = None
  for lower, upper, code in _SIC_DIVISIONS:
    in_range = (col('sic2') > lower) & (col('sic2') <= upper)
    division = when(in_range, code) if division is None else division.when(in_range, code)

  return with_sic2.withColumn('sic2d', division.otherwise(_SIC_DIVISION_OTHER))


# One definition of the mapping, applied identically to all three splits.
train3 = _add_sic_division(train3)
val3 = _add_sic_division(val3)
test3 = _add_sic_division(test3)

# Export totals and row counts at each SIC level, from the training split only.
# `sum` and `count` are the pyspark.sql.functions versions brought in by the
# star import at the top of the notebook.
sic2_group = train3.groupBy('sic2').agg(
  sum('export').alias('sic2_sum'),
  count('duns').alias('sic2_count'),
)
sic2d_group = train3.groupBy('sic2d').agg(
  sum('export').alias('sic2d_sum'),
  count('duns').alias('sic2d_count'),
)
sic4_group = train3.groupBy(['sic4', 'sic2', 'sic2d']).agg(
  sum('export').alias('sic4_sum'),
  count('duns').alias('sic4_count'),
)

# Put the sic2 and sic2d totals next to every sic4 row, then add the raw
# export rate at each level.
sic4_sic2_group = sic4_group.join(sic2_group, on = ['sic2'], how = 'inner')
sic4_sic2_sic2d_group = (
  sic4_sic2_group
  .join(sic2d_group, on = ['sic2d'], how = 'inner')
  .withColumn('sic4_export_rate', col('sic4_sum') / col('sic4_count'))
  .withColumn('sic2_export_rate', col('sic2_sum') / col('sic2_count'))
  .withColumn('sic2d_export_rate', col('sic2d_sum') / col('sic2d_count'))
)

# Smoothed SIC4 export rate.
#   pre_alpha0 / pre_beta0 : sic2d export / non-export counts divided by `control`
#   pre_alpha  / pre_beta  : sic2 counts plus the above, divided by `control` again
#   post_alpha / post_beta : the above plus the raw sic4 export / non-export counts
#   posterior_probability  : post_alpha / (post_alpha + post_beta)
df = (
  sic4_sic2_sic2d_group
  .withColumn('control', lit(100))
  .withColumn('pre_alpha0', col('sic2d_sum') / col('control'))
  .withColumn('pre_beta0', (col('sic2d_count') / col('control')) - (col('sic2d_sum') / col('control')))
  .withColumn('pre_alpha', (col('sic2_sum') + col('pre_alpha0')) / col('control'))
  .withColumn('pre_beta', (col('pre_beta0') + (col('sic2_count') - col('sic2_sum'))) / col('control'))
  .withColumn('post_alpha', col('pre_alpha') + col('sic4_sum'))
  .withColumn('post_beta', col('pre_beta') + col('sic4_count') - col('sic4_sum'))
  .withColumn('posterior_probability', col('post_alpha') / (col('post_alpha') + col('post_beta')))
)

# Slim lookup (keys + score) used to score the training rows.
sic4_score = df.select('sic4', 'sic2', 'sic2d', col('posterior_probability').alias('sic4_score'))

train4 = train3.join(sic4_score, on = ['sic4', 'sic2', 'sic2d'], how = 'inner')
train4.write.mode('overwrite').saveAsTable('workarea.us_export_propensity_analytic_dataset_train_sic4_score_derived_attribute')

# Persist the full mapping, intermediate columns included, under the score name.
df = df.withColumnRenamed('posterior_probability', 'sic4_score')
df.write.mode('overwrite').saveAsTable('workarea.us_export_propensity_sic4_score_mapping')

# Read the persisted mapping back and keep only the join keys and the score.
sic4_score = spark.sql('select * from workarea.us_export_propensity_sic4_score_mapping')
_score_keys = ['sic4', 'sic2', 'sic2d']
_score_lookup = sic4_score.select('sic4', 'sic2', 'sic2d', 'sic4_score')

# Inner join: validation/test rows whose (sic4, sic2, sic2d) is absent from
# the mapping are dropped here.
val4 = val3.join(_score_lookup, on = _score_keys, how = 'inner')
test4 = test3.join(_score_lookup, on = _score_keys, how = 'inner')

val4.write.mode('overwrite').saveAsTable('workarea.us_export_propensity_analytic_dataset_val_sic4_score_derived_attribute')
test4.write.mode('overwrite').saveAsTable('workarea.us_export_propensity_analytic_dataset_test_sic4_score_derived_attribute')

# Keys, target and derived score, read back from the tables written earlier.
train_df = spark.sql('select duns, sic4_score, append_year, append_month, export from workarea.us_export_propensity_analytic_dataset_train_sic4_score_derived_attribute')
val_df = spark.sql('select duns, sic4_score, append_year, append_month, export from workarea.us_export_propensity_analytic_dataset_val_sic4_score_derived_attribute')

train_original = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_train')
val_original = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_val')

# Left join keeps every original row; rows without a score get a null
# sic4_score and are removed after the conversion to pandas below.
_score_join_keys = ['duns', 'append_year', 'append_month', 'export']
train_original2 = train_original.join(train_df, on = _score_join_keys, how = 'left')
val_original2 = val_original.join(val_df, on = _score_join_keys, how = 'left')

# NOTE(review): this collects every column of both splits to the driver even
# though only 'sic4_score' and 'export' are used for the fit in this cell.
train_original3 = train_original2.toPandas()
val_original3 = val_original2.toPandas()

# Explicit .copy(): new columns are assigned to these frames below, and
# assigning onto a boolean-filtered slice raises SettingWithCopyWarning.
train_original4 = train_original3[train_original3['sic4_score'].notna()].copy()
val_original4 = val_original3[val_original3['sic4_score'].notna()].copy()

# Single-feature logistic model on the SIC4 score. The target is passed as a
# 1-D Series; a one-column DataFrame makes scikit-learn emit a
# DataConversionWarning about a column-vector y.
clf = LogisticRegression(random_state=0).fit(train_original4[['sic4_score']], train_original4['export'])

# Column 1 of predict_proba is the probability of the second class in
# clf.classes_ (assumes export is coded 0/1 -- TODO confirm).
train_original4['predicted_export'] = clf.predict_proba(train_original4[['sic4_score']])[:, 1]
val_original4['predicted_export'] = clf.predict_proba(val_original4[['sic4_score']])[:, 1]

# Unit weights; passed as the weight argument to the KS report in a later cell.
train_original4['weight'] = 1
val_original4['weight'] = 1

In [None]:
# `metrics` comes from the model_evaluation notebook pulled in with %run.
ks = metrics(None, None, None, None, None, None)

# Raw 10-bin KS tables first (train, then validation), formatting afterwards,
# so the calls on `ks` happen in the same order as before.
_raw_ks_reports = [
  ks.KS_train(scored['export'], scored['predicted_export'], scored['weight'], bins = 10)
  for scored in (train_original4, val_original4)
]
ks_report_train, ks_report_val = [formatKSReport(raw_report) for raw_report in _raw_ks_reports]

In [None]:
# Show the formatted KS report for the training split.
display(ks_report_train)

In [None]:
# Show the formatted KS report for the validation split.
display(ks_report_val)