In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
import re

In [None]:
%run /users/sahayk@dnb.com/risk_ml_modeling_framework/model_evaluation

In [None]:
def formatKSReport(ks_report):
  """Return a display-ready copy of a 20-column KS report.

  The input frame is left untouched; all changes are made on a copy, so
  calling this twice on the same raw report gives the same result.

  Steps applied to the copy:
    * columns are renamed positionally to the 20 names below (pandas raises
      ``ValueError`` if the frame does not have exactly 20 columns);
    * the ``'<----'`` marker in ``max_ks`` is shortened to ``'<'``;
    * ``decile`` is shifted by +5;
    * ``cumulative_event``, ``cumulative_non_event`` and ``ks`` are divided
      by 100.

  Args:
    ks_report: pandas DataFrame with 20 columns, in the order of the names
      assigned below.

  Returns:
    A new pandas DataFrame with the renamed and rescaled columns.
  """
  report = ks_report.copy()
  report.columns = ['decile', 'min', 'max', 'event', 'non_event', 'total', 'event_rate', 'cumulative_event', 'cumulative_non_event', 'ks', 'max_ks', 'l_cumulative_event', 'cumulative_total', 'l_cumulative_total', 'area', 'cumulative_area', 'area_b', 'pi', 'gini', 'auc']

  # Series.replace does an exact whole-value match here, so only cells equal
  # to '<----' change. This avoids relying on `np`, which this file never
  # imports itself.
  report['max_ks'] = report['max_ks'].replace('<----', '<')

  # NOTE(review): the reason for the +5 offset is not visible in this file.
  report['decile'] = report['decile'] + 5

  # Presumably these three arrive as percentages and are converted to
  # fractions -- TODO confirm against the KS_train implementation.
  for scaled_column in ('cumulative_event', 'cumulative_non_event', 'ks'):
    report[scaled_column] = report[scaled_column] / 100

  return report

@pandas_udf(StringType(), PandasUDFType.SCALAR)
def cleanBusinessName(col):
    """Scalar pandas UDF: strip every character that is not a-z, 0-9 or a space.

    The pattern only keeps lower-case letters, so upper-case characters are
    removed as well; callers in this notebook lower-case the name first.

    Null entries are returned as null instead of raising ``TypeError`` from
    ``re.sub``, so the UDF does not depend on callers filtering nulls before
    it is evaluated.

    Args:
        col: pandas Series of business-name strings (one batch).

    Returns:
        pandas Series of the same length with the disallowed characters removed.
    """
    # Compile once per batch instead of once per row.
    disallowed = re.compile(r'[^a-z0-9 ]')
    return col.map(lambda name: None if pd.isnull(name) else disallowed.sub('', name))

In [None]:
# -----------------------------------------------------------------------------
# Derive `exp_bus_nme_ind` from the business name for train / val / test.
#
#   1  -> cleaned name contains at least one POSITIVE_NAME_KEYWORDS entry
#  -1  -> otherwise, contains at least one NEGATIVE_NAME_KEYWORDS entry
#   0  -> neither
#
# Rows without a business name are dropped before the indicator is computed.
# -----------------------------------------------------------------------------

ANALYTIC_DATASET_PREFIX = 'workarea.us_export_propensity_analytic_dataset_'
DERIVED_TABLE_SUFFIX = '_business_name_derived_attribute'
BUSINESS_NAME_JOIN_KEYS = ['duns', 'append_year', 'append_month']

# Save mode for the derived tables. 'error' is Spark's default: the write fails
# if the table already exists. Set to 'overwrite' to make this cell re-runnable.
DERIVED_TABLE_SAVE_MODE = 'error'

# Matching is plain substring containment, so the spaces inside each keyword are
# significant: a keyword is only found where that space is really present in the
# cleaned name (e.g. a name that starts with "church" is typically not matched by
# ' church'). NOTE(review): confirm this is the intended word-boundary behaviour.
NEGATIVE_NAME_KEYWORDS = [
    ' church', ' realti', ' properti', ' transport', ' store ', ' home ',
    ' care ', ' invest', ' construct', ' express', ' health ', ' school ',
    ' college ', ' university ',
]

# NOTE(review): any name matching ' university ' above also matches ' univers'
# here, and the positive rule takes precedence, so such names end up as 1.
POSITIVE_NAME_KEYWORDS = [
    ' solut', ' corp', ' american ', ' capit', ' inc', ' trade ', ' compani',
    ' incorp', ' product', ' advanc', ' global ', ' engin', ' machin',
    ' america ', ' usa ', ' export ', ' marin', ' corpor', ' aviat', ' tool ',
    ' wire ', ' energi', ' industri', ' econom', ' univers', ' system ',
    ' technolog', ' research ', ' commerc', ' worldwid', ' manufactur',
    ' precis', ' electron', ' aerospac', ' scientif', ' devic',
]


def _contains_any(column, keywords):
    """Return a Column that is true when `column` contains any of `keywords`."""
    checks = [column.contains(keyword) for keyword in keywords]
    combined = checks[0]
    for check in checks[1:]:
        combined = combined | check
    return combined


def _load_analytic_dataset(split):
    """Read the analytic dataset table for `split` ('train', 'val' or 'test')."""
    return spark.sql('select * from ' + ANALYTIC_DATASET_PREFIX + split)


def _attach_business_name(dataset, business_names):
    """Left-join the business name and add its trimmed, lower-cased form."""
    joined = dataset.join(business_names, on = BUSINESS_NAME_JOIN_KEYS, how = 'left')
    return joined.withColumn('business_name_cleaned', ltrim(rtrim(lower(col('business_name')))))


def _derive_name_indicator(named):
    """Drop rows without a name, clean the name and add `exp_bus_nme_ind`."""
    cleaned = (named
               .where('business_name_cleaned is not null')
               .withColumn('business_name_cleaned2', cleanBusinessName(col('business_name_cleaned'))))
    name = col('business_name_cleaned2')
    # Positive keywords are tested first so that they win over negative ones.
    indicator = (when(_contains_any(name, POSITIVE_NAME_KEYWORDS), 1)
                 .when(_contains_any(name, NEGATIVE_NAME_KEYWORDS), -1)
                 .otherwise(0))
    return cleaned.withColumn('exp_bus_nme_ind', indicator)


def _derived_table_name(split):
    """Name of the table that stores the derived attribute for `split`."""
    return ANALYTIC_DATASET_PREFIX + split + DERIVED_TABLE_SUFFIX


def _save_derived(derived, split):
    """Persist `derived` as the derived-attribute table for `split`."""
    derived.write.mode(DERIVED_TABLE_SAVE_MODE).saveAsTable(_derived_table_name(split))


def _load_indicator(split):
    """Read back only the join keys, the target and the derived indicator."""
    return spark.sql('select duns, append_year, append_month, export, exp_bus_nme_ind from ' + _derived_table_name(split))


# Business names for the September 2016 / 2017 loads, one row per join key.
# Which duplicate survives dropDuplicates is not controlled here.
gdmi = spark.sql('select duns_number as duns, business_name, load_year as append_year, load_month as append_month from workarea.gdmi3100 where load_year in (2016, 2017) and load_month = 9')
gdmi = gdmi.dropDuplicates(BUSINESS_NAME_JOIN_KEYS)

train = _load_analytic_dataset('train')
train2 = _attach_business_name(train, gdmi)

val = _load_analytic_dataset('val')
val2 = _attach_business_name(val, gdmi)

test = _load_analytic_dataset('test')
test2 = _attach_business_name(test, gdmi)

train3 = _derive_name_indicator(train2)
val3 = _derive_name_indicator(val2)
test3 = _derive_name_indicator(test2)

_save_derived(train3, 'train')
_save_derived(val3, 'val')
_save_derived(test3, 'test')

train4 = _load_indicator('train')
val4 = _load_indicator('val')
test4 = _load_indicator('test')

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a one-feature logistic regression (export ~ exp_bus_nme_ind) on the train
# split and score all three splits.

INDICATOR_JOIN_KEYS = ['duns', 'append_year', 'append_month', 'export']


def _keep_rows_with_indicator(frame):
    """Return an independent copy of the rows whose `exp_bus_nme_ind` is set.

    The explicit copy lets new columns be added later without pandas'
    SettingWithCopyWarning (the filtered result would otherwise be a slice).
    """
    return frame[frame['exp_bus_nme_ind'].notna()].copy()


train_original = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_train')
val_original = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_val')
test_original = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_test')

# Left join: rows with no derived indicator get a null and are dropped below.
train_original2 = train_original.join(train4, on = INDICATOR_JOIN_KEYS, how = 'left')
val_original2 = val_original.join(val4, on = INDICATOR_JOIN_KEYS, how = 'left')
test_original2 = test_original.join(test4, on = INDICATOR_JOIN_KEYS, how = 'left')

# NOTE: toPandas() collects every column of each split onto the driver.
train_original3 = train_original2.toPandas()
val_original3 = val_original2.toPandas()
test_original3 = test_original2.toPandas()

train_original4 = _keep_rows_with_indicator(train_original3)
val_original4 = _keep_rows_with_indicator(val_original3)
test_original4 = _keep_rows_with_indicator(test_original3)

# y is passed as a 1-D Series; a one-column DataFrame makes scikit-learn emit a
# DataConversionWarning before flattening it to the same values.
clf = LogisticRegression(random_state=0).fit(train_original4[['exp_bus_nme_ind']], train_original4['export'])

for scored in (train_original4, val_original4, test_original4):
    # Column 1 of predict_proba is the probability of clf.classes_[1].
    scored['predicted_export'] = clf.predict_proba(scored[['exp_bus_nme_ind']])[:, 1]
    # Every row gets the same weight of 1.
    scored['weight'] = 1

In [None]:
# Build and format one KS report per split. `metrics` comes from the notebook
# loaded with %run; all six of its constructor arguments are None here.
ks = metrics(*([None] * 6))

# KS_train is called for every split (train, val, test), each with 10 bins.
# All raw reports are computed first and only then formatted.
raw_ks_reports = [
    ks.KS_train(scored_split['export'], scored_split['predicted_export'], scored_split['weight'], bins = 10)
    for scored_split in (train_original4, val_original4, test_original4)
]

ks_report_train, ks_report_val, ks_report_test = [
    formatKSReport(raw_report) for raw_report in raw_ks_reports
]

In [None]:
# Render the formatted KS report for the train split.
display(ks_report_train)

In [None]:
# Render the formatted KS report for the val split.
display(ks_report_val)