In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.clustering import KMeansModel
from sklearn.linear_model import LogisticRegression

In [None]:
%run /users/sahayk/risk_ml_modeling_framework/model_evaluation

In [None]:
def posteriorProbability(df, control):
  """Append prior/posterior count columns and a posterior probability to `df`.

  Expects `df` to carry the columns `state_sum`, `state_size`, `cluster_sum`
  and `cluster_size`. The state-level counts are divided by `control` to form
  the prior pseudo-counts (`pre_alpha`, `pre_beta`), the cluster-level counts
  are added on top (`post_alpha`, `post_beta`), and the result is
  `post_alpha / (post_alpha + post_beta)` -- the same form as the mean of a
  Beta posterior.

  Args:
    df: Spark DataFrame with the four count columns listed above.
    control: scalar divisor for the state-level counts; a larger value
      shrinks the prior's contribution relative to the cluster counts. It is
      also stored as a literal `control` column.

  Returns:
    `df` with the added columns `control`, `pre_alpha`, `pre_beta`,
    `post_alpha`, `post_beta` and `posterior_probability`.
  """
  state_non_events = col('state_size') - col('state_sum')
  posterior_total = col('post_alpha') + col('post_beta')
  return (
    df
    .withColumn('control', lit(control))
    .withColumn('pre_alpha', col('state_sum')/col('control'))
    .withColumn('pre_beta', state_non_events/col('control'))
    .withColumn('post_alpha', col('pre_alpha') + col('cluster_sum'))
    .withColumn('post_beta', col('pre_beta') + col('cluster_size') - col('cluster_sum'))
    .withColumn('posterior_probability', col('post_alpha')/posterior_total)
  )

In [None]:
def formatKSReport(ks_report):
  """Return a cleaned-up copy of a 20-column KS report.

  The input frame is left untouched; all changes are applied to a copy, so
  calling this twice on the same raw report gives the same result.

  Steps applied to the copy:
    * columns are renamed positionally to the 20 snake_case names below
      (pandas raises ValueError if the frame does not have exactly 20 columns);
    * the '<----' marker in `max_ks` is shortened to '<';
    * `decile` is shifted by +5;
    * `cumulative_event`, `cumulative_non_event` and `ks` are divided by 100
      (presumably converting percentages to fractions -- confirm against the
      producer of the report).

  Args:
    ks_report: pandas DataFrame with 20 columns in the order listed below.

  Returns:
    A new pandas DataFrame with the renamed and rescaled columns.
  """
  report = ks_report.copy()
  report.columns = ['decile', 'min', 'max', 'event', 'non_event', 'total', 'event_rate', 'cumulative_event', 'cumulative_non_event', 'ks', 'max_ks', 'l_cumulative_event', 'cumulative_total', 'l_cumulative_total', 'area', 'cumulative_area', 'area_b', 'pi', 'gini', 'auc']
  # Series.mask keeps this pandas-only; the notebook itself never imports numpy.
  report['max_ks'] = report['max_ks'].mask(report['max_ks'] == '<----', '<')
  report['decile'] = report['decile'] + 5
  for scaled_col in ('cumulative_event', 'cumulative_non_event', 'ks'):
    report[scaled_col] = report[scaled_col]/100
  return report

In [None]:
# --- Location lookup from SMAD -------------------------------------------------
# Only the September loads of 2016/2017 with a "+" latitude sign are read.
# Latitude/longitude are built by concatenating sign and measurement, casting
# to int and dividing by 1,000,000.
smad = spark.sql(
    'select duns_nbr as duns, alph_terr_code as state, '
    'geo_lat_sign, geo_lat_msmt, geo_long_sign, geo_long_msmt, '
    'cast(concat(geo_lat_sign, geo_lat_msmt) as int)/1000000 as latitude, '
    'cast(concat(geo_long_sign, geo_long_msmt) as int)/1000000 as longitude, '
    'load_year as append_year, load_month as append_month '
    'from workarea.smad '
    'where load_year in (2016, 2017) and load_month = 9 and geo_lat_sign = "+"'
)
# One location row per (duns, append_year, append_month).
smad2 = (
    smad
    .select('duns', 'state', 'latitude', 'longitude', 'append_year', 'append_month')
    .dropDuplicates(['duns', 'append_year', 'append_month'])
)

# --- Training records with usable coordinates ----------------------------------
train = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_train')
train2 = train.join(smad2, on = ['duns', 'append_year', 'append_month'], how = 'inner')
# Zero coordinates are excluded (presumably a placeholder for a missing geocode -- confirm).
train3 = train2.where('latitude <> 0 and longitude <> 0')

vecAssembler = VectorAssembler(inputCols = ["latitude", "longitude"], outputCol = "features")
train4 = vecAssembler.transform(train3)

# --- Fit and persist the 500-cluster KMeans model ------------------------------
kmeans = KMeans(k = 500, seed = 1)
model = kmeans.fit(train4.select('features'))

# Overwrite any previously saved model so the cell can be re-run, matching the
# mode('overwrite') used for every table written in this notebook. A plain
# model.save(...) raises when the target path already exists.
# NOTE(review): this path is handed to Spark, not to a local file API; confirm
# that the '/dbfs/...' prefix resolves to the intended storage location.
model.write().overwrite().save('/dbfs/FileStore/sahayk/us_export_propensity/pickles/us_export_propensity_analytic_dataset_train_cluster_model_k500_2.model')

# --- Assign clusters to the training records and store them --------------------
train5 = model.transform(train4)
train6 = (
    train5
    .select('duns', 'append_year', 'append_month', 'export', 'state', 'latitude', 'longitude', 'prediction')
    .withColumnRenamed('prediction', 'cluster_label')
)
train6.write.mode('overwrite').saveAsTable('workarea.us_export_propensity_analytic_dataset_train_location_cluster_k500')

# --- Validation and test records with usable coordinates -----------------------
# Same preparation as the training split: inner-join the SMAD location lookup
# (smad2) and drop rows whose latitude or longitude is 0.
val = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_val')
val2 = val.join(smad2, on = ['duns', 'append_year', 'append_month'], how = 'inner')
val3 = val2.where('latitude <> 0 and longitude <> 0')

test = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_test')
test2 = test.join(smad2, on = ['duns', 'append_year', 'append_month'], how = 'inner')
test3 = test2.where('latitude <> 0 and longitude <> 0')

# --- Reload the persisted KMeans model -----------------------------------------
# Only the saved model is needed to score the held-out splits. No new KMeans()
# estimator is created here: it was never used, and rebinding `kmeans` to a
# default-configured estimator would hide the k = 500 / seed = 1 one used for fitting.
model = KMeansModel.load('/dbfs/FileStore/sahayk/us_export_propensity/pickles/us_export_propensity_analytic_dataset_train_cluster_model_k500_2.model')

vecAssembler = VectorAssembler(inputCols = ["latitude", "longitude"], outputCol = "features")

# Columns kept in the stored cluster tables; 'prediction' is the cluster id
# column added by model.transform and is renamed to 'cluster_label'.
cluster_output_cols = ['duns', 'append_year', 'append_month', 'export', 'state', 'latitude', 'longitude', 'prediction']

# --- Validation split ----------------------------------------------------------
val4 = vecAssembler.transform(val3)
val5 = model.transform(val4)
val6 = val5.select(cluster_output_cols).withColumnRenamed('prediction', 'cluster_label')
val6.write.mode('overwrite').saveAsTable('workarea.us_export_propensity_analytic_dataset_val_location_cluster_k500')

# --- Test split ----------------------------------------------------------------
test4 = vecAssembler.transform(test3)
test5 = model.transform(test4)
test6 = test5.select(cluster_output_cols).withColumnRenamed('prediction', 'cluster_label')
test6.write.mode('overwrite').saveAsTable('workarea.us_export_propensity_analytic_dataset_test_location_cluster_k500')

# --- Build the (cluster, state) -> score mapping from the training split -------
train7 = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_train_location_cluster_k500')

# Export counts and record counts per (cluster, state) and per state.
# `sum` and `count` here are the Spark column functions from the star import.
cluster_group = train7.groupBy('cluster_label', 'state').agg(
    sum('export').alias('cluster_sum'),
    count('duns').alias('cluster_size'),
)
state_group = train7.groupBy('state').agg(
    sum('export').alias('state_sum'),
    count('duns').alias('state_size'),
)
cluster_state_group = (
    cluster_group
    .join(state_group, on = 'state', how = 'inner')
    .withColumn('cluster_export_rate', col('cluster_sum')/col('cluster_size'))
    .withColumn('state_export_rate', col('state_sum')/col('state_size'))
)

# State-level counts are divided by 100 before being combined with the cluster counts.
loc_cluster_score1 = posteriorProbability(cluster_state_group, 100).withColumnRenamed('posterior_probability', 'location_cluster_score1')

loc_cluster_score1.write.mode('overwrite').saveAsTable('workarea.us_export_propensity_location_cluster_score1_mapping_k500')

# --- Attach the score to each split --------------------------------------------
# Inner joins: records whose (cluster_label, state) pair is absent from the
# mapping are dropped from the stored tables.
score_join_keys = ['cluster_label', 'state']
score_lookup_cols = ['cluster_label', 'state', 'location_cluster_score1']

train8 = train7.join(loc_cluster_score1.select(score_lookup_cols), on = score_join_keys, how = 'inner')
train9 = train8

train9.write.mode('overwrite').saveAsTable('workarea.us_export_propensity_analytic_dataset_train_location_cluster_score_derived_attribute_k500')

# From here on the mapping is read back from the stored table.
loc_cluster_score1 = spark.sql('select * from workarea.us_export_propensity_location_cluster_score1_mapping_k500')

val7 = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_val_location_cluster_k500')

val8 = val7.join(loc_cluster_score1.select(score_lookup_cols), on = score_join_keys, how = 'inner')
val9 = val8

val9.write.mode('overwrite').saveAsTable('workarea.us_export_propensity_analytic_dataset_val_location_cluster_score_derived_attribute_k500')

test7 = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_test_location_cluster_k500')

test8 = test7.join(loc_cluster_score1.select(score_lookup_cols), on = score_join_keys, how = 'inner')
test9 = test8

test9.write.mode('overwrite').saveAsTable('workarea.us_export_propensity_analytic_dataset_test_location_cluster_score_derived_attribute_k500')

# --- Pull the stored score back onto the original train / val datasets ---------
train_df = spark.sql('select duns, location_cluster_score1, append_year, append_month, export from workarea.us_export_propensity_analytic_dataset_train_location_cluster_score_derived_attribute_k500')
val_df = spark.sql('select duns, location_cluster_score1, append_year, append_month, export from workarea.us_export_propensity_analytic_dataset_val_location_cluster_score_derived_attribute_k500')

train_original = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_train')
val_original = spark.sql('select * from workarea.us_export_propensity_analytic_dataset_val')

# Left joins keep every original record; records with no stored score get a
# null location_cluster_score1.
score_join_keys = ['duns', 'append_year', 'append_month', 'export']
train_original2 = train_original.join(train_df, on = score_join_keys, how = 'left')
val_original2 = val_original.join(val_df, on = score_join_keys, how = 'left')

# Collect to pandas for scikit-learn (this brings the full joined frames to the driver).
train_original3 = train_original2.toPandas()
val_original3 = val_original2.toPandas()

# Keep only scored records. .copy() makes each result an independent frame so
# the column assignments below do not write into a slice of the *_original3
# frames (which triggers pandas' SettingWithCopyWarning).
train_original4 = train_original3[train_original3['location_cluster_score1'].notna()].copy()
val_original4 = val_original3[val_original3['location_cluster_score1'].notna()].copy()

# --- Single-feature logistic regression on the location score ------------------
# X stays a one-column DataFrame (2-D); y is passed as a 1-D Series, which is
# the shape scikit-learn expects for the target.
clf = LogisticRegression(random_state=0).fit(train_original4[['location_cluster_score1']], train_original4['export'])

# Column 1 of predict_proba is the probability of clf.classes_[1]
# (presumably export == 1 -- confirm the label encoding).
train_original4['predicted_export'] = clf.predict_proba(train_original4[['location_cluster_score1']])[:,1]
val_original4['predicted_export'] = clf.predict_proba(val_original4[['location_cluster_score1']])[:,1]

# Every record gets a weight of 1.
train_original4['weight'] = 1
val_original4['weight'] = 1

In [None]:
# `metrics` is not defined in this notebook; presumably it comes from the
# %run'd model_evaluation notebook -- confirm. All six constructor arguments
# are passed as None.
ks = metrics(None, None, None, None, None, None)

# Build the 10-bin KS report for train, then validation. Both go through
# KS_train, exactly as in the original cell.
raw_ks_reports = [
    ks.KS_train(scored['export'], scored['predicted_export'], scored['weight'], bins = 10)
    for scored in (train_original4, val_original4)
]

# Format only after both raw reports exist, keeping the original call order.
ks_report_train, ks_report_val = (formatKSReport(raw_report) for raw_report in raw_ks_reports)

In [None]:
# Render the formatted KS report for the training split.
display(ks_report_train)

In [None]:
# Render the formatted KS report for the validation split.
display(ks_report_val)