In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [None]:
# Exporter records to profile. The join at the end of this cell relies on this
# table exposing duns, append_year and append_month columns.
exporters = spark.sql('select * from workarea.us_export_propensity_exporters')

# Reference population: the September loads of 2016 and 2017 from workarea.smad,
# with columns renamed so they line up with the exporters join keys.
smad = spark.sql('select duns_nbr as duns, alph_terr_code as state, prim_sic_code as sic4, tot_ann_sls_amt as sales, tot_emp_cnt as total_employees, emp_at_locn_cnt as employees_at_location, load_year as append_year, load_month as append_month from workarea.smad where load_year in (2016, 2017) and load_month = 9')

# 2-digit SIC = leading two digits of the 4-digit code. Spark's `/` is true
# division and returns a double (1799 -> 17.99), so the quotient is truncated
# back to an integer; otherwise codes such as 17xx miss their range and 09xx
# land in the next division up.
smad = smad.withColumn('sic2', (smad.sic4.cast(IntegerType()) / 100).cast(IntegerType()))

# Map the 2-digit SIC onto a division code. The ranges are disjoint, so a
# single CASE-style chain is equivalent to rewriting the column once per range.
# Any sic2 not listed (0, 18-19, 68-69, 90, 98+, or NULL) gets the default 11.
smad = smad.withColumn(
    'sic2d',
    when(col('sic2').between(1, 9), 1)        # Agriculture, Forestry, Fishing
    .when(col('sic2').between(10, 14), 2)     # Mining
    .when(col('sic2').between(15, 17), 3)     # Construction
    .when(col('sic2').between(20, 39), 4)     # Manufacturing
    .when(col('sic2').between(40, 49), 5)     # Transportation & Public Utilities
    .when(col('sic2').between(50, 51), 6)     # Wholesale Trade
    .when(col('sic2').between(52, 59), 7)     # Retail Trade
    .when(col('sic2').between(60, 67), 8)     # Finance, Insurance, Real Estate
    .when(col('sic2').between(70, 89), 9)     # Services
    .when(col('sic2').between(91, 97), 0)     # Public Administration
    .otherwise(lit(11))
)

# Left join keeps every exporter row; exporters without a matching smad record
# for the same duns / year / month end up with NULL firmographic columns.
exporters = exporters.join(smad, on = ['duns', 'append_year', 'append_month'], how = 'left')

In [None]:
# Number of exporter rows in each SIC division code (sic2d).
display(
    exporters
    .groupBy('sic2d')
    .count()
)

In [None]:
# Number of smad rows in each SIC division code (sic2d), for comparison
# with the exporter distribution.
display(
    smad
    .groupBy('sic2d')
    .count()
)

In [None]:
# Number of exporter rows per state value.
display(
    exporters
    .groupBy('state')
    .count()
)

In [None]:
# Number of smad rows per state value.
display(
    smad
    .groupBy('state')
    .count()
)

In [None]:
# Force the 2-digit SIC to an integer on both frames so the per-code
# aggregates below group and join on the same key type.
exporters = exporters.withColumn('sic2', col('sic2').cast(IntegerType()))
smad = smad.withColumn('sic2', col('sic2').cast(IntegerType()))

# Exporter rows per 2-digit SIC; the renamed aggregate feeds the join, the
# displayed one keeps the default 'count' column name.
exporters_sic2 = (
    exporters
    .groupBy('sic2')
    .count()
    .withColumnRenamed('count', 'exporters_count')
)
display(
    exporters
    .groupBy('sic2')
    .count()
)

# Same aggregation for the smad reference population.
smad_sic2 = (
    smad
    .groupBy('sic2')
    .count()
    .withColumnRenamed('count', 'smad_count')
)
display(
    smad
    .groupBy('sic2')
    .count()
)

# Inner join: only sic2 values present in both aggregates are retained.
exporters_smad_sic2 = exporters_sic2.join(smad_sic2, on='sic2', how='inner')

In [None]:
# Exporter and smad counts side by side, sorted by 2-digit SIC.
display(
    exporters_smad_sic2
    .orderBy('sic2')
)

In [None]:
# Attach population totals, convert the per-SIC2 counts into shares of those
# totals, and take the ratio of the two shares as 'relativity'.
# The *_percent columns are fractions (count / total), not multiplied by 100.
# NOTE(review): the totals are hard-coded; presumably they were the row counts
# of `exporters` and `smad` when this was written -- TODO confirm, and consider
# deriving them from the data so the cell stays valid when the tables change.
exporters_smad_sic2 = (
    exporters_smad_sic2
    .withColumn('exporters_total', lit(23108))
    .withColumn('smad_total', lit(44381508))
    .withColumn('exporters_percent', col('exporters_count') / col('exporters_total'))
    .withColumn('smad_percent', col('smad_count') / col('smad_total'))
    .withColumn('relativity', col('exporters_percent') / col('smad_percent'))
)

In [None]:
# The five 2-digit SIC codes with the highest relativity.
display(
    exporters_smad_sic2
    .orderBy(col('relativity').desc())
    .limit(5)
)

In [None]:
# The five 2-digit SIC codes with the lowest relativity.
display(
    exporters_smad_sic2
    .orderBy(col('relativity').asc())
    .limit(5)
)

In [None]:
def _employee_size_group(column_name):
    """Build the employee-count bucket expression for one headcount column.

    Args:
        column_name: name of the column holding the employee count.

    Returns:
        A Column that evaluates to one of the labels '01: 1 - 9' through
        '06: 500+', or '07: N/A' when the value matches no bucket (NULL,
        below 1, or falling between two bucket edges).
    """
    headcount = col(column_name)
    return (
        when((headcount >= 1) & (headcount <= 9), '01: 1 - 9')
        .when((headcount >= 10) & (headcount <= 49), '02: 10 - 49')
        .when((headcount >= 50) & (headcount <= 99), '03: 50 - 99')
        .when((headcount >= 100) & (headcount <= 249), '04: 100 - 249')
        .when((headcount >= 250) & (headcount <= 499), '05: 250 - 499')
        .when(headcount >= 500, '06: 500+')
        .otherwise('07: N/A')
    )


# Both headcount columns share the same bucket edges and labels, so they go
# through the same helper instead of two copies of the expression.
exporters = exporters.withColumn('total_employees_group', _employee_size_group('total_employees'))
exporters = exporters.withColumn('employees_at_location_group', _employee_size_group('employees_at_location'))

# Sales buckets are half-open on the left: (0, 50000], (50000, 100000], ...
# NULL, zero and negative sales fall through to '07: N/A'.
exporters = exporters.withColumn(
    'sales_group',
    when((col('sales') > 0) & (col('sales') <= 50000), '01:0-50000')
    .when((col('sales') > 50000) & (col('sales') <= 100000), '02:50000-100000')
    .when((col('sales') > 100000) & (col('sales') <= 250000), '03:100000-250000')
    .when((col('sales') > 250000) & (col('sales') <= 1000000), '04:250000-1000000')
    .when((col('sales') > 1000000) & (col('sales') <= 5000000), '05:1000000-5000000')
    .when(col('sales') > 5000000, '06:5000000+')
    .otherwise('07: N/A')
)

In [None]:
# Exporter rows per total-employees bucket; the numeric label prefix makes
# the alphabetical sort follow bucket order.
display(
    exporters
    .groupBy('total_employees_group')
    .count()
    .orderBy('total_employees_group')
)

In [None]:
# Exporter rows per employees-at-location bucket, in bucket order.
display(
    exporters
    .groupBy('employees_at_location_group')
    .count()
    .orderBy('employees_at_location_group')
)

In [None]:
# Exporter rows per sales bucket, in bucket order.
display(
    exporters
    .groupBy('sales_group')
    .count()
    .orderBy('sales_group')
)

In [None]:
# Cross-tabulation of exporter rows by total-employees bucket and sales
# bucket, sorted by employee bucket first and sales bucket second.
display(
    exporters
    .groupBy('total_employees_group', 'sales_group')
    .count()
    .orderBy('total_employees_group', 'sales_group')
)