In [1]:
import sys
sys.path.append("../src/")
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.feature import ChiSqSelector, VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.feature import QuantileDiscretizer
# create sparksession
spark = SparkSession \
    .builder \
    .appName("Pysparkexample") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
from config import *

In [2]:
pd.options.display.max_columns=999
pd.options.display.max_rows=999

In [3]:
df = spark.read.parquet("../data/census_income.parquet")

In [4]:
df_test = spark.read.csv("../data/census-income.test", header=False, inferSchema=True)

In [5]:
df_test = df_test.drop("_c24")

In [6]:
df_descr = pd.read_csv("../data/column_name.csv")

In [7]:
columns = list(df_descr["Column Name"])

In [8]:
df_test = df_test.toDF(*columns)

In [9]:
df_test.count(), len(df_test.columns)

(99762, 41)

In [10]:
def count_missings(spark_df,sort=True):
    df = spark_df.select([F.count(F.when(F.isnan(c) | F.isnull(c), c)).alias(c) for c in spark_df.columns]).toPandas()

    if len(df) == 0:
        print("There are no any missing values!")
        return None

    if sort:
        return df.rename(index={0: 'count'}).T.sort_values("count",ascending=False)

    return df

In [11]:
count_missings(df_test)

Unnamed: 0,count
age,0
state_prev_res,0
det_hh_summ,0
mig_chg_msa,0
mig_chg_reg,0
mig_move_reg,0
mig_same,0
mig_prev_sunbelt,0
num_emp,0
fam_under_18,0


In [12]:
dict_unique = pd.read_parquet("../reports/unique_values.parquet").set_index("Var").to_dict()["Unique"]

In [13]:
for col in ['age', 'wage_per_hour', 'capital_gains', 'capital_losses', 'stock_dividends', 'weeks_worked']:
    qd = QuantileDiscretizer(numBuckets=20, inputCol=col,outputCol=col + "_q").fit(df)
    df = qd.transform(df)
    df = df.drop(col)
    df_test = qd.transform(df_test)
    df_test = df_test.drop(col)

In [14]:
string_cols = [col for col, dtype in df.dtypes if dtype=="string"]
indexers = [StringIndexer(inputCol=col, outputCol=col+"_ind").fit(df) for col in string_cols]

In [15]:
pipeline = Pipeline(stages=indexers)
p = pipeline.fit(df)
df = p.transform(df)
df_test = p.transform(df_test)

In [16]:
for col in string_cols:
    df = df.drop(col)
    df_test = df_test.drop(col)

In [17]:
# put features into a feature vector column
assembler = VectorAssembler(inputCols=[col for col in df.columns if col != "income_50k_ind"], 
                            outputCol="features") 

In [18]:
df = assembler.transform(df)
df_test = assembler.transform(df_test)

In [19]:
selector = ChiSqSelector(featuresCol="features", selectorType="fdr",
                         outputCol="selectedFeatures", labelCol="income_50k_ind")
s = selector.fit(df)
df = s.transform(df)
df_test = s.transform(df_test)

In [20]:
ss = StandardScaler(inputCol="selectedFeatures", outputCol="stdSelectedFeatures")

In [21]:
ss = ss.fit(df)
df = ss.transform(df)
df_test = ss.transform(df_test)

In [22]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="income_50k_ind", featuresCol="stdSelectedFeatures", seed=101)

In [23]:
rf = rf.fit(df)

In [24]:
df = rf.transform(df)

In [25]:
df.limit(5).toPandas()

Unnamed: 0,det_ind_code,det_occ_code,num_emp,own_or_self,vet_benefits,year,age_q,wage_per_hour_q,capital_gains_q,capital_losses_q,stock_dividends_q,weeks_worked_q,class_worker_ind,education_ind,hs_college_ind,marital_stat_ind,major_ind_code_ind,major_occ_code_ind,race_ind,hisp_origin_ind,sex_ind,union_member_ind,unemp_reason_ind,full_or_part_emp_ind,tax_filer_stat_ind,region_prev_res_ind,state_prev_res_ind,det_hh_fam_stat_ind,det_hh_summ_ind,mig_chg_msa_ind,mig_chg_reg_ind,mig_move_reg_ind,mig_same_ind,mig_prev_sunbelt_ind,fam_under_18_ind,country_father_ind,country_mother_ind,country_self_ind,citizenship_ind,vet_question_ind,income_50k_ind,features,selectedFeatures,stdSelectedFeatures,rawPrediction,probability,prediction
0,37.0,38.0,2.0,0.0,2.0,95.0,16.0,1.0,1.0,1.0,2.0,5.0,1.0,5.0,0.0,2.0,7.0,9.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,"(37.0, 38.0, 2.0, 0.0, 2.0, 95.0, 16.0, 1.0, 1...","(37.0, 38.0, 2.0, 0.0, 2.0, 95.0, 16.0, 1.0, 2...","(2.047918095562956, 2.628992936581328, 0.84562...","[15.485695552035695, 4.514304447964303]","[0.7742847776017847, 0.22571522239821515]",0.0
1,0.0,0.0,0.0,0.0,0.0,94.0,4.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 94.0, 4.0, 1.0, 1.0,...","(0.0, 0.0, 0.0, 0.0, 0.0, 94.0, 4.0, 1.0, 1.0,...","(0.0, 0.0, 0.0, 0.0, 0.0, 187.99956939709733, ...","[19.603527372992687, 0.3964726270073165]","[0.9801763686496342, 0.01982363135036582]",0.0
2,0.0,0.0,0.0,0.0,0.0,94.0,3.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 94.0, 3.0, 1.0, 1.0,...","(0.0, 0.0, 0.0, 0.0, 0.0, 94.0, 3.0, 1.0, 1.0,...","(0.0, 0.0, 0.0, 0.0, 0.0, 187.99956939709733, ...","[19.603527372992687, 0.3964726270073165]","[0.9801763686496342, 0.01982363135036582]",0.0
3,0.0,0.0,0.0,0.0,2.0,94.0,17.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,3.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"(0.0, 0.0, 0.0, 0.0, 2.0, 94.0, 17.0, 1.0, 1.0...","(0.0, 0.0, 0.0, 0.0, 2.0, 94.0, 17.0, 1.0, 1.0...","(0.0, 0.0, 0.0, 0.0, 2.3488694944822215, 187.9...","[19.295450040276823, 0.7045499597231798]","[0.9647725020138409, 0.035227497986158984]",0.0
4,32.0,42.0,5.0,2.0,2.0,94.0,6.0,1.0,1.0,1.0,1.0,4.0,1.0,0.0,0.0,1.0,13.0,8.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,32.0,2.0,2.0,8.0,6.0,6.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[32.0, 42.0, 5.0, 2.0, 2.0, 94.0, 6.0, 1.0, 1....","[32.0, 42.0, 5.0, 2.0, 2.0, 94.0, 6.0, 1.0, 1....","[1.7711724069733672, 2.905729035168836, 2.1140...","[18.719835832618337, 1.2801641673816637]","[0.9359917916309168, 0.06400820836908319]",0.0


In [27]:
df_r = df.select("probability", "income_50k_ind").toPandas()

In [28]:
df_r["probability"] = df_r["probability"].map(lambda x: x[1])

In [29]:
from sklearn.metrics import roc_auc_score

In [30]:
roc_auc_score(df_r.income_50k_ind, df_r["probability"])

0.9065337913332858

In [32]:
df_test = rf.transform(df_test)

In [33]:
df_test_r = df_test.select("probability", "income_50k_ind").toPandas()

In [34]:
df_test_r["probability"] = df_test_r["probability"].map(lambda x: x[1])

In [35]:
roc_auc_score(df_test_r.income_50k_ind, df_test_r["probability"])

0.9064371856534519

In [37]:
rf.featureImportances

SparseVector(38, {0: 0.0027, 1: 0.1256, 2: 0.0807, 5: 0.0002, 6: 0.0174, 7: 0.0002, 8: 0.267, 9: 0.1187, 10: 0.0436, 11: 0.0987, 12: 0.0001, 13: 0.0151, 14: 0.0346, 15: 0.0422, 18: 0.0717, 22: 0.0543, 24: 0.0, 25: 0.0113, 26: 0.0153, 27: 0.0, 28: 0.0, 29: 0.0002, 31: 0.0001, 35: 0.0002, 37: 0.0001})

In [38]:
df.limit(1).toPandas()

Unnamed: 0,det_ind_code,det_occ_code,num_emp,own_or_self,vet_benefits,year,age_q,wage_per_hour_q,capital_gains_q,capital_losses_q,stock_dividends_q,weeks_worked_q,class_worker_ind,education_ind,hs_college_ind,marital_stat_ind,major_ind_code_ind,major_occ_code_ind,race_ind,hisp_origin_ind,sex_ind,union_member_ind,unemp_reason_ind,full_or_part_emp_ind,tax_filer_stat_ind,region_prev_res_ind,state_prev_res_ind,det_hh_fam_stat_ind,det_hh_summ_ind,mig_chg_msa_ind,mig_chg_reg_ind,mig_move_reg_ind,mig_same_ind,mig_prev_sunbelt_ind,fam_under_18_ind,country_father_ind,country_mother_ind,country_self_ind,citizenship_ind,vet_question_ind,income_50k_ind,features,selectedFeatures,stdSelectedFeatures,rawPrediction,probability,prediction
0,37.0,38.0,2.0,0.0,2.0,95.0,16.0,1.0,1.0,1.0,2.0,5.0,1.0,5.0,0.0,2.0,7.0,9.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,"(37.0, 38.0, 2.0, 0.0, 2.0, 95.0, 16.0, 1.0, 1...","(37.0, 38.0, 2.0, 0.0, 2.0, 95.0, 16.0, 1.0, 2...","(2.047918095562956, 2.628992936581328, 0.84562...","[15.485695552035695, 4.514304447964303]","[0.7742847776017847, 0.22571522239821515]",0.0


In [41]:
df.groupBy("det_occ_code").agg(F.mean("income_50k_ind")).orderBy("avg(income_50k_ind)", ascending=False).show()

+------------+-------------------+
|det_occ_code|avg(income_50k_ind)|
+------------+-------------------+
|         7.0| 0.6839945280437757|
|        11.0| 0.6593406593406593|
|         4.0|0.49780058651026393|
|         5.0| 0.3707602339181287|
|         6.0| 0.3492063492063492|
|         2.0|0.32217907720420286|
|        18.0| 0.2742382271468144|
|        17.0| 0.2715979672501412|
|         9.0| 0.2696476964769648|
|         1.0|0.24448529411764705|
|        46.0| 0.2222222222222222|
|        15.0|0.21226993865030674|
|         3.0|                0.2|
|        16.0| 0.1753265602322206|
|         8.0| 0.1724779172477917|
|        28.0| 0.1559301625526791|
|        21.0|0.15196998123827393|
|        12.0|0.13682634730538923|
|        14.0| 0.1298283261802575|
|        45.0|0.11627906976744186|
+------------+-------------------+
only showing top 20 rows



In [43]:
df.groupBy("age_q").agg(F.mean("income_50k_ind")).orderBy("avg(income_50k_ind)", ascending=False).show()

+-----+--------------------+
|age_q| avg(income_50k_ind)|
+-----+--------------------+
| 14.0| 0.16619108530945587|
| 15.0|  0.1610560408283443|
| 13.0|  0.1463922896587653|
| 12.0| 0.12883304940374787|
| 16.0|  0.1263007549479698|
| 11.0| 0.11556318126766249|
| 10.0| 0.09407529915361416|
| 17.0| 0.07812328315569718|
|  9.0| 0.07493327858755902|
| 18.0| 0.04201301048066498|
|  8.0| 0.04024487019612289|
| 19.0|0.029037935080172078|
|  7.0| 0.01779745234438522|
|  6.0|0.004914004914004914|
|  5.0|2.713949701465532...|
|  0.0|                 0.0|
|  1.0|                 0.0|
|  2.0|                 0.0|
|  4.0|                 0.0|
|  3.0|                 0.0|
+-----+--------------------+



In [45]:
df.groupBy("weeks_worked_q").agg(F.mean("income_50k_ind")).orderBy("avg(income_50k_ind)", ascending=False).show()

+--------------+--------------------+
|weeks_worked_q| avg(income_50k_ind)|
+--------------+--------------------+
|           5.0| 0.14800750917313765|
|           4.0| 0.07566097084077192|
|           3.0|  0.0361714621256606|
|           2.0|0.014776808619805027|
|           1.0|0.006381949065392294|
+--------------+--------------------+



In [46]:
df.groupBy("class_worker_ind").agg(F.mean("income_50k_ind")).orderBy("avg(income_50k_ind)", ascending=False).show()

+----------------+--------------------+
|class_worker_ind| avg(income_50k_ind)|
+----------------+--------------------+
|             5.0|  0.3473200612557427|
|             6.0|  0.2041025641025641|
|             2.0|  0.1290704558910598|
|             4.0| 0.11473858528507215|
|             3.0| 0.10881294964028777|
|             1.0| 0.10165491197867496|
|             0.0|0.009017906129981546|
|             8.0|0.006060606060606061|
|             7.0|0.004555808656036446|
+----------------+--------------------+

