In [45]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .master("local") \
        .appName("Spark ML Quiz1") \
        .config("spark.ui.port", "4050") \
        .getOrCreate()

sc = spark.sparkContext

In [46]:
from pyspark.sql.types import *

schema = StructType([
    StructField('age', IntegerType(), True),
    StructField('workclass', StringType(), True),
    StructField('fnlwgt', IntegerType(), True),
    StructField('education', StringType(), True),
    StructField('educational-num', IntegerType(), True),
    StructField('marital-status', StringType(), True),
    StructField('occupation', StringType(), True),
    StructField('relationship', StringType(), True),
    StructField('race', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('capital-gain', IntegerType(), True),
    StructField('capital-loss', IntegerType(), True),
    StructField('hours-per-week', IntegerType(), True),
    StructField('native-country', StringType(), True),
    StructField('income', StringType(), True),
])

data = spark.read.format('csv').option('header', 'true').schema(schema).load('../../data/input5/income_data.csv')
data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)



In [47]:
data.select('age').describe().show()

+-------+------------------+
|summary|               age|
+-------+------------------+
|  count|             32561|
|   mean| 38.58164675532078|
| stddev|13.640432553581356|
|    min|                17|
|    max|                90|
+-------+------------------+



In [48]:
prep_cols = ['age', 'workclass', 'fnlwgt', 'education', 'educational-num', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
data.rdd.map(lambda row: 1 if sum([row[c] == ' ?' for c in prep_cols]) > 0 else 0)\
        .reduce(lambda x, y: x + y)

                                                                                

2399

In [49]:
import pyspark.sql.functions as fn

for c in prep_cols:
    data = data.withColumn(c, fn.when(fn.col(c) == ' ?', None).otherwise(fn.col(c)))

data.show(5)

+---+-----------------+------+----------+---------------+-------------------+------------------+--------------+------+-------+------------+------------+--------------+--------------+------+
|age|        workclass|fnlwgt| education|educational-num|     marital-status|        occupation|  relationship|  race| gender|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+-----------------+------+----------+---------------+-------------------+------------------+--------------+------+-------+------------+------------+--------------+--------------+------+
| 39|        State-gov| 77516| Bachelors|             13|      Never-married|      Adm-clerical| Not-in-family| White|   Male|        2174|           0|            40| United-States| <=50K|
| 50| Self-emp-not-inc| 83311| Bachelors|             13| Married-civ-spouse|   Exec-managerial|       Husband| White|   Male|           0|           0|            13| United-States| <=50K|
| 38|          Private|215646|   HS-grad|         

In [50]:
data.select(*[(fn.count(c) / fn.count('*')).alias(c + '_missing') for c in prep_cols]).show()

+-----------+-----------------+--------------+-----------------+-----------------------+----------------------+------------------+--------------------+------------+--------------+--------------------+--------------------+----------------------+----------------------+--------------+
|age_missing|workclass_missing|fnlwgt_missing|education_missing|educational-num_missing|marital-status_missing|occupation_missing|relationship_missing|race_missing|gender_missing|capital-gain_missing|capital-loss_missing|hours-per-week_missing|native-country_missing|income_missing|
+-----------+-----------------+--------------+-----------------+-----------------------+----------------------+------------------+--------------------+------------+--------------+--------------------+--------------------+----------------------+----------------------+--------------+
|        1.0|0.943613525383127|           1.0|              1.0|                    1.0|                   1.0|0.9433985442707533|                 1.0|

In [51]:
data = data.dropna()
data.count()

30162

In [52]:
from pyspark.ml.feature import StringIndexer

string_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']
string_cols_numeric = ['workclass_numeric', 'education_numeric', 'marital-status_numeric', 'occupation_numeric', 'relationship_numeric', 'race_numeric', 'gender_numeric', 'native-country_numeric', 'income_numeric']

indexer = StringIndexer(inputCols=string_cols, outputCols=string_cols_numeric)
indexer_fitted = indexer.fit(data)
data_indexed = indexer_fitted.transform(data)
data_indexed.select(*string_cols_numeric).show(5)

+-----------------+-----------------+----------------------+------------------+--------------------+------------+--------------+----------------------+--------------+
|workclass_numeric|education_numeric|marital-status_numeric|occupation_numeric|relationship_numeric|race_numeric|gender_numeric|native-country_numeric|income_numeric|
+-----------------+-----------------+----------------------+------------------+--------------------+------------+--------------+----------------------+--------------+
|              3.0|              2.0|                   1.0|               3.0|                 1.0|         0.0|           0.0|                   0.0|           0.0|
|              1.0|              2.0|                   0.0|               2.0|                 0.0|         0.0|           0.0|                   0.0|           0.0|
|              0.0|              0.0|                   2.0|               8.0|                 1.0|         0.0|           0.0|                   0.0|           0.0

In [53]:
from pyspark.ml.feature import OneHotEncoder

string_cols_onehot = ['workclass_onehot', 'education_onehot', 'marital-status_onehot', 'occupation_onehot', 'relationship_onehot', 'race_onehot', 'gender_onehot', 'native-country_onehot']
string_cols_numeric.remove('income_numeric')

encoder = OneHotEncoder(inputCols=string_cols_numeric, outputCols=string_cols_onehot)
data_onehot = encoder.fit(data_indexed).transform(data_indexed)
data_onehot.show(5)

+---+-----------------+------+----------+---------------+-------------------+------------------+--------------+------+-------+------------+------------+--------------+--------------+------+-----------------+-----------------+----------------------+------------------+--------------------+------------+--------------+----------------------+--------------+----------------+----------------+---------------------+-----------------+-------------------+-------------+-------------+---------------------+
|age|        workclass|fnlwgt| education|educational-num|     marital-status|        occupation|  relationship|  race| gender|capital-gain|capital-loss|hours-per-week|native-country|income|workclass_numeric|education_numeric|marital-status_numeric|occupation_numeric|relationship_numeric|race_numeric|gender_numeric|native-country_numeric|income_numeric|workclass_onehot|education_onehot|marital-status_onehot|occupation_onehot|relationship_onehot|  race_onehot|gender_onehot|native-country_onehot|
+-

In [54]:
int_cols = ['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']

data_onehot = data_onehot.select(*(string_cols_onehot + int_cols + ['income_numeric']))
data_onehot.columns

['workclass_onehot',
 'education_onehot',
 'marital-status_onehot',
 'occupation_onehot',
 'relationship_onehot',
 'race_onehot',
 'gender_onehot',
 'native-country_onehot',
 'age',
 'fnlwgt',
 'educational-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'income_numeric']

In [55]:
cols = data_onehot.columns
cols.remove('income_numeric')

In [68]:
cols

['workclass_onehot',
 'education_onehot',
 'marital-status_onehot',
 'occupation_onehot',
 'relationship_onehot',
 'race_onehot',
 'gender_onehot',
 'native-country_onehot',
 'age',
 'fnlwgt',
 'educational-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

In [56]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=cols, outputCol='features')
data = assembler.transform(data_onehot)
data.select('features').show(5, truncate=False)

+-----------------------------------------------------------------------------------------------------------+
|features                                                                                                   |
+-----------------------------------------------------------------------------------------------------------+
|(96,[3,8,22,30,41,45,49,50,90,91,92,93,95],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,39.0,77516.0,13.0,2174.0,40.0])|
|(96,[1,8,21,29,40,45,49,50,90,91,92,95],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,50.0,83311.0,13.0,13.0])          |
|(96,[0,6,23,35,41,45,49,50,90,91,92,95],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,38.0,215646.0,9.0,40.0])          |
|(96,[0,11,21,35,40,46,49,50,90,91,92,95],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,53.0,234721.0,7.0,40.0])         |
|(96,[0,8,21,27,44,46,58,90,91,92,95],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,28.0,338409.0,13.0,40.0])                |
+-----------------------------------------------------------------------------------------------------------+
only showi

In [63]:
from pyspark.ml.feature import StandardScaler

standard_scaler = StandardScaler(inputCol='features', outputCol='scaled_features', withStd=True, withMean=False)
data_scaled = standard_scaler.fit(data).transform(data)
data_scaled.select('features', 'scaled_features').show(5)

+--------------------+--------------------+
|            features|     scaled_features|
+--------------------+--------------------+
|(96,[3,8,22,30,41...|(96,[3,8,22,30,41...|
|(96,[1,8,21,29,40...|(96,[1,8,21,29,40...|
|(96,[0,6,23,35,41...|(96,[0,6,23,35,41...|
|(96,[0,11,21,35,4...|(96,[0,11,21,35,4...|
|(96,[0,8,21,27,44...|(96,[0,8,21,27,44...|
+--------------------+--------------------+
only showing top 5 rows



In [64]:
train, test = data_scaled.randomSplit([0.8, 0.2], seed=37)
train.count(), test.count()

                                                                                

(24218, 5944)

In [65]:
from pyspark.ml.classification import LinearSVC

lr = LinearSVC(labelCol='income_numeric', featuresCol='features', maxIter=100)
model = lr.fit(train)
predict_train = model.transform(train)
predict_test = model.transform(test)
predict_test.select('income_numeric', 'prediction').show()

[Stage 606:>                                                        (0 + 1) / 1]

+--------------+----------+
|income_numeric|prediction|
+--------------+----------+
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           1.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           1.0|       0.0|
|           0.0|       1.0|
+--------------+----------+
only showing top 20 rows



                                                                                

In [66]:
predict_test.rdd.map(lambda row: 1 if row['income_numeric'] == row['prediction'] else 0).reduce(lambda x, y: x + y)

                                                                                

4975

In [67]:
predict_test.rdd.map(lambda row: 1 if row['income_numeric'] == row['prediction'] else 0).reduce(lambda x, y: x + y) / predict_test.count()

                                                                                

0.8369784656796769

In [None]:
sc.stop()