# ML with PySpark

The primary machine learning API is contained in `pyspark.ml`. It is built on PySpark DataFrames. The older RDD-based API `pyspark.mllib` has been in maintenance mode since Spark 2.0: it is kept mainly for legacy purposes and may eventually be deprecated.

In [None]:
# Reload imported modules automatically before each cell runs, so edits to
# local helper code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inside the notebook output.
%matplotlib inline

import sys
# Put the parent directory on the import path; presumably that is where the
# `utils` module imported in the next cell lives.
sys.path.append('..')

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    StringIndexer,
    OneHotEncoder,
    VectorAssembler
)
from pyspark.ml.classification import LogisticRegression

from utils import download_textfile

## Start session

In [None]:
# Build the notebook's Spark session; getOrCreate() hands back an existing
# session when one is already running instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('Logistic Regression')
    .getOrCreate()
)

## Import data

In [None]:
# Remote location of the Titanic passenger CSV.
url = 'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'
# The local target is handed to `download_textfile` via `save_path`; the cell
# that reads the CSV uses the same path.
download_textfile(
    url,
    save_path='../data/titanic.csv',
)

In [None]:
# Load the CSV into a DataFrame: the first line supplies the column names and
# the column types are inferred from the data.
df = spark.read.csv(
    '../data/titanic.csv',
    header=True,
    inferSchema=True,
)

# Report the size of the dataset.
print(f'The data has {df.count()} rows and {len(df.columns)} columns')

In [None]:
# Preview the first 10 rows, then list every column with its inferred type.
df.show(n=10)
df.printSchema()

In [None]:
# Discard every row that contains a null/NaN in any column. This dataset has
# no missing values, so the call is a safeguard rather than a real filter.
df = df.dropna()

In [None]:
# Keep the label ('Survived') plus the columns used as model inputs.
# For this CSV that presumably amounts to dropping only the 'Name' column.
columns = [
    'Survived',
    'Pclass', 'Sex', 'Age',
    'Siblings/Spouses Aboard',
    'Parents/Children Aboard',
    'Fare',
]
df = df.select(*columns)

## Create pipeline

In [None]:
# create transformations
# Encode the categorical 'Sex' column: string -> numeric index -> one-hot vector.
sex_indexer = StringIndexer(inputCol='Sex', outputCol='SexIdx')
sex_encoder = OneHotEncoder(inputCol='SexIdx', outputCol='SexVec')

# Assemble the predictors into a single vector column. The label 'Survived'
# is deliberately NOT part of the inputs: feeding it in would leak the target
# into the features and make the model's predictions meaningless.
assembler = VectorAssembler(
    inputCols=[
        'Pclass',
        'SexVec',
        'Age',
        'Siblings/Spouses Aboard',
        'Parents/Children Aboard',
        'Fare',
    ],
    outputCol='Features'
)

In [None]:
# Logistic-regression classifier that reads the assembled 'Features' vector
# and predicts the 'Survived' label.
lr = LogisticRegression(
    labelCol='Survived',
    featuresCol='Features',
    maxIter=10,            # cap on optimizer iterations
    regParam=0.3,          # regularization strength
    elasticNetParam=0.8,   # L1/L2 mix: 0.0 is pure L2, 1.0 is pure L1
)

In [None]:
# Chain the steps in the order they must run: index 'Sex', one-hot encode the
# index, assemble the feature vector, then the classifier.
stages = [sex_indexer, sex_encoder, assembler, lr]
pipeline = Pipeline(stages=stages)

## Fit model

In [None]:
# split data
# 80 % training / 20 % test. The fixed seed keeps the random split, and with
# it the fitted model and the displayed results, repeatable across runs on
# the same input data.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

In [None]:
# fit model
# Fit the whole pipeline on the training split only; the result is the
# fitted pipeline used for prediction below.
model = pipeline.fit(train_df)

In [None]:
# apply to test set
# Run the held-out split through the fitted pipeline, then print the first
# rows of the resulting DataFrame.
results = model.transform(test_df)

results.show()

## Stop session

In [None]:
# stop session
# Shut down the Spark session created in the "Start session" cell.
spark.stop()