# very basic ML pipeline
I think producing a random forest classifier that predicts survival on the Titanic dataset would be a good exercise.

In [None]:
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.sql import Row, SparkSession

In [None]:
# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Notebook display settings: evaluate DataFrames eagerly when a cell ends in
# one, and cap the rendered output at 5 rows.
for option, value in (
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.repl.eagerEval.maxNumRows", 5),
):
    spark.conf.set(option, value)

In [None]:
# Load the passenger table: the first line of the file supplies the column
# names and Spark infers each column's type from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../data/titanic.csv")
)

### create a pipeline to numerically encode all the useful data

In [None]:
# Feature-engineering pipeline, fitted on the full dataset. Despite the name,
# `pipeline` holds the fitted model returned by `.fit(df)`.
pipeline = Pipeline().setStages(
    [
        # One-hot encode the passenger class.
        OneHotEncoder().setInputCols(["Pclass"]).setOutputCols(["Pclass_ohe"]),
        # Sex is a string column: index it first, then one-hot encode the index.
        StringIndexer().setInputCol("Sex").setOutputCol("Sex_index"),
        OneHotEncoder().setInputCols(["Sex_index"]).setOutputCols(["Sex_ohe"]),
        # Pack the encoded and numeric columns into a single feature vector.
        VectorAssembler()
        .setInputCols(
            [
                "Pclass_ohe",
                "Sex_ohe",
                "Age",
                "Siblings/Spouses Aboard",
                "Parents/Children Aboard",
                "Fare",
            ]
        )
        .setOutputCol("features"),
    ]
).fit(df)

In [None]:
# Run the fitted stages over the data, then keep only the assembled feature
# vector and the label column. NOTE: this rebinds `df`, so re-running the cell
# without reloading the CSV operates on the already-transformed frame.
df = (
    pipeline
    .transform(df)
    .select("features", "Survived")
)

### stratified train/test split
It's important that each outcome class (survived / did not survive) appears in the same proportion in the train and test datasets.

In [None]:
# Class balance of the full dataset: number of rows per Survived value.
(
    df
    .groupBy("Survived")
    .count()
    .show()
)

In [None]:
# Stratified split: sample roughly 80% of each Survived class for training.
# sampleBy draws every row independently, so the per-class fraction is
# approximate rather than exact; the fixed seed keeps the draw repeatable.
df_train = df.sampleBy("Survived", fractions={0: 0.8, 1: 0.8}, seed=10)

# Everything that was not sampled becomes the test set. exceptAll is a
# multiset difference, so duplicate rows are preserved. The previous
# subtract() call is EXCEPT DISTINCT: it collapsed passengers with identical
# (features, Survived) values and dropped any held-out row that merely equalled
# some training row, silently shrinking the test set.
df_test = df.exceptAll(df_train)

In [None]:
# Row counts of the two partitions, displayed as a (train, test) tuple.
(
    df_train.count(),
    df_test.count(),
)

In [None]:
# Rows per Survived value in the training partition.
(
    df_train
    .groupBy("Survived")
    .count()
    .show()
)

In [None]:
# Rows per Survived value in the test partition; compare the ratio with the
# training partition to check that the split is stratified.
(
    df_test
    .groupBy("Survived")
    .count()
    .show()
)

## Random forest

In [None]:
# Fit a random forest on the training partition, leaving every hyperparameter
# other than the label and feature column names at its default.
classifier = (
    RandomForestClassifier()
    .setLabelCol("Survived")
    .setFeaturesCol("features")
    .fit(df_train)
)

In [None]:
# Score the held-out rows with the fitted random forest.
predictions = classifier.transform(dataset=df_test)

In [None]:
# Bare expression so the notebook renders the scored DataFrame; with eager
# evaluation enabled on the session, only the first few rows are shown.
predictions

### assess precision and accuracy

In [None]:
# Shared evaluator: compares the true Survived label with the model's
# `prediction` column. The metric is chosen per call at evaluation time.
evaluator = MulticlassClassificationEvaluator(
    labelCol="Survived",
    predictionCol="prediction",
)

In [None]:
# Weighted precision of the current predictions; the metric is overridden for
# this call only, leaving the evaluator itself unchanged.
evaluator.evaluate(
    predictions,
    params={evaluator.metricName: "weightedPrecision"},
)

In [None]:
# Accuracy of the current predictions; the metric is overridden for this call
# only, leaving the evaluator itself unchanged.
evaluator.evaluate(
    predictions,
    params={evaluator.metricName: "accuracy"},
)

## Try gradient boosting instead

In [None]:
# Fit gradient-boosted trees on the same training partition, capped at 10
# boosting iterations. This rebinds `classifier`, replacing the random forest.
classifier = (
    GBTClassifier()
    .setLabelCol("Survived")
    .setFeaturesCol("features")
    .setMaxIter(10)
    .fit(df_train)
)

In [None]:
# Score the held-out rows with the gradient-boosted model; this rebinds
# `predictions`, replacing the random forest's output.
predictions = classifier.transform(dataset=df_test)

In [None]:
# Bare expression so the notebook renders the scored DataFrame; with eager
# evaluation enabled on the session, only the first few rows are shown.
predictions

### assess precision and accuracy

In [None]:
# Weighted precision of the gradient-boosted predictions, for comparison with
# the random forest's score.
evaluator.evaluate(
    predictions,
    params={evaluator.metricName: "weightedPrecision"},
)

In [None]:
# Accuracy of the gradient-boosted predictions, for comparison with the random
# forest's score.
evaluator.evaluate(
    predictions,
    params={evaluator.metricName: "accuracy"},
)

# wrapping up

Note that this model could be _muuuuuuch_ better if I spent more time on the data preprocessing steps - this is an incredibly basic approach, but sophistication isn't the point of this project.