In [52]:
## Import Libraries
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, StringIndexer
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

## Set seed
seed = 42

In [39]:
## Create Spark Session
spark = SparkSession.builder.appName('logRegCodeAlong').getOrCreate()

In [40]:
## Setup Schema
schema = StructType(fields=[StructField('passenger_id', IntegerType(), True),
                            StructField('survived', IntegerType(), True),
                            StructField('p_class', IntegerType(), True),
                            StructField('name', StringType(), True),
                            StructField('sex', StringType(), True),
                            StructField('age', DoubleType(), True),
                            StructField('sib_sp', IntegerType(), True),
                            StructField('parch', IntegerType(), True),
                            StructField('ticket', StringType(), True),
                            StructField('fare', DoubleType(), True),
                            StructField('cabin', StringType(), True),
                            StructField('embarked', StringType(), True)])

In [41]:
## Load Data
df = spark.read.csv('gs://spark-training-data/datasets/titanic.csv', header=True, inferSchema=False, schema=schema)
df.show(5)
df.printSchema() # Confirm proper schema

+------------+--------+-------+--------------------+------+----+------+-----+----------------+-------+-----+--------+
|passenger_id|survived|p_class|                name|   sex| age|sib_sp|parch|          ticket|   fare|cabin|embarked|
+------------+--------+-------+--------------------+------+----+------+-----+----------------+-------+-----+--------+
|           1|       0|      3|Braund, Mr. Owen ...|  male|22.0|     1|    0|       A/5 21171|   7.25| null|       S|
|           2|       1|      1|Cumings, Mrs. Joh...|female|38.0|     1|    0|        PC 17599|71.2833|  C85|       C|
|           3|       1|      3|Heikkinen, Miss. ...|female|26.0|     0|    0|STON/O2. 3101282|  7.925| null|       S|
|           4|       1|      1|Futrelle, Mrs. Ja...|female|35.0|     1|    0|          113803|   53.1| C123|       S|
|           5|       0|      3|Allen, Mr. Willia...|  male|35.0|     0|    0|          373450|   8.05| null|       S|
+------------+--------+-------+--------------------+----

In [42]:
## Subset data for modeling
df_sub = df.select(['survived', 'p_class', 'sex', 'age', 'sib_sp', 'parch', 'fare', 'embarked'])

In [43]:
## Drop cols with missing data - Total records drop from 891 to 712
df_sub_final = df_sub.na.drop()
df_sub_final.show(5)

+--------+-------+------+----+------+-----+-------+--------+
|survived|p_class|   sex| age|sib_sp|parch|   fare|embarked|
+--------+-------+------+----+------+-----+-------+--------+
|       0|      3|  male|22.0|     1|    0|   7.25|       S|
|       1|      1|female|38.0|     1|    0|71.2833|       C|
|       1|      3|female|26.0|     0|    0|  7.925|       S|
|       1|      1|female|35.0|     1|    0|   53.1|       S|
|       0|      3|  male|35.0|     0|    0|   8.05|       S|
+--------+-------+------+----+------+-----+-------+--------+
only showing top 5 rows



In [44]:
## Gender Indexer & Encoder
gender_indexer = StringIndexer(inputCol='sex', outputCol='sex_index')
gender_encoder = OneHotEncoder(inputCol='sex_index', outputCol='sex_vector')

## Embark Encoder
embark_indexer = StringIndexer(inputCol='embarked', outputCol='embarked_index')
embark_encoder = OneHotEncoder(inputCol='embarked_index', outputCol='embarked_vector')

In [45]:
## Build Assembler - Remember to use vector columns rather than sex / embarked or indexes
assembler = VectorAssembler(inputCols=['p_class', 'sex_vector', 'age', 'sib_sp',
                                       'parch', 'fare', 'embarked_vector'],
                           outputCol='features')

In [49]:
## Build Pipeline
log_reg_titanic = LogisticRegression(featuresCol='features', labelCol='survived')

pipeline = Pipeline(stages=[gender_indexer, embark_indexer,
                            gender_encoder, embark_encoder,
                            assembler, log_reg_titanic])

In [50]:
## Generate Train / Test data
train_data, test_data = df_sub_final.randomSplit([0.7,0.3], seed=seed)

In [51]:
## Run the model
fit_model = pipeline.fit(train_data)

21/11/23 21:30:24 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/11/23 21:30:24 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


In [54]:
## Create Binary Classifier
results = fit_model.transform(test_data)
results_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='survived')

In [57]:
## View Results
results.select(['features', 'survived', 'rawPrediction', 'probability', 'prediction']).show(5)

+--------------------+--------+--------------------+--------------------+----------+
|            features|survived|       rawPrediction|         probability|prediction|
+--------------------+--------+--------------------+--------------------+----------+
|(8,[0,2,5,7],[1.0...|       0|[-2.7549899175470...|[0.05980545621100...|       1.0|
|[1.0,1.0,21.0,0.0...|       0|[-0.8823600456852...|[0.29268895828188...|       1.0|
|[1.0,1.0,24.0,0.0...|       0|[-1.3741730199859...|[0.20194646889313...|       1.0|
|[1.0,1.0,24.0,0.0...|       0|[-1.5735624950255...|[0.17170912279073...|       1.0|
|[1.0,1.0,29.0,0.0...|       0|[-0.4549423123065...|[0.38818633160459...|       1.0|
+--------------------+--------+--------------------+--------------------+----------+
only showing top 5 rows



In [68]:
## Evaluate area under the curve
AUC = results_eval.evaluate(results)
print(f'AUC: {AUC}')

AUC: 0.7747561675272518


In [73]:
## Show model coefficients
fit_model.stages[-1].coefficients

DenseVector([-1.3867, -2.4384, -0.0393, -0.5029, 0.0793, 0.0007, 0.43, 1.1177])