In [0]:
#all spark imports
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Build (or reuse) the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName("Titanic-Survival-Prediction")
    .getOrCreate()
)

# Limit the number of shuffle partitions.
# NOTE(review): 4 is assumed to match the number of CPU cores — confirm for the target cluster.
spark.conf.set("spark.sql.shuffle.partitions", 4)

In [0]:
from pyspark.ml.feature import Imputer, StringIndexer, VectorAssembler
from pyspark.ml.linalg import SparseVector, DenseVector
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Spark Data Frame

A DataFrame is an abstraction over RDDs (simplified and optimized):

- distributed collection
- structured (like database tables)
- schema

In [0]:
# Location of the Titanic training CSV
path = "/FileStore/tables/Titanic_train.csv"

# Read the comma-separated file, treating the first row as the header and
# letting Spark infer the column types.
csv_reader = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", ',')
    .option("path", path)
)
df = csv_reader.load()
# Options left disabled:
#   .option('nanValue', ' ')
#   .option('nullValue', ' ')

In [0]:
# Print a plain-text preview of the loaded rows
df.show()

In [0]:
# Render the DataFrame with the notebook's rich table display
display(df)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [0]:
# Number of partitions of the RDD backing the DataFrame
df.rdd.getNumPartitions()

In [0]:
# Print the column names and the types inferred while reading the CSV
df.printSchema()

In [0]:
# Count the null values in every column.
# when() produces a non-null value only for null cells, and count() tallies
# non-null values, so each aggregate is that column's number of nulls.
null_count_exprs = []
for column_name in df.columns:
  is_missing = col(column_name).isNull()
  null_count_exprs.append(count(when(is_missing, column_name)).alias(column_name))

df.select(null_count_exprs).show()

In [0]:
# Condition selecting the rows whose Embarked value is missing
fltr = df.Embarked.isNull()

# Show only the rows matching that condition
df.filter(fltr).show()

In [0]:
def impute_embark(e):
  """Return 'S' when the embarkation value is missing (None), else the value unchanged."""
  return 'S' if e is None else e
  
# The UDF wrapper is kept so that any cell referencing `impute_embark_udf`
# keeps working; the imputation below no longer goes through it.
impute_embark_udf = udf(impute_embark, StringType())

# Replace missing Embarked values with 'S' using the built-in null handling
# rather than a Python UDF: same result (null -> 'S', everything else
# untouched) without shipping every row to a Python worker.
df = df.fillna("S", subset=["Embarked"])

In [0]:
# Re-apply the earlier null filter after imputing Embarked.
# NOTE(review): `fltr` was built from the pre-imputation `df['Embarked']`
# column reference — confirm it resolves as intended against the new `df`.
df.where(fltr).show()

In [0]:
# Mean imputer that reads the Age column and writes Age_Imputed
age_imputer = Imputer(
    inputCols=["Age"],
    outputCols=["Age_Imputed"],
    strategy='mean',
)

# Learn the fill value from the data
age_imputer_model = age_imputer.fit(df)

# Transform df so that it gains the Age_Imputed column
df = age_imputer_model.transform(df)

In [0]:
# Preview the first 10 rows, now including the Age_Imputed column
df.show(10)

In [0]:
# Numeric feature columns
num_cols = [
    'Age_Imputed',
    'Fare',
]

# Categorical feature columns
cat_cols = [
    'Pclass',
    'SibSp',
    'Sex',
    'Parch',
    'Embarked',
]

# Label column for the classifier
label_col = 'Survived'

In [0]:
# Start the model inputs from a *copy* of the numeric columns. Aliasing
# `num_cols` directly would make every later append to `input_cols` grow
# `num_cols` as well.
input_cols = list(num_cols)

In [0]:
# Build one StringIndexer stage per categorical column and register each
# indexer's output column as a model input.
# The loop variable is deliberately not named `col`: that name is bound to
# pyspark.sql.functions.col by the star import, and rebinding it to a string
# breaks any cell that calls col(...) afterwards.
stages = []
for cat_col in cat_cols:
  indexed_col = cat_col + "Index"
  stages.append(StringIndexer(inputCol=cat_col, outputCol=indexed_col))
  # Guard keeps the cell idempotent when it is re-run in the same kernel
  if indexed_col not in input_cols:
    input_cols.append(indexed_col)

In [0]:
# Inspect the pipeline stages collected so far
stages

In [0]:
# Combine all input columns into a single vector column named "features"
# and make the assembler the next pipeline stage.
vect_assembler = VectorAssembler(outputCol="features", inputCols=input_cols)
stages.append(vect_assembler)

In [0]:
# Assemble the stages into a pipeline, fit it on df, then apply the fitted
# pipeline to the same df to obtain the feature-engineered training frame.
pipeline = Pipeline(stages=stages)
pipeline_model = pipeline.fit(df)
train_df = pipeline_model.transform(df)

In [0]:
# Render the transformed frame, including the index columns and the features vector
display(train_df)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_Imputed,PclassIndex,SibSpIndex,SexIndex,ParchIndex,EmbarkedIndex,features
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,22.0,0.0,1.0,0.0,0.0,0.0,"List(0, 7, List(0, 1, 3), List(22.0, 7.25, 1.0))"
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C,38.0,1.0,1.0,1.0,0.0,1.0,"List(1, 7, List(), List(38.0, 71.2833, 1.0, 1.0, 1.0, 0.0, 1.0))"
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,26.0,0.0,0.0,1.0,0.0,0.0,"List(0, 7, List(0, 1, 4), List(26.0, 7.925, 1.0))"
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,35.0,1.0,1.0,1.0,0.0,0.0,"List(1, 7, List(), List(35.0, 53.1, 1.0, 1.0, 1.0, 0.0, 0.0))"
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,35.0,0.0,0.0,0.0,0.0,0.0,"List(0, 7, List(0, 1), List(35.0, 8.05))"
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,29.69911764705882,0.0,0.0,0.0,0.0,2.0,"List(0, 7, List(0, 1, 6), List(29.69911764705882, 8.4583, 2.0))"
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,54.0,1.0,0.0,0.0,0.0,0.0,"List(0, 7, List(0, 1, 2), List(54.0, 51.8625, 1.0))"
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,2.0,0.0,4.0,0.0,1.0,0.0,"List(1, 7, List(), List(2.0, 21.075, 0.0, 4.0, 0.0, 1.0, 0.0))"
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,27.0,0.0,0.0,1.0,2.0,0.0,"List(1, 7, List(), List(27.0, 11.1333, 0.0, 0.0, 1.0, 2.0, 0.0))"
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,14.0,2.0,1.0,1.0,0.0,1.0,"List(1, 7, List(), List(14.0, 30.0708, 2.0, 1.0, 1.0, 0.0, 1.0))"


In [0]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression over the assembled "features" vector, capped at 10 iterations
lr = LogisticRegression(
    featuresCol="features",
    labelCol=label_col,
    maxIter=10,
)

# Fit the model on the transformed training frame
lr_model = lr.fit(train_df)

In [0]:
# Report the fitted model's coefficients and intercept
print("Coefficients: {}".format(lr_model.coefficients))
print("Intercept: {}".format(lr_model.intercept))

In [0]:
# Score the training frame with the fitted logistic regression model
train_pred = lr_model.transform(train_df)

In [0]:
# Render the scored frame, including the model's output columns
display(train_pred)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_Imputed,PclassIndex,SibSpIndex,SexIndex,ParchIndex,EmbarkedIndex,features,rawPrediction,probability,prediction
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,22.0,0.0,1.0,0.0,0.0,0.0,"List(0, 7, List(0, 1, 3), List(22.0, 7.25, 1.0))","List(1, 2, List(), List(2.2276330103688022, -2.2276330103688022))","List(1, 2, List(), List(0.9027036649564477, 0.09729633504355223))",0.0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C,38.0,1.0,1.0,1.0,0.0,1.0,"List(1, 7, List(), List(38.0, 71.2833, 1.0, 1.0, 1.0, 0.0, 1.0))","List(1, 2, List(), List(-1.618928418217814, 1.618928418217814))","List(1, 2, List(), List(0.16535270743823674, 0.8346472925617633))",1.0
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,26.0,0.0,0.0,1.0,0.0,0.0,"List(0, 7, List(0, 1, 4), List(26.0, 7.925, 1.0))","List(1, 2, List(), List(-0.6994031768221536, 0.6994031768221536))","List(1, 2, List(), List(0.3319445644932511, 0.6680554355067488))",1.0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,35.0,1.0,1.0,1.0,0.0,0.0,"List(1, 7, List(), List(35.0, 53.1, 1.0, 1.0, 1.0, 0.0, 0.0))","List(1, 2, List(), List(-1.2344368998016244, 1.2344368998016244))","List(1, 2, List(), List(0.22540580806797106, 0.7745941919320289))",1.0
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,35.0,0.0,0.0,0.0,0.0,0.0,"List(0, 7, List(0, 1), List(35.0, 8.05))","List(1, 2, List(), List(2.122536046526798, -2.122536046526798))","List(1, 2, List(), List(0.8930743446103381, 0.10692565538966203))",0.0
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,29.69911764705882,0.0,0.0,0.0,0.0,2.0,"List(0, 7, List(0, 1, 6), List(29.69911764705882, 8.4583, 2.0))","List(1, 2, List(), List(1.5566971535352891, -1.5566971535352891))","List(1, 2, List(), List(0.825878905044386, 0.17412109495561404))",0.0
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,54.0,1.0,0.0,0.0,0.0,0.0,"List(0, 7, List(0, 1, 2), List(54.0, 51.8625, 1.0))","List(1, 2, List(), List(1.4410154712362575, -1.4410154712362575))","List(1, 2, List(), List(0.808611853722611, 0.19138814627738895))",0.0
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,2.0,0.0,4.0,0.0,1.0,0.0,"List(1, 7, List(), List(2.0, 21.075, 0.0, 4.0, 0.0, 1.0, 0.0))","List(1, 2, List(), List(2.98263614804127, -2.98263614804127))","List(1, 2, List(), List(0.951783492712763, 0.04821650728723698))",0.0
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,27.0,0.0,0.0,1.0,2.0,0.0,"List(1, 7, List(), List(27.0, 11.1333, 0.0, 0.0, 1.0, 2.0, 0.0))","List(1, 2, List(), List(-0.29726389121189634, 0.29726389121189634))","List(1, 2, List(), List(0.4262264835698094, 0.5737735164301906))",1.0
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,14.0,2.0,1.0,1.0,0.0,1.0,"List(1, 7, List(), List(14.0, 30.0708, 2.0, 1.0, 1.0, 0.0, 1.0))","List(1, 2, List(), List(-2.1961834056264036, 2.1961834056264036))","List(1, 2, List(), List(0.10009374448705803, 0.899906255512942))",1.0


In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score with the model's continuous `rawPrediction` output. Feeding the
# thresholded 0/1 `prediction` column instead reduces the ROC curve to a
# single operating point, so the reported area is only a coarse estimate.
evaluator_LR = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol=label_col,
    metricName="areaUnderROC",
)
area_under_curve = evaluator_LR.evaluate(train_pred)

# areaUnderROC is also the evaluator's default metric; it is set explicitly above
print("areaUnderROC = %g" % area_under_curve)

evaluator_LR.getMetricName()

In [0]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Hyper-parameter grid for cross-validation: every combination of the
# regularisation strength, elastic-net mixing and iteration cap listed below.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(lr.regParam, [0.01, 0.5, 2.0])
grid_builder = grid_builder.addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
grid_builder = grid_builder.addGrid(lr.maxIter, [5, 10, 20])
paramGrid = grid_builder.build()

In [0]:
# 5-fold cross-validation of `lr` over the parameter grid, scored by evaluator_LR.
# A fixed seed makes the fold assignment reproducible across kernel restarts.
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator_LR,
    numFolds=5,
    seed=42,
)

# Run the cross-validation
cv_model = cv.fit(train_df)

In [0]:
# Score the training frame with the best model found by cross-validation
best_model = cv_model.bestModel
train_pred = best_model.transform(train_df)

# Calculate the AUC of those predictions.
# NOTE(review): this is measured on the same frame the model was fitted on,
# so it is a training-set figure rather than a held-out estimate.
area_under_curve = evaluator_LR.evaluate(train_pred)

# The evaluator's default metric is areaUnderROC
print("areaUnderROC = %g" % area_under_curve)