# Spark Exercise

Instructions
Create a logistic regression model and a random forest classifier to predict whether each passenger survived the sinking of the Titanic.

In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import pyspark

from pyspark import SparkFiles
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, QuantileDiscretizer
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import mean,col,split, col, regexp_extract, when, lit

In [2]:
# Obtain a Spark session: getOrCreate() reuses an existing one or starts a new one
spark_builder = SparkSession.builder
session = spark_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/07/24 00:37:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the Titanic CSV: the first row holds column names, and column types are inferred
df = session.read.csv(
    'titanic_dataset.csv',
    header=True,
    inferSchema=True,
)
# Preview the first five rows
df.show(n=5)

                                                                                

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [4]:
# Summary statistics for every column
summary_stats = df.describe()
summary_stats.show()

                                                                                

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                null|  null| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

## Data Preparation

### Check for Null Values

In [5]:
def null_value_count(df):
  """Return the columns of ``df`` that contain nulls, with their null counts.

  Args:
    df: Spark DataFrame to inspect.

  Returns:
    A list of ``(column_name, null_count)`` tuples, in ``df.columns`` order,
    containing only the columns that have at least one null value.
  """
  # Local import: ``count`` is not among the names imported at the top of the notebook.
  from pyspark.sql.functions import count

  if not df.columns:
    return []
  # count() ignores nulls and when() without otherwise() yields null for rows
  # that do not match, so each aggregate equals that column's null count.
  # All columns are aggregated in one pass instead of one Spark job per column.
  null_counts = df.select(
    [count(when(col(k).isNull(), True)).alias(k) for k in df.columns]
  ).first()
  return [(k, n) for k, n in zip(df.columns, null_counts) if n > 0]

In [6]:
# Collect (column, null count) pairs for every column that has missing values
null_columns_count_list = null_value_count(df=df)

In [7]:
# Show the null counts as a two-column table
null_summary_schema = ['Column_With_Null_Value', 'Null_Values_Count']
null_summary = session.createDataFrame(null_columns_count_list, null_summary_schema)
null_summary.show()

+----------------------+-----------------+
|Column_With_Null_Value|Null_Values_Count|
+----------------------+-----------------+
|                   Age|              177|
|                 Cabin|              687|
|              Embarked|                2|
+----------------------+-----------------+



### Fill in Missing Age

In [8]:
# Extract the honorific from Name: the run of letters immediately before a period (e.g. "Mr", "Mrs").
# The pattern is a raw string so that "\." reaches the regex engine as written,
# instead of being parsed by Python as an (invalid) string escape sequence.
df = df.withColumn("Initial", regexp_extract(col("Name"), r"([A-Za-z]+)\.", 1))
df.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Initial|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|     Mr|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|    Mrs|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|   Miss|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|    Mrs|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|     Mr|
+-----------+---

In [9]:
# List every distinct title that was extracted
distinct_initials = df.select("Initial").distinct()
distinct_initials.show()

+--------+
| Initial|
+--------+
|     Don|
|    Miss|
|Countess|
|     Col|
|     Rev|
|    Lady|
|  Master|
|     Mme|
|    Capt|
|      Mr|
|      Dr|
|     Mrs|
|     Sir|
|Jonkheer|
|    Mlle|
|   Major|
|      Ms|
+--------+



In [10]:
# Collapse rare titles into the five groups used for age imputation.
# A dict keeps each (old -> new) pairing on one line, and subset restricts the
# replacement to the Initial column so identical strings in other columns are untouched.
title_groups = {
    'Mlle': 'Miss',
    'Mme': 'Miss',
    'Ms': 'Miss',
    'Dr': 'Mr',
    'Major': 'Mr',
    'Lady': 'Mrs',
    'Countess': 'Mrs',
    'Jonkheer': 'Other',
    'Col': 'Other',
    'Rev': 'Other',
    'Capt': 'Mr',
    'Sir': 'Mr',
    'Don': 'Mr',
}
df = df.replace(title_groups, subset=['Initial'])

In [11]:
# Mean age per title group; these averages inform the imputation values used next
age_by_initial = df.groupBy('Initial').avg('Age')
age_by_initial.collect()

[Row(Initial='Miss', avg(Age)=21.86),
 Row(Initial='Other', avg(Age)=45.888888888888886),
 Row(Initial='Master', avg(Age)=4.574166666666667),
 Row(Initial='Mr', avg(Age)=32.73960880195599),
 Row(Initial='Mrs', avg(Age)=35.981818181818184)]

In [12]:
# Fill each missing Age with a fixed value chosen per title group
# (the per-group mean ages computed above, rounded).
fill_age_by_initial = {"Miss": 22, "Other": 46, "Master": 5, "Mr": 33, "Mrs": 36}
for title, fill_age in fill_age_by_initial.items():
    age_missing_for_title = (df["Initial"] == title) & (df["Age"].isNull())
    df = df.withColumn("Age", when(age_missing_for_title, fill_age).otherwise(df["Age"]))


### Fill in Missing Embarked Values

In [13]:
# Count passengers per port of embarkation (nulls form their own group)
embarked_counts = df.groupBy("Embarked").count()
embarked_counts.show()

+--------+-----+
|Embarked|count|
+--------+-----+
|       Q|   77|
|    null|    2|
|       C|  168|
|       S|  644|
+--------+-----+



In [14]:
# Fill missing Embarked values with 'S', the most frequent port in the counts above
df = df.fillna('S', subset=["Embarked"])

### Drop Cabin Column

In [16]:
# Cabin is null for 687 of 891 rows, so drop it rather than impute it
mostly_null_columns = ["Cabin"]
df = df.drop(*mostly_null_columns)

In [17]:
# Print the schema after cleaning; Cabin should no longer be listed
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = false)
 |-- Initial: string (nullable = true)



### Encode Categorical Variables

In [18]:
# Index each categorical column into a numeric "<column>_encoded" column
categorical_columns = ["Sex"]
indexers = []
for column in categorical_columns:
    indexer = StringIndexer(inputCol=column, outputCol=column+"_encoded")
    indexers.append(indexer.fit(df))
# The stages are already fitted; the pipeline applies them in order
pipeline = Pipeline(stages=indexers)
pipeline_model = pipeline.fit(df)
df = pipeline_model.transform(df)

In [19]:
# Print the schema after encoding; Sex_encoded should now be listed
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = false)
 |-- Initial: string (nullable = true)
 |-- Sex_encoded: double (nullable = false)



## Model Preprocessing

In [20]:
# Remove the columns that are not used as model inputs
# (dropping a name that is not present is a no-op in Spark)
columns_to_drop = [
    "PassengerId",
    "Name",
    "Ticket",
    "Cabin",
    "Embarked",
    "Sex",
    "Initial",
    "Fare",
]
df = df.drop(*columns_to_drop)

In [21]:
# Assemble every column except the label into a single "features" vector.
# The label is excluded by name rather than by position, so the feature set
# stays correct even if the column order of df changes.
label_column = "Survived"
feature_columns = [name for name in df.columns if name != label_column]
feature = VectorAssembler(inputCols=feature_columns, outputCol="features")
feature_vector = feature.transform(df)

In [22]:
# Split into 80% training and 20% test data; the seed is fixed for the split
split_weights = [0.8, 0.2]
train, test = feature_vector.randomSplit(split_weights, seed=11)

## Logistic Regression Model

In [23]:
# LogisticRegression is imported in the import cell at the top of the notebook.
# Fit a logistic regression on the training split with Survived as the label
logreg = LogisticRegression(labelCol="Survived", featuresCol="features")
logreg_model = logreg.fit(train)

# Score the held-out test split and preview predictions next to the true labels
logreg_prediction = logreg_model.transform(test)
logreg_prediction.select("prediction", "Survived", "features").show()

# Accuracy evaluator; reused by the later evaluation cells
evaluator = MulticlassClassificationEvaluator(labelCol="Survived", predictionCol="prediction", metricName="accuracy")

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       1.0|       0|(5,[0,1],[1.0,22.0])|
|       1.0|       0|(5,[0,1],[1.0,24.0])|
|       1.0|       0|(5,[0,1],[1.0,29.0])|
|       0.0|       0|[1.0,29.0,1.0,0.0...|
|       1.0|       0|(5,[0,1],[1.0,30.0])|
|       0.0|       0|[1.0,31.0,1.0,0.0...|
|       1.0|       0|(5,[0,1],[1.0,33.0])|
|       1.0|       0|(5,[0,1],[1.0,33.0])|
|       1.0|       0|(5,[0,1],[1.0,33.0])|
|       1.0|       0|(5,[0,1],[1.0,33.0])|
|       1.0|       0|(5,[0,1],[1.0,33.0])|
|       0.0|       0|[1.0,37.0,1.0,0.0...|
|       0.0|       0|[1.0,38.0,0.0,1.0...|
|       0.0|       0|[1.0,45.0,1.0,0.0...|
|       0.0|       0|(5,[0,1],[1.0,47.0])|
|       0.0|       0|[1.0,58.0,0.0,2.0...|
|       0.0|       0|(5,[0,1],[1.0,62.0])|
|       0.0|       0|(5,[0,1],[1.0,65.0])|
|       0.0|       0|(5,[0,1],[1.0,71.0])|
|       0.0|       0|[2.0,19.0,1.0,1.0...|
+----------

In [24]:
# Accuracy of the logistic regression predictions on the test split
logreg_accuracy = evaluator.evaluate(logreg_prediction)
print(f"Accuracy of LogisticRegression is = {logreg_accuracy:g}")

Accuracy of LogisticRegression is = 0.718085


## Random Forest Model

In [25]:
# RandomForestClassifier is imported in the import cell at the top of the notebook.
# Random forests are stochastic, so fix the seed explicitly to make the
# fitted model and its accuracy repeatable across runs.
rf = RandomForestClassifier(labelCol="Survived", featuresCol="features", seed=11)
rf_model = rf.fit(train)

# Score the held-out test split and preview predictions next to the true labels
rf_prediction = rf_model.transform(test)
rf_prediction.select("prediction", "Survived", "features").show()

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       0.0|       0|(5,[0,1],[1.0,22.0])|
|       0.0|       0|(5,[0,1],[1.0,24.0])|
|       0.0|       0|(5,[0,1],[1.0,29.0])|
|       0.0|       0|[1.0,29.0,1.0,0.0...|
|       0.0|       0|(5,[0,1],[1.0,30.0])|
|       0.0|       0|[1.0,31.0,1.0,0.0...|
|       0.0|       0|(5,[0,1],[1.0,33.0])|
|       0.0|       0|(5,[0,1],[1.0,33.0])|
|       0.0|       0|(5,[0,1],[1.0,33.0])|
|       0.0|       0|(5,[0,1],[1.0,33.0])|
|       0.0|       0|(5,[0,1],[1.0,33.0])|
|       0.0|       0|[1.0,37.0,1.0,0.0...|
|       0.0|       0|[1.0,38.0,0.0,1.0...|
|       0.0|       0|[1.0,45.0,1.0,0.0...|
|       0.0|       0|(5,[0,1],[1.0,47.0])|
|       0.0|       0|[1.0,58.0,0.0,2.0...|
|       0.0|       0|(5,[0,1],[1.0,62.0])|
|       0.0|       0|(5,[0,1],[1.0,65.0])|
|       0.0|       0|(5,[0,1],[1.0,71.0])|
|       0.0|       0|[2.0,19.0,1.0,1.0...|
+----------

In [26]:
# Accuracy of the random forest predictions on the test split
rf_accuracy = evaluator.evaluate(rf_prediction)
print(f"Accuracy of RandomForestClassifier is = {rf_accuracy:g}")

Accuracy of RandomForestClassifier is = 0.781915
