In [1]:
def shape(data):
    num_rows = data.count()
    num_columns = len(data.columns)
    print(f"Shape: ({num_rows}, {num_columns})")

In [2]:
import warnings
warnings.filterwarnings("ignore") # Ignores all warnings

In [3]:
import findspark
findspark.init()

import pyspark

In [4]:
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder.appName('titanic').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/07 14:31:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
df = spark.read.csv('titanic.csv', inferSchema=True, header=True)

In [7]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [8]:
shape(df)

Shape: (891, 12)


In [9]:
df.show(3)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 3 rows



In [11]:
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [21]:
df_new = df.select(['Survived',
 'Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked'])

In [13]:
# ### Check NaN, Null
from pyspark.sql.functions import count, when, isnan, col

In [24]:
df_new.select([count(when(isnan(c), c)).alias(c) for c in df_new.columns]).toPandas().T

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,0
SibSp,0
Parch,0
Fare,0
Embarked,0


In [25]:
df_new.select([count(when(col(c).isNull(), c)).alias(c) for c in df_new.columns]).toPandas().T

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
SibSp,0
Parch,0
Fare,0
Embarked,2


In [27]:
final_df = df_new.na.drop()

In [28]:
final_df.select([count(when(col(c).isNull(), c)).alias(c) for c in final_df.columns]).toPandas().T

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,0
SibSp,0
Parch,0
Fare,0
Embarked,0


### Working with categorical columns

In [29]:
from pyspark.ml.feature import (OneHotEncoder, StringIndexer, VectorAssembler, VectorIndexer)

In [30]:
gender_indexer = StringIndexer(inputCol="Sex", outputCol="SetIndex")
gender_encoder = OneHotEncoder(inputCol="SetIndex", outputCol="SexVec")

In [31]:
embark_indexer = StringIndexer(inputCol="Embarked", outputCol="EmbarkIndex")
embark_encoder = OneHotEncoder(inputCol="EmbarkIndex", outputCol="EmbarkVec")

In [None]:
from pyspark.ml.linalg import Vectors

In [32]:
df_new.columns

['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

In [33]:
assembler = VectorAssembler(
    inputCols=['Pclass', 'SexVec', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkVec'],
    outputCol="features")

In [34]:
from pyspark.ml.classification import LogisticRegression

### Pipelines

In [35]:
from pyspark.ml import Pipeline

In [36]:
log_reg_titanic = LogisticRegression(featuresCol='features',
                                     labelCol='Survived')

In [41]:
pipeline = Pipeline(stages=[gender_indexer, embark_indexer,
                            gender_encoder, embark_encoder,
                            assembler,
                            log_reg_titanic])

In [42]:
train_titanic, valid_titanic = final_df.randomSplit([.7, .3])

In [43]:
fit_model = pipeline.fit(train_titanic)

25/07/07 15:09:17 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/07/07 15:09:17 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


In [44]:
results = fit_model.transform(valid_titanic)

In [46]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [47]:
eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                     labelCol='Survived')

In [49]:
results.select('Survived', 'prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [50]:
AUC = eval.evaluate(results)
AUC

0.7483146067415731