In [1]:
import numpy as np # for linear algebra
import pandas as pd #data processing, csv file

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

import os

In [2]:

# Reuse the active SparkSession if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
# Bare expression so the notebook renders the session as the cell output.
spark

### 1. Load the data

In [4]:
# Read the training CSV: the first line supplies the column names and the
# column types are inferred from the data.
sdf_train = spark.read.options(header=True, inferSchema=True).csv("datas/train.csv")

In [6]:
# Print the first rows of the training frame as a text table.
sdf_train.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [7]:
# Print each column's inferred type and nullability.
sdf_train.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [9]:
# List the column names of the training frame.
sdf_train.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [14]:
# limit() only builds a new DataFrame; as a cell output it renders the column
# names and types, not the rows themselves.
sdf_train.limit(2)

DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: string, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: string, Embarked: string]

In [15]:
# Bring a five-row sample of the training data to the driver as pandas.
train_preview = sdf_train.limit(5)
pdf = train_preview.toPandas()
# Transposed view: one column per sampled row, one line per field, which is
# easier to read for a wide frame.
pdf.transpose()

Unnamed: 0,0,1,2,3,4
PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Pclass,3,1,3,1,3
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Sex,male,female,female,female,male
Age,22,38,26,35,35
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450
Fare,7.25,71.2833,7.925,53.1,8.05


In [16]:
# Load the test data set (same columns as the training set, minus Survived)

In [17]:
# Read the test CSV the same way as the training file: header row for the
# column names, column types inferred from the data.
sdf_test = spark.read.options(header=True, inferSchema=True).csv("datas/test.csv")

In [19]:
# Print the first five rows of the test frame.
sdf_test.show(5)

+-----------+------+--------------------+------+----+-----+-----+-------+-------+-----+--------+
|PassengerId|Pclass|                Name|   Sex| Age|SibSp|Parch| Ticket|   Fare|Cabin|Embarked|
+-----------+------+--------------------+------+----+-----+-----+-------+-------+-----+--------+
|        892|     3|    Kelly, Mr. James|  male|34.5|    0|    0| 330911| 7.8292| null|       Q|
|        893|     3|Wilkes, Mrs. Jame...|female|47.0|    1|    0| 363272|    7.0| null|       S|
|        894|     2|Myles, Mr. Thomas...|  male|62.0|    0|    0| 240276| 9.6875| null|       Q|
|        895|     3|    Wirz, Mr. Albert|  male|27.0|    0|    0| 315154| 8.6625| null|       S|
|        896|     3|Hirvonen, Mrs. Al...|female|22.0|    1|    1|3101298|12.2875| null|       S|
+-----------+------+--------------------+------+----+-----+-----+-------+-------+-----+--------+
only showing top 5 rows



In [22]:
# Bring a ten-row sample of the test data to the driver as pandas and
# display it with the notebook's rich table rendering.
test_preview = sdf_test.limit(10)
pdf = test_preview.toPandas()
pdf

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


### 2. Data Cleanup

In [24]:
# Cast Ticket from string to double, then replace nulls with 0.
# NOTE(review): tickets that are not plain numbers (e.g. "A/5 21171") are
# presumably turned into null by the cast and then into 0 by the fill; this
# depends on the session's cast mode, so verify.
# A numeric fill value only applies to numeric columns, so missing Age values
# also become 0 while string columns such as Cabin keep their nulls.
sdf_typecast = (
    sdf_train
    .withColumn("Ticket", sdf_train["Ticket"].cast("double"))
    .fillna(0)
)

### 3. Feature engineering

In [25]:
# Numeric columns of the training frame, spelled exactly as in its schema
# ("PassengerId", "Parch"). Only needed if the optional select() mentioned
# below is enabled; the cells shown here do not otherwise use it.
numeric_cols = ["PassengerId", "Survived", "Pclass", "Age", "SibSp", "Parch", "Ticket", "Fare"]
# Columns that are actually assembled into the model's feature vector.
numeric_features = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
# All columns are kept for now; use sdf_typecast.select(numeric_cols) to
# restrict the frame to the numeric columns.
sdf_train_subset = sdf_typecast

In [26]:
# Ordered list of pipeline stages; later cells add the assembler and the model.
_stages = []

In [27]:
from pyspark.ml.feature import VectorAssembler
# Columns the assembler will combine. This is the same list object as
# numeric_features, not a copy.
assemblerInput = numeric_features
# Placeholder for encoded string features (not enabled):
# [f + "_vect" for f in string_features]
print(assemblerInput)

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']


In [28]:
# Pack the selected feature columns into one vector column, "vect_features",
# which the classifiers read through their featuresCol parameter.
vectAssembler = VectorAssembler(inputCols=assemblerInput, outputCol="vect_features")
# Register the assembler as the first stage. An assembler left over from an
# earlier run of this cell is dropped first, so re-running the cell does not
# pile up duplicate stages the way a plain `+=` would.
_stages[:] = [vectAssembler] + [s for s in _stages if not isinstance(s, VectorAssembler)]

In [29]:
# Show the stages collected so far.
_stages

[VectorAssembler_b83ca0a9833f]

### 4. ML Model

In [30]:
from pyspark.ml.classification import DecisionTreeClassifier

# Single decision tree over the assembled feature vector. The label column is
# spelled "Survived", exactly as in the training schema.
# The tree is only constructed here; it is not added to _stages.
dt = DecisionTreeClassifier(labelCol="Survived", featuresCol="vect_features")

In [31]:
# Display the estimator (the notebook shows its generated identifier).
dt

DecisionTreeClassifier_c2b4d2768e1a

In [32]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest of 100 trees, each at most 4 levels deep, predicting
# "Survived" from the assembled feature vector. The seed is set explicitly so
# training does not depend on the library's default seed.
rf = RandomForestClassifier(
    labelCol="Survived",
    featuresCol="vect_features",
    numTrees=100,
    maxDepth=4,
    seed=42,
)
# Register the forest as the last stage. A forest left over from an earlier
# run of this cell is dropped first, so re-running the cell does not pile up
# duplicate stages the way a plain `+=` would.
_stages[:] = [s for s in _stages if not isinstance(s, RandomForestClassifier)] + [rf]

In [33]:
# Show the full stage list that will go into the pipeline.
_stages

[VectorAssembler_b83ca0a9833f, RandomForestClassifier_5e29cc9e77d8]

In [34]:
from pyspark.ml import Pipeline
# Chain the collected stages; they are applied in list order when fitted.
pipeline = Pipeline(stages = _stages)

In [35]:
# Fit every pipeline stage on the training frame.
model = pipeline.fit(sdf_train_subset)
# Columns kept from the test set, spelled exactly as in its schema
# ("PassengerId").
numeric_cols_test = ["PassengerId", "Pclass", "Age", "SibSp", "Parch", "Ticket", "Fare"]
# Apply the same cleanup as for the training data (Ticket cast to double,
# nulls replaced with 0), then keep only the numeric columns.
sdf_test_subset = (
    sdf_test
    .withColumn("Ticket", sdf_test["Ticket"].cast("double"))
    .fillna(0)
    .select(numeric_cols_test)
)

In [None]:
# Run the fitted pipeline on the prepared test frame and keep the result.
sdf_predict = model.transform(sdf_test_subset)