In [1]:
import os
import sys
sys.path.append('/usr/spark-2.1.1/python') # spark path
import pyspark
from pyspark.sql import Row, DataFrameReader
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType, IntegerType
from pyspark.sql.functions import *

In [2]:
conf = pyspark.SparkConf()

conf.setMaster("spark://master:7077")
conf.setAppName("Titanic")

sc = pyspark.SparkContext(conf=conf)
TRAIN_DATA_PATH = '/root/workspace/kkchuchu.mushroom/data/titanic/train.csv'
TEST_DATA_PATH = '/root/workspace/kkchuchu.mushroom/data/titanic/test.csv'

In [3]:
spark = SparkSession(sc)

In [90]:
# read data
# the resource should be accessible from all workers as well.
passengers = spark.read.csv(TRAIN_DATA_PATH, header=True)

|Variable Name | Description|
|-------------|-------------|
|Survived|Survived (1) or died (0)|
|Pclass|	Passenger’s class|
|Name|	Passenger’s name|
|Sex|	Passenger’s sex|
|Age|	Passenger’s age|
|SibSp|	Number of siblings/spouses aboard|
|Parch|	Number of parents/children aboard|
|Ticket|	Ticket number|
|Fare|	Fare|
|Cabin|	Cabin|
|Embarked|	Port of embarkation|

### Feature Details

- Pclass is the Ticket-class: first (1), second (2), and third (3) class tickets were used. This is an ordinal integer feature.
- Name is the name of the passenger. The names also contain titles and some persons might share the same surname; indicating family relations. We know that some titles can indicate a certain age group. For instance Master is a boy while Mr is a man. This feature is a character string of variable length but similar format.
- Sex is an indicator whether the passenger was female or male. This is a categorical text string feature.
- Age is the integer age of the passenger. There are NaN values in this column.
- SibSp is another ordinal integer feature describing the number of siblings or spouses travelling with each passenger.
- Parch is another ordinal integer features that gives the number of parents or children travelling with each passenger.
- Ticket is a character string of variable length that gives the ticket number.
- Fare is a float feature showing how much each passenger paid for their rather memorable journey.
- Cabin gives the cabin number of each passenger. There are NaN in this column. This is another string feature.
Embarked shows the port of embarkation as a categorical character value.

In [8]:
passengers = passengers.withColumn('age', passengers['Age'].cast(DoubleType()))
passengers = passengers.withColumn('sibsp', passengers['SibSp'].cast(DoubleType()))
passengers = passengers.withColumn('parch', passengers['Parch'].cast(DoubleType()))
passengers = passengers.withColumn('fare', passengers['Fare'].cast(DoubleType()))

In [9]:
passengers.describe(['age', 'sibsp', 'parch', 'fare']).show()

+-------+------------------+------------------+-------------------+-----------------+
|summary|               age|             sibsp|              parch|             fare|
+-------+------------------+------------------+-------------------+-----------------+
|  count|               714|               891|                891|              891|
|   mean| 29.69911764705882|0.5230078563411896|0.38159371492704824| 32.2042079685746|
| stddev|14.526497332334035|1.1027434322934315| 0.8060572211299488|49.69342859718089|
|    min|              0.42|               0.0|                0.0|              0.0|
|    max|              80.0|               8.0|                6.0|         512.3292|
+-------+------------------+------------------+-------------------+-----------------+



In [10]:
# Fill Embarked Null 
passengers = passengers.withColumn("Embarked",
                                   when(passengers.PassengerId.isin('62'), "C")
                                   .otherwise(passengers.Embarked))

passengers = passengers.withColumn("Embarked",
                                   when(passengers.PassengerId.isin('830'), "C")
                                   .otherwise(passengers.Embarked))

In [11]:
passengers = passengers.withColumn("sex",
                                   when(passengers.Sex.isin('male'), "1")
                                   .otherwise('0'))

In [75]:
# extract Cabin first char as deck
passengers = passengers.withColumn("deck", 
                                   when(passengers.Cabin.isNull(), "null")
                                   .otherwise(passengers.Cabin.substr(0, 2)))

In [None]:
# mice imputation
# 多重插值法

In [45]:
passengers.printSchema()

root
 |-- PassengerId: string (nullable = true)
 |-- Survived: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- sex: string (nullable = false)
 |-- age: double (nullable = true)
 |-- sibsp: double (nullable = true)
 |-- parch: double (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [91]:
passengers.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|  22|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|  38|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|  26|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|  35|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|  35|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [89]:
passengers.select(passengers.Name).filter(passengers.Name.like('%Miss%')).show()

+--------------------+
|                Name|
+--------------------+
|Heikkinen, Miss. ...|
|Sandstrom, Miss. ...|
|Bonnell, Miss. El...|
|Vestrom, Miss. Hu...|
|"McGowan, Miss. A...|
|Palsson, Miss. To...|
|"O'Dwyer, Miss. E...|
|Glynn, Miss. Mary...|
|Vander Planke, Mi...|
|Nicola-Yarred, Mi...|
|Laroche, Miss. Si...|
|Devaney, Miss. Ma...|
|O'Driscoll, Miss....|
|   Rugg, Miss. Emily|
|West, Miss. Const...|
| Icard, Miss. Amelie|
|Andersson, Miss. ...|
|Goodwin, Miss. Li...|
|Dowdell, Miss. El...|
|McDermott, Miss. ...|
+--------------------+
only showing top 20 rows



In [51]:
# extract Cabin first char as deck
passengers = passengers.withColumn("embarked", 
                                   when(passengers.Embarked.isin("Q"), 0)
                                   .when(passengers.Embarked.isin("C"), 1)
                                   .when(passengers.Embarked.isin("S"), 2))

In [77]:
# full$Mother <- 'Not Mother'
# full$Mother[full$Sex == 'female' & full$Parch > 0 & full$Age > 18 & full$Title != 'Miss'] <- 'Mother'
passengers = passengers.withColumn("mother", 
                                   when( ( (passengers.sex == 0) & 
                                           (passengers.age > 18) & 
                                           (passengers.Name.like('%Miss%') ) ), 1)
                                   .otherwise(0))

In [78]:
passengers.select(passengers.PassengerId, passengers.Embarked).filter((passengers.PassengerId == '830') | (passengers.PassengerId == '62')).collect()

[Row(PassengerId='62', Embarked='C'), Row(PassengerId='830', Embarked='C')]

In [83]:
lab = passengers
lab = lab.drop('Name')
lab = lab.drop('Ticket')
lab = lab.drop('Cabin')

In [20]:
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint

In [84]:
def parse(l):
    target = l.Survived
    features = [l.sex, l.Pclass, l.sibsp, l.parch, l.fare, l.embarked, l.mother]
    return LabeledPoint(target, features)

lab = lab.rdd.map(lambda x: parse(x))

In [85]:
lab.take(num=10)

[LabeledPoint(0.0, [1.0,3.0,1.0,0.0,7.25,2.0,0.0]),
 LabeledPoint(1.0, [0.0,1.0,1.0,0.0,71.2833,1.0,0.0]),
 LabeledPoint(1.0, [0.0,3.0,0.0,0.0,7.925,2.0,1.0]),
 LabeledPoint(1.0, [0.0,1.0,1.0,0.0,53.1,2.0,0.0]),
 LabeledPoint(0.0, [1.0,3.0,0.0,0.0,8.05,2.0,0.0]),
 LabeledPoint(0.0, [1.0,3.0,0.0,0.0,8.4583,0.0,0.0]),
 LabeledPoint(0.0, [1.0,1.0,0.0,0.0,51.8625,2.0,0.0]),
 LabeledPoint(0.0, [1.0,3.0,3.0,1.0,21.075,2.0,0.0]),
 LabeledPoint(1.0, [0.0,3.0,0.0,2.0,11.1333,2.0,0.0]),
 LabeledPoint(1.0, [0.0,2.0,1.0,0.0,30.0708,1.0,0.0])]

In [86]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = lab.randomSplit([0.7, 0.3])

In [87]:
model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={0:2, 1:4, 5:3, 6:2},
                                     numTrees=10, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)

In [88]:
def f(x):
    (v, p) = x
    if v != p:
        return True
    else:
        return False
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda x: f(x)).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification forest model:')
print(model.toDebugString())

# Save and load model
# model.save(sc, "target/tmp/myRandomForestClassificationModel")
# sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestClassificationModel")

Test Error = 0.20588235294117646
Learned classification forest model:
TreeEnsembleModel classifier with 10 trees

  Tree 0:
    If (feature 4 <= 10.1708)
     If (feature 4 <= 7.5208)
      If (feature 0 in {0.0})
       If (feature 4 <= 7.0458)
        Predict: 0.0
       Else (feature 4 > 7.0458)
        Predict: 1.0
      Else (feature 0 not in {0.0})
       If (feature 5 in {0.0,2.0})
        Predict: 0.0
       Else (feature 5 not in {0.0,2.0})
        Predict: 0.0
     Else (feature 4 > 7.5208)
      If (feature 3 <= 1.0)
       If (feature 0 in {0.0})
        Predict: 1.0
       Else (feature 0 not in {0.0})
        Predict: 0.0
      Else (feature 3 > 1.0)
       Predict: 1.0
    Else (feature 4 > 10.1708)
     If (feature 6 in {1.0})
      If (feature 2 <= 1.0)
       If (feature 1 in {3.0})
        Predict: 0.0
       Else (feature 1 not in {3.0})
        Predict: 1.0
      Else (feature 2 > 1.0)
       Predict: 0.0
     Else (feature 6 not in {1.0})
      If (feature 4 <= 52

In [92]:
# sex change to person and add child type when age < 16

In [None]:
# https://www.kaggle.com/omarelgabry/a-journey-through-titanic
# drop(['PassengerId','Name','Ticket']
# fill Embarked with S because S is the most seen value.
# drop Embarked value is S
# if Parch + SibSp > 0 : family = 1 else family = 0
# As we see, children(age < ~16) on aboard seem to have a high chances for Survival.
# So, we can classify passengers as males, females, and child
# drop p-class 3