In [2]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [45]:
spark = SparkSession.builder.appName('titanic-survivors').master('local[4]').getOrCreate()


In [46]:
df = spark.read.csv('../data/titanic.csv', inferSchema=True, header=True)

In [5]:
df.count()

891

In [6]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [11]:
for item, col in zip(df.head(1)[0], df.columns):
    print(col + ': ' + str(item))

PassengerId: 1
Survived: 0
Pclass: 3
Name: Braund, Mr. Owen Harris
Sex: male
Age: 22.0
SibSp: 1
Parch: 0
Ticket: A/5 21171
Fare: 7.25
Cabin: None
Embarked: S


### Take Subset of Columns

In [47]:
my_cols = df.select('Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked')

### Clean Up Missing Data

In [48]:
# going to just drop missing data
final_data = my_cols.na.drop()
final_data.count()

712

In [49]:
# we lost over a hundred records, more than 10%

### One Hot Encoding

In [50]:
from pyspark.ml.feature import (
    VectorAssembler,
    VectorIndexer,
    OneHotEncoder,
    StringIndexer)

In [51]:
# StringIndexer will assign a number to every unique string
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')

# OneHotEncoder will convert StringIndexer into vector binary values
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')

In [52]:
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkedIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkedIndex', outputCol='EmbarkedVec')



In [53]:
assembler = VectorAssembler(
    inputCols=['Pclass', 'SexVec', 'EmbarkedVec', 'Age', 'SibSp', 'Parch', 'Fare'],
    outputCol='features')


### Pipelines!

In [54]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [55]:
# pipeline sets stages for different tasks

In [56]:
log_reg_titanic = LogisticRegression(featuresCol='features', labelCol='Survived')

In [57]:
pipeline = Pipeline(
    stages=[gender_indexer, embark_indexer,
          gender_encoder,embark_encoder,
          assembler, log_reg_titanic])

### Train / Test Split

In [58]:
train_data, test_data = final_data.randomSplit([0.7, 0.3])

### Train Model

In [59]:
fit_model = pipeline.fit(train_data)

### Evaluate Results

In [60]:
results = fit_model.transform(test_data)

In [61]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [62]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')

In [64]:
auc = evaluator.evaluate(results)

In [65]:
auc

0.7445652173913043