In [1]:
import pyspark

# getOrCreate() returns the already-running context when there is one, so
# re-running this cell does not fail with
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [2]:
import pandas as pd

# Use the fully-qualified option name: abbreviated keys are resolved by
# pattern matching and can break if another option with a similar name exists.
pd.set_option('display.max_colwidth', 300)

In [3]:
# Delete leftover *.lck files under metastore_db/ before creating the SQL context
# (presumably stale metastore locks from an earlier session -- TODO confirm).
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
# SQL entry point bound to the SparkContext `sc`; used below to read the CSV files.
sqlContext = SQLContext(sc)

### Step 1
- Load the train and test sets
- Check the schema: do the variables have the right types?
- If not, how can the datasets be loaded correctly?

In [4]:
def load_titanic_csv(path):
    """Read a CSV file with a header row into a Spark DataFrame.

    Column types are inferred from the data ('inferschema') and the reader
    runs in DROPMALFORMED mode.

    :param path: location of the CSV file to load.
    :return: the loaded DataFrame.
    """
    return (
        sqlContext.read.format('com.databricks.spark.csv')
        .option('header', 'true')
        .option('inferschema', 'true')
        .option('mode', 'DROPMALFORMED')
        .load(path)
    )


# Training set
train = load_titanic_csv('../data/titanic/train.csv')
train.show()

# Test set
test = load_titanic_csv('../data/titanic/test.csv')
test.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [5]:
# Inspect the column types produced by the 'inferschema' option used when loading
train.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



### Step 2
- Explore the features of your dataset
- You can use DataFrame's ***describe*** method to get summary statistics
    - hint: ***toPandas*** may be useful to ease the manipulation of small dataframes
- Are there any ***NaN*** values in your dataset?
- If so, define value/values to fill these ***NaN*** values
    - hint: the ***na*** property of DataFrames provides several methods for handling NA values

In [6]:
# The summary table is tiny, so bring it to the driver as a pandas frame
# indexed by the statistic name (the 'summary' column).
train_desc = (
    train
    .describe()
    .toPandas()
    .set_index('summary')
)
train_desc

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
summary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
mean,446.0,0.3838383838383838,2.308641975308642,,,29.69911764705882,0.5230078563411896,0.3815937149270482,260318.54916792738,32.2042079685746,,
stddev,257.3538420152301,0.4865924542648575,0.8360712409770491,,,14.526497332334037,1.1027434322934315,0.8060572211299488,471609.26868834975,49.69342859718089,,
min,1.0,0.0,1.0,"""Andersson, Mr. August Edvard (""""Wennerstrom"""")""",female,0.42,0.0,0.0,110152,0.0,A10,C
max,891.0,1.0,3.0,"van Melkebeke, Mr. Philemon",male,80.0,8.0,6.0,WE/P 5735,512.3292,T,S


In [7]:
# Computing correlations between Survived and some features
print({col:train.stat.corr('Survived',col) for col in ['Pclass','Age','SibSp','Parch','Fare']})

# Checking which columns have NULL values
print({col:train.where(train[col].isNull()).count() for col in train.columns})

# Taking the mean age from the Pandas DF (describe() values are strings, hence float())
ageMean = float(train_desc.loc['mean', 'Age'])
print(ageMean)

# Most frequent port of embarkation. NULL forms its own group in groupby, so
# exclude it first: otherwise NULL itself could be selected as the fill value.
embarkedMode = (
    train.where(~train["Embarked"].isNull())
    .groupby("Embarked")
    .count()
    .sort("count", ascending=False)
    .take(1)[0][0]
)

print(embarkedMode)

{'Pclass': -0.3384810359610151, 'Age': 0.010539215871285682, 'SibSp': -0.0353224988857356, 'Parch': 0.08162940708348339, 'Fare': 0.2573065223849626}
{'PassengerId': 0, 'Survived': 0, 'Pclass': 0, 'Name': 0, 'Sex': 0, 'Age': 177, 'SibSp': 0, 'Parch': 0, 'Ticket': 0, 'Fare': 0, 'Cabin': 687, 'Embarked': 2}
29.69911764705882
S


In [110]:
from pyspark.sql.functions import pandas_udf, PandasUDFType

@pandas_udf(train.schema, PandasUDFType.GROUPED_MAP)
def pandas_fill_age(pdf):
    """Replace missing ``Age`` values in one group with that group's mean age.

    Grouped-map UDF: receives a pandas.DataFrame and returns a
    pandas.DataFrame declared with the schema of ``train``.
    """
    group_mean = pdf['Age'].mean()
    return pdf.assign(Age=pdf['Age'].fillna(group_mean))

# Apply the fill per (Pclass, Sex) group and preview the first ten passengers
(
    train.groupby('Pclass', 'Sex')
    .apply(pandas_fill_age)
    .sort('PassengerId')
    .limit(10)
    .toPandas()
)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,26.507589,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [66]:
# Fill missing Age / Embarked with the statistics computed on the training
# set, and drop Cabin from both sets.
fillValues = {"Age": ageMean, "Embarked": embarkedMode}

trainFilled = train.na.fill(fillValues).drop("Cabin")
# The test set gets the same training-set fill values. It must be defined
# here because the Step 7 cells read `testFilled`.
testFilled = test.na.fill(fillValues).drop("Cabin")

print({col:trainFilled.where(trainFilled[col].isNull()).count() for col in trainFilled.columns})

{'PassengerId': 0, 'Survived': 0, 'Pclass': 0, 'Name': 0, 'Sex': 0, 'Age': 0, 'SibSp': 0, 'Parch': 0, 'Ticket': 0, 'Fare': 0, 'Embarked': 0}


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,26.507589,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


### Step 3
- How to handle categorical features?
    - hint: check the Estimators and Transformers
- Assemble all desired features into a Vector using the VectorAssembler Transformer
- Make sure to end up with a DataFrame with two columns: ***Survived*** and ***vFeatures***

In [9]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.util import MLUtils

# Sex: string -> numeric index -> one-hot vector (last category dropped).
m2 = StringIndexer(inputCol="Sex", outputCol="Sex1").fit(trainFilled)
m3 = OneHotEncoder(dropLast=True, inputCol="Sex1", outputCol="Sex2")
sexEncoded = m3.transform(m2.transform(trainFilled))

# Embarked: string -> numeric index -> one-hot vector (all categories kept).
m4 = StringIndexer(inputCol="Embarked", outputCol="Embarked1", handleInvalid="skip").fit(sexEncoded)
m5 = OneHotEncoder(dropLast=False, inputCol="Embarked1", outputCol="Embarked2")
embarkedEncoded = m5.transform(m4.transform(sexEncoded))

# Ticket: numeric index only, no one-hot encoding.
m6 = StringIndexer(inputCol="Ticket", outputCol="Ticket1").fit(embarkedEncoded)
ticketIndexed = m6.transform(embarkedEncoded)

# Pclass is already an integer column, so it is one-hot encoded directly.
m7 = OneHotEncoder(dropLast=False, inputCol="Pclass", outputCol="Pclass1")
transformed = m7.transform(ticketIndexed)

transformed.select("Embarked2", "Sex2", "Ticket1", "Pclass1").show()

+-------------+-------------+-------+-------------+
|    Embarked2|         Sex2|Ticket1|      Pclass1|
+-------------+-------------+-------+-------------+
|(3,[0],[1.0])|(1,[0],[1.0])|  257.0|(4,[3],[1.0])|
|(3,[1],[1.0])|    (1,[],[])|  608.0|(4,[1],[1.0])|
|(3,[0],[1.0])|    (1,[],[])|  292.0|(4,[3],[1.0])|
|(3,[0],[1.0])|    (1,[],[])|   46.0|(4,[1],[1.0])|
|(3,[0],[1.0])|(1,[0],[1.0])|  425.0|(4,[3],[1.0])|
|(3,[2],[1.0])|(1,[0],[1.0])|  269.0|(4,[3],[1.0])|
|(3,[0],[1.0])|(1,[0],[1.0])|  438.0|(4,[1],[1.0])|
|(3,[0],[1.0])|(1,[0],[1.0])|   12.0|(4,[3],[1.0])|
|(3,[0],[1.0])|    (1,[],[])|   27.0|(4,[3],[1.0])|
|(3,[1],[1.0])|    (1,[],[])|   55.0|(4,[2],[1.0])|
|(3,[0],[1.0])|    (1,[],[])|   65.0|(4,[3],[1.0])|
|(3,[0],[1.0])|    (1,[],[])|  510.0|(4,[1],[1.0])|
|(3,[0],[1.0])|(1,[0],[1.0])|  658.0|(4,[3],[1.0])|
|(3,[0],[1.0])|(1,[0],[1.0])|    0.0|(4,[3],[1.0])|
|(3,[0],[1.0])|    (1,[],[])|  635.0|(4,[3],[1.0])|
|(3,[0],[1.0])|    (1,[],[])|  336.0|(4,[2],[1.0])|
|(3,[2],[1.0

In [10]:
# Columns concatenated, in this order, into the single feature vector
featureCols = ["Pclass1", "Sex2", "Age", "SibSp", "Parch", "Fare", "Embarked2"]

assembler = VectorAssembler(inputCols=featureCols, outputCol="vfeatures")

assembled = assembler.transform(transformed)

In [11]:
# Keeping only the label column (Survived) and the assembled feature vector
assembled2 = assembled.select("Survived","vfeatures")

In [12]:
# Preview the first five (label, feature vector) rows as a pandas frame
assembled2.limit(5).toPandas()

Unnamed: 0,Survived,vfeatures
0,0,"(0.0, 0.0, 0.0, 1.0, 1.0, 22.0, 1.0, 0.0, 7.25, 1.0, 0.0, 0.0)"
1,1,"(0.0, 1.0, 0.0, 0.0, 0.0, 38.0, 1.0, 0.0, 71.2833, 0.0, 1.0, 0.0)"
2,1,"(0.0, 0.0, 0.0, 1.0, 0.0, 26.0, 0.0, 0.0, 7.925, 1.0, 0.0, 0.0)"
3,1,"(0.0, 1.0, 0.0, 0.0, 0.0, 35.0, 1.0, 0.0, 53.1, 1.0, 0.0, 0.0)"
4,0,"(0.0, 0.0, 0.0, 1.0, 1.0, 35.0, 0.0, 0.0, 8.05, 1.0, 0.0, 0.0)"


In [13]:
# Compact version of the indexing / encoding steps above, built as one Pipeline

from pyspark.ml import Pipeline

# String columns that need a StringIndexer before they can be one-hot encoded
binary = ['Sex', 'Embarked']

# Columns to one-hot encode; string columns are replaced by their "_index" output
categorical = [
    name + '_index' if name in binary else name
    for name in ['Pclass', 'SibSp', 'Embarked']
]

indexers = [StringIndexer(inputCol=name, outputCol=name + "_index") for name in binary]
encoders = [OneHotEncoder(inputCol=name, outputCol=name + "_encoded") for name in categorical]
encoder_indexers = indexers + encoders
encoder_pipeline = Pipeline(stages=encoder_indexers)

compact_encoded = encoder_pipeline.fit(trainFilled).transform(trainFilled)
compact_encoded.limit(3).toPandas()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Sex_index,Embarked_index,Pclass_encoded,SibSp_encoded,Embarked_index_encoded
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0.0,0.0,"(0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)"
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C,1.0,1.0,"(0.0, 1.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)"
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1.0,0.0,"(0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)"


### Step 4
- Apply a normalization Estimator of your choice to the ***features*** vector obtained in Step 3

In [14]:
from pyspark.ml.feature import StandardScaler

# Centre (withMean) and scale to unit standard deviation (withStd) each slot
# of the assembled feature vector.
scalar2 = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="vfeatures",
    outputCol="scaledFeat",
)
scalar2Model = scalar2.fit(assembled)
sclaed = scalar2Model.transform(assembled)

sclaed.select("vfeatures", "scaledFeat").limit(2).toPandas()

Unnamed: 0,vfeatures,scaledFeat
0,"(0.0, 0.0, 0.0, 1.0, 1.0, 22.0, 1.0, 0.0, 7.25, 1.0, 0.0, 0.0)","[0.0, -0.565367891708, -0.509865182364, 0.902080720168, 0.73728104523, -0.592148025766, 0.432550428042, -0.473407724568, -0.502163136516, 0.61549273983, -0.481772097386, -0.307389700253]"
1,"(0.0, 1.0, 0.0, 0.0, 0.0, 38.0, 1.0, 0.0, 71.2833, 0.0, 1.0, 0.0)","[0.0, 1.76677466159, -0.509865182364, -1.10730408401, -1.35481262133, 0.638430443947, 0.432550428042, -0.473407724568, 0.786403617835, -1.62289106094, 2.07334063339, -0.307389700253]"


### Step 5
- Instead of doing transformations on separate steps, put everything together with a Pipeline

In [15]:
from pyspark.ml.pipeline import Pipeline

# Encoding, assembling and scaling chained as one pipeline. The Ticket indexer
# (m6) is not included: its output Ticket1 is not among the assembler inputs.
preprocessingStages = [m2, m3, m4, m5, m7, assembler, scalar2]
pipeline = Pipeline(stages=preprocessingStages)

model = pipeline.fit(trainFilled)
scaled = model.transform(trainFilled)
scaled.limit(1).toPandas()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Sex1,Sex2,Embarked1,Embarked2,Pclass1,vfeatures,scaledFeat
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0.0,(1.0),0.0,"(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0)","(0.0, 0.0, 0.0, 1.0, 1.0, 22.0, 1.0, 0.0, 7.25, 1.0, 0.0, 0.0)","[0.0, -0.565367891708, -0.509865182364, 0.902080720168, 0.73728104523, -0.592148025766, 0.432550428042, -0.473407724568, -0.502163136516, 0.61549273983, -0.481772097386, -0.307389700253]"


### Step 6
- Train a classifier of your choice (for instance, Random Forest) using your dataset of LabeledPoints
- Make predictions for the training data
- Use the evaluators to find the Area Under ROC and Accuracy of your model
- How is your model performing? Try to tune its parameters

In [16]:
# Number of rows in the filled training set
trainFilled.count()

891

In [17]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql import functions as F
        
# Pin the seed explicitly so the forest, and every metric computed from it,
# does not depend on the library's default seed.
RF_SEED = 42

rfc = RandomForestClassifier(
    labelCol="Survived",
    featuresCol='scaledFeat',
    numTrees=10,
    seed=RF_SEED,
)

# Same preprocessing stages as the Step 5 pipeline, with the classifier appended
pipeline = Pipeline(stages=[m2, m3, m4, m5, m7, assembler, scalar2, rfc])

model = pipeline.fit(trainFilled)

# Predictions on the data the model was trained on
predictionsRFC = model.transform(trainFilled)

predictionsRFC.select('Survived','prediction').limit(10).toPandas()

Unnamed: 0,Survived,prediction
0,0,0.0
1,1,1.0
2,1,0.0
3,1,1.0
4,0,0.0
5,0,0.0
6,0,0.0
7,0,0.0
8,1,1.0
9,1,1.0


In [18]:
def accuracy(predictions):
    """Return the share of rows where ``prediction`` matches ``Survived``.

    Computed as ``1 - sum(abs(Survived - prediction)) / row_count``; this
    equals classification accuracy assuming both columns hold only 0/1 values.

    :param predictions: DataFrame with ``Survived`` and ``prediction`` columns.
    :return: accuracy as a float.
    """
    absSum = predictions.withColumn("absDiff", F.expr('abs(Survived - prediction)'))
    # Aggregate the named column rather than relying on '*' expansion
    misclassified = absSum.agg({'absDiff': 'sum'}).collect()[0][0]
    return 1 - misclassified / predictions.count()

In [19]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Area-under-ROC evaluator: labels come from Survived, scores from rawPrediction
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="Survived",
    metricName="areaUnderROC",
)

# predictionsRFC was produced from trainFilled, so this scores the training data
roc = evaluator.evaluate(predictionsRFC)

print("validation roc")
print(roc)

validation roc
0.9024196039582867


### Step 7
- Take a look at the test data - use DataFrame's ***createOrReplaceTempView*** method to perform SQL queries over the data
    - hint: check if there are any NULL values in the dataset - if so, handle them
- Apply the transformations to the test data
    - hint: include the model in the pipeline
- Make predictions using the model previously trained and the transformed test data

In [20]:
# NULL count per column before handling Fare
print({
    col: testFilled.where(testFilled[col].isNull()).count()
    for col in testFilled.columns
})

# Mean fare of the training set, read from the describe() summary
fareMean = float(train_desc.loc['mean', 'Fare'])
print(fareMean)

testFilled = testFilled.na.fill({"Fare": fareMean})

# NULL count per column after the fill
print({
    col: testFilled.where(testFilled[col].isNull()).count()
    for col in testFilled.columns
})

testFilled.describe().toPandas()

{'PassengerId': 0, 'Pclass': 0, 'Name': 0, 'Sex': 0, 'Age': 0, 'SibSp': 0, 'Parch': 0, 'Ticket': 0, 'Fare': 1, 'Embarked': 0}
32.2042079685746
{'PassengerId': 0, 'Pclass': 0, 'Name': 0, 'Sex': 0, 'Age': 0, 'SibSp': 0, 'Parch': 0, 'Ticket': 0, 'Fare': 0, 'Embarked': 0}


Unnamed: 0,summary,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,count,418.0,418.0,418,418,418.0,418.0,418.0,418,418.0,418
1,mean,1100.5,2.2655502392344498,,,30.15460315226568,0.4473684210526316,0.3923444976076555,223850.98986486485,35.61899954059464,
2,stddev,120.81045760473994,0.8418375519640503,,,12.636665857359937,0.8967595611217135,0.9814288785371694,369523.7764694362,55.84075146716072,
3,min,892.0,1.0,"""Assaf Khalil, Mrs. Mariana (Miriam"""")""""""",female,0.17,0.0,0.0,110469,0.0,C
4,max,1309.0,3.0,"van Billiard, Master. Walter John",male,76.0,8.0,9.0,W.E.P. 5734,512.3292,S


In [21]:
testPredictions = model.transform(testFilled)
# Only five rows are displayed, so limit on the Spark side instead of
# collecting the whole prediction frame to the driver first.
testPredictions.limit(5).toPandas()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Sex1,Sex2,Embarked1,Embarked2,Pclass1,vfeatures,scaledFeat,rawPrediction,probability,prediction
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q,0.0,(1.0),2.0,"(0.0, 0.0, 1.0)","(0.0, 0.0, 0.0, 1.0)","(0.0, 0.0, 0.0, 1.0, 1.0, 34.5, 0.0, 0.0, 7.8292, 0.0, 0.0, 1.0)","[0.0, -0.565367891708, -0.509865182364, 0.902080720168, 0.73728104523, 0.369241403697, -0.474278822276, -0.473407724568, -0.490507671873, -1.62289106094, -0.481772097386, 3.24954825982]","[8.50378778221, 1.49621221779]","[0.850378778221, 0.149621221779]",0.0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S,1.0,(0.0),0.0,"(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 47.0, 1.0, 0.0, 7.0, 1.0, 0.0, 0.0)","[0.0, -0.565367891708, -0.509865182364, 0.902080720168, -1.35481262133, 1.33063083316, 0.432550428042, -0.473407724568, -0.507193982788, 0.61549273983, -0.481772097386, -0.307389700253]","[5.64087165297, 4.35912834703]","[0.564087165297, 0.435912834703]",0.0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q,0.0,(1.0),2.0,"(0.0, 0.0, 1.0)","(0.0, 0.0, 1.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 1.0, 62.0, 0.0, 0.0, 9.6875, 0.0, 0.0, 1.0)","[0.0, -0.565367891708, 1.9591015431, -1.10730408401, 0.73728104523, 2.48429814852, -0.474278822276, -0.473407724568, -0.453112385364, -1.62289106094, -0.481772097386, 3.24954825982]","[8.77087814536, 1.22912185464]","[0.877087814536, 0.122912185464]",0.0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S,0.0,(1.0),0.0,"(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0)","(0.0, 0.0, 0.0, 1.0, 1.0, 27.0, 0.0, 0.0, 8.6625, 1.0, 0.0, 0.0)","[0.0, -0.565367891708, -0.509865182364, 0.902080720168, 0.73728104523, -0.207592253981, -0.474278822276, -0.473407724568, -0.473738855079, 0.61549273983, -0.481772097386, -0.307389700253]","[8.68794231338, 1.31205768662]","[0.868794231338, 0.131205768662]",0.0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S,1.0,(0.0),0.0,"(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 22.0, 1.0, 1.0, 12.2875, 1.0, 0.0, 0.0)","[0.0, -0.565367891708, -0.509865182364, 0.902080720168, -1.35481262133, -0.592148025766, 0.432550428042, 0.767198988933, -0.400791584135, 0.61549273983, -0.481772097386, -0.307389700253]","[4.92035542062, 5.07964457938]","[0.492035542062, 0.507964457938]",1.0


### Step 8

- Load the answers for the ***test*** data
- Combine it with your predictions into a single DataFrame
- Use the evaluator you created in ***Step 6***
- What was your score?

In [22]:
# Answers for the test passengers, read with the same CSV options as the other files
answersReader = (
    sqlContext.read.format('com.databricks.spark.csv')
    .option('header', 'true')
    .option('inferschema', 'true')
    .option('mode', 'DROPMALFORMED')
)
testAnswers = answersReader.load('titanic_answers.csv')

# Attach the predictions to the answers by passenger id
joined = testAnswers.join(testPredictions, on="PassengerId")

# Evaluate the predictions
testRoc = evaluator.evaluate(joined)

print("test roc")
print(testRoc)

print("test acc")
print(accuracy(joined))

test roc
0.8194741966893864
test acc
0.7799043062200957


In [111]:
# Shut down the SparkContext created in the first cell
sc.stop()