In [295]:
import org.apache.spark.sql.types.{StructField, StructType, DoubleType, IntegerType}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.DataFrameNaFunctions
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.classification.DecisionTreeClassifier

val schema = new StructType(Array(
    new StructField("survived", StringType, true),
    new StructField("sex", StringType, true),
    new StructField("age", IntegerType, true),
    new StructField("pclass", StringType, true),
    new StructField("name", StringType, true),
    new StructField("sibsp", IntegerType, true),
    new StructField("parch", IntegerType, true),
    new StructField("ticket", IntegerType, true),
    new StructField("fare", DoubleType, true),
    new StructField("cabin", StringType, true),
    new StructField("embarked", StringType, true),
    new StructField("boat", IntegerType, true),
    new StructField("body", StringType, true),
    new StructField("home_dest", StringType, true)
))

val data = spark.read.format("csv").schema(schema).option("header",true).option("sep","\t").load("titanic.tsv")

In [215]:
data.show(1)

+--------+------+---+------+--------------------+-----+-----+------+--------+-----+--------+----+----+------------+
|survived|   sex|age|pclass|                name|sibsp|parch|ticket|    fare|cabin|embarked|boat|body|   home_dest|
+--------+------+---+------+--------------------+-----+-----+------+--------+-----+--------+----+----+------------+
|       y|female| 29| first|Allen, Miss. Elis...|    0|    0| 24160|211.3375|   B5|       S|   2|null|St Louis, MO|
+--------+------+---+------+--------------------+-----+-----+------+--------+-----+--------+----+----+------------+
only showing top 1 row



# 1.  
Did age have any affect on the survival of the passengers? Divide the passengers into age  
groups spanning 5 years each - \[0, 5), \[5, 10), \[10, 15), … . For each group compute the  
number of passengers in each group. Then compute the percent of survivors in each group.

In [218]:
def ageRangeSurvived(A: Int, B: Int) = {
    val dropped = data.na.drop("any",Seq("age"))
    val count = dropped.filter(f"age >= $A AND age < $B").count.toFloat
    val surv = dropped.filter(f"age >= $A AND age < $B").filter("survived == 'y'").count.toFloat
    
    println(f"ages $A to $B count: $count")
    println(f"survived count: $surv")
    println(f"percent survived: ${surv/count}\n")
}

In [219]:
var age = 0
while (age+5 <= 80) {
    ageRangeSurvived(age, age+5)
    age+=5
}

ages 0 to 5 count: 39.0
survived count: 23.0
percent survived: 0.5897436

ages 5 to 10 count: 31.0
survived count: 17.0
percent survived: 0.5483871

ages 10 to 15 count: 24.0
survived count: 11.0
percent survived: 0.45833334

ages 15 to 20 count: 113.0
survived count: 45.0
percent survived: 0.39823008

ages 20 to 25 count: 180.0
survived count: 71.0
percent survived: 0.39444444

ages 25 to 30 count: 156.0
survived count: 56.0
percent survived: 0.35897437

ages 30 to 35 count: 124.0
survived count: 53.0
percent survived: 0.42741936

ages 35 to 40 count: 97.0
survived count: 43.0
percent survived: 0.44329897

ages 40 to 45 count: 66.0
survived count: 20.0
percent survived: 0.3030303

ages 45 to 50 count: 64.0
survived count: 32.0
percent survived: 0.5

ages 50 to 55 count: 43.0
survived count: 21.0
percent survived: 0.4883721

ages 55 to 60 count: 26.0
survived count: 11.0
percent survived: 0.42307693

ages 60 to 65 count: 26.0
survived count: 10.0
percent survived: 0.3846154

ages 65 to

# 2.  
Logistic on age. Using logistic regression with independent variable age and dependent  
variable survived create a model to classify passengers as survivors.

In [246]:
val avgAge = data.select(mean("age")).first().getDouble(0).round
println(avgAge)

30


In [281]:
val survAge = data.select("survived","age")

In [282]:
// Filling null with avg age wasn't working, so dropped instead
//val fillAge = survAge.na.fill(avgAge,Seq("age"))
//fillAge.show(5)

val dropAge = survAge.na.drop("any",Seq("age"))

In [283]:
val myFormala = new RFormula().setFormula("survived ~ age") 
val fittedRF = myFormala.fit(dropAge)
val preparedDF = fittedRF.transform(dropAge)

val Array(train, test) = preparedDF.randomSplit(Array(0.7, 0.3))

In [284]:
val lr = new LogisticRegression()
val lrModel = lr.fit(train)

In [285]:
println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

Coefficients: [-0.0029284445414183716] Intercept: -0.3169927674483765


In [286]:
val predictions = lrModel.evaluate(test).predictions
val wrongPredictions = predictions.where(expr("label != prediction"))

val countErrors = wrongPredictions.groupBy("label").agg(count("prediction").alias("Errors"))
countErrors.show

+-----+------+
|label|Errors|
+-----+------+
|  1.0|   135|
+-----+------+



In [287]:
val rightPredictions = predictions.where(expr("label == prediction"))
val countCorrect = rightPredictions.groupBy("label").agg(count("prediction").alias("Correct"))
countCorrect.show

+-----+-------+
|label|Correct|
+-----+-------+
|  0.0|    166|
+-----+-------+



# 3.  
Logistic on age, sex and pclass. Same as problem two but use independent variables sex,  
age, and pclass. Since sex and pclass are categorical they need special treatment.  

In [289]:
val ageSexPcl = data.select("survived","age","sex","pclass")

In [290]:
val dropP3 = ageSexPcl.na.drop("any",Seq("age"))

In [291]:
val myFormala = new RFormula().setFormula("survived ~ age + sex + pclass") 
val fittedRF = myFormala.fit(dropP3)
val preparedDF = fittedRF.transform(dropP3)

val Array(train, test) = preparedDF.randomSplit(Array(0.7, 0.3))

In [292]:
val lr = new LogisticRegression()
val lrModel = lr.fit(train)

println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

Coefficients: [-0.03584821831384887,-2.423412661170741,-1.0407856587945303,1.360953595290399] Intercept: 2.262976288959575


In [293]:
val predictions = lrModel.evaluate(test).predictions
val wrongPredictions = predictions.where(expr("label != prediction"))
val countErrors = wrongPredictions.groupBy("label").agg(count("prediction").alias("Errors"))
countErrors.show

+-----+------+
|label|Errors|
+-----+------+
|  0.0|    31|
|  1.0|    31|
+-----+------+



In [294]:
val rightPredictions = predictions.where(expr("label == prediction"))
val countCorrect = rightPredictions.groupBy("label").agg(count("prediction").alias("Correct"))
countCorrect.show

+-----+-------+
|label|Correct|
+-----+-------+
|  0.0|    143|
|  1.0|     88|
+-----+-------+



# 4.  
Decision tree. Instead of using logistic regression use Decision tree with the independent  
variables sex, age, and pclass.  

In [296]:
val decTree = data.select("survived","age","sex","pclass")
val dropP4 = decTree.na.drop("any",Seq("age"))

In [297]:
val myFormala = new RFormula().setFormula("survived ~ age + sex + pclass") 
val fittedRF = myFormala.fit(dropP4)
val preparedDF = fittedRF.transform(dropP4)

val Array(train, test) = preparedDF.randomSplit(Array(0.7, 0.3))

In [298]:
val dt = new DecisionTreeClassifier()
val dtModel = dt.fit(train)

In [299]:
val predictions = dtModel.transform(test)
val wrongPredictions = predictions.where(expr("label != prediction"))
val countErrors = wrongPredictions.groupBy("label").agg(count("prediction").alias("Errors"))
countErrors.show

+-----+------+
|label|Errors|
+-----+------+
|  0.0|    19|
|  1.0|    46|
+-----+------+



In [300]:
val rightPredictions = predictions.where(expr("label == prediction"))
val countCorrect = rightPredictions.groupBy("label").agg(count("prediction").alias("Correct"))
countCorrect.show

+-----+-------+
|label|Correct|
+-----+-------+
|  0.0|    150|
|  1.0|     76|
+-----+-------+



# 5.  
How do the models created in problems 2-4 compare based on the false positives & false  
negatives the produce on your test data.

Problem 2: Logistic Regression (survived ~ age)  
  - accuracy = 55.15%  
  - false pos = 44.85%  
  - false neg = 0%  
  
Problem 3: Logistic Regression (survived ~ age + sex + pclass)  
  - accuracy = 78.84%  
  - false pos = 17.82%  
  - false neg = 26.05%  
  
Problem 4: Decision Tree (survived ~ age + sex + pclass)  
  - accuracy = 77.66%  
  - false pos = 23.47%  
  - false neg = 20.00%  