In [1]:
import pyspark
import findspark
import time
import os.path
from itertools import chain
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler, OneHotEncoder, SQLTransformer
from pyspark.ml.linalg import DenseVector
from pyspark.ml.linalg import Vectors
from pyspark.sql import SQLContext, Row
from pyspark.sql import functions as F
from pyspark.sql.functions import concat,translate,lit,col,isnan,count,when,split,explode,ltrim,create_map

In [2]:
#initialise spark
findspark.init()
sc = pyspark.SparkContext(appName='Classifier')
sql = pyspark.SQLContext(sc)

In [3]:
def create_dataframes(directory):
    """
    Creates dataframes from directory
    Must be named 'train' or 'test'. 
    Returns only train if test N/A
    
    Inputs: String 
    
    Returns: Dataframes/Dataframe
    
    """
    if os.path.exists(directory):
        train = directory+"/train.csv"
        if os.path.exists(train):
            df_train = sql.read.csv(train, 
                         header = True,
                         inferSchema = True)
        else:
            raise ValueError("train.csv not found in %s" % directory)
        
        test = directory+"/test.csv"
        if os.path.exists(test):
            df_test = sql.read.csv(train, 
                         header = True,
                         inferSchema = True)
            return df_train,df_test
        
        return df_train
        
    else:
        raise ValueError("%s does not exist" % directory)   

In [4]:
df_train,df_test= create_dataframes('./data')

In [5]:
# combine train and test
df_train = df_train.withColumn('Mark',lit('train'))
df_test  = df_test.withColumn('Mark',lit('test'))
df = df_train.unionAll(df_test)

In [6]:
#missing values by column
for column in df.columns:
    missing = df.where(df[column].isNull()).count()
    print("Missing values for %s : %s" % (column,missing))

Missing values for PassengerId : 0
Missing values for Survived : 0
Missing values for Pclass : 0
Missing values for Name : 0
Missing values for Sex : 0
Missing values for Age : 354
Missing values for SibSp : 0
Missing values for Parch : 0
Missing values for Ticket : 0
Missing values for Fare : 0
Missing values for Cabin : 1374
Missing values for Embarked : 4
Missing values for Mark : 0


In [7]:
#cabin has a high amount of missing values so I will remove it 
df = df.drop('Cabin')

In [8]:
#fill missing values with the mean
def fill_null_with_mean(df):
    """
    Replaces null numeric values with
    mean value
    Replaces categorical string values
    with mode
    input: spark dataframe
    returns: spark dataframe
    
    """
    
    x = df.cache()
    
    for column in df.schema.fields:
        dtype = "%s" % column.dataType
        if dtype != "StringType":
            mean = df.groupBy().mean(column.name).first()[0]
            x = x.na.fill({column.name:mean})
        else:
            counts = df.groupBy(column.name).count()
            mode = counts.join(
            counts.agg(F.max("count").alias("max_")),
            col("count") == col("max_")
            ).limit(1).select(column.name)
            x = x.na.fill({column.name:mode.first()[0]})     
    return x

df = fill_null_with_mean(df)

In [9]:
#Title cleanse 
df = df.withColumn('Surname',split('Name',', ')[0])
df = df.withColumn('name_split',split('Name',', ')[1])
df = df.withColumn('Title',ltrim(split('name_split','. ')[0]))
df = df.drop('name_split')
title_dictionary = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Sir",
    "Don":        "Sir",
    "Sir" :       "Sir",
    "Dr":         "Mr",
    "Rev":        "Mr",
    "the Countess":"Lady",
    "Dona":       "Lady",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Lady"
}

#x = df['Title'].map(Title_Dictionary)
mapping_expr = create_map([lit(x) for x in chain(*title_dictionary.items())])

df = df.withColumn("Title", mapping_expr.getItem(col("Title")))

In [10]:
# create binary column 'Mother'
df = df.withColumn('Mother',when((df['Sex'] =='female')&
                                (df['Age'] > 18)&
                                (df['Parch'] > 0)
                                 ,'Mother').otherwise('null'))

#create a family size column
df = df.withColumn('Family_size',(df['SibSp'] + df['Parch'] + 1))

# create a family id column
df = df.withColumn('Family_id',when(df['Family_size']>2,
                                   (concat(df['Surname'],
                                    df['Family_size']))).otherwise('null'))

In [11]:
df.select('Title','Mother','Family_size','Family_id').show(10)

+------+------+-----------+---------+
| Title|Mother|Family_size|Family_id|
+------+------+-----------+---------+
|    Mr|  null|          2|     null|
|   Mrs|  null|          2|     null|
|  Miss|  null|          1|     null|
|   Mrs|  null|          2|     null|
|    Mr|  null|          1|     null|
|    Mr|  null|          1|     null|
|    Mr|  null|          1|     null|
|Master|  null|          5| Palsson5|
|   Mrs|Mother|          3| Johnson3|
|   Mrs|  null|          2|     null|
+------+------+-----------+---------+
only showing top 10 rows



In [12]:
#drop columns 
df = df.drop('PassengerId','Ticket','Surname')

In [50]:
print(df.columns)
print(df.schema.fields)

['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Mark', 'Title', 'Mother', 'Family_size', 'Family_id']
[StructField(Survived,IntegerType,true), StructField(Pclass,IntegerType,true), StructField(Name,StringType,false), StructField(Sex,StringType,false), StructField(Age,DoubleType,false), StructField(SibSp,IntegerType,true), StructField(Parch,IntegerType,true), StructField(Fare,DoubleType,false), StructField(Embarked,StringType,false), StructField(Mark,StringType,false), StructField(Title,StringType,true), StructField(Mother,StringType,false), StructField(Family_size,IntegerType,true), StructField(Family_id,StringType,true)]


In [68]:
def split_on_column_types(df):
    """
    Create array of numeric and string
    
    """
    
    string = []
    numeric = []
    
    for col in df.schema.fields:
        x = "%s" % col.dataType
        if x == "StringType":
            string.append(col.name)
        else:
            numeric.append(col.name)
            
            
    return string,numeric

categorical,numeric = split_on_column_types(df)
categorical.remove('Mark')
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in categorical]
encoders = [OneHotEncoder(inputCol=column+"_index",outputCol=column+"_vec") for column in categorical]

In [69]:
indexers.extend(encoders)
print(indexers)

[StringIndexer_4d37a1bd3635c2fb2b7d, StringIndexer_462284fed8a282f0caf8, StringIndexer_4b12888a89ac1eda375e, StringIndexer_44ab8ebf2538add62582, StringIndexer_49b4a918a6283d3d4c1e, StringIndexer_4abda78e797ab568e003, OneHotEncoder_4fe5a8004b4be13d0435, OneHotEncoder_4b4b9f8b6d00e2129ed2, OneHotEncoder_4b4c8e420b4c21caff15, OneHotEncoder_455bb54394d87faac097, OneHotEncoder_43eeae02f7c512770f79, OneHotEncoder_4b25872e40ef9541ea90]


In [70]:
# #scale numeric columns
# from pyspark.ml.feature import StandardScaler
# scalers = [StandardScaler(inputCol=column, outputCol=column+"_index"
#                          ,withStd=False,withMean=False
#                          ).fit(df) for column in numeric]


In [71]:
df.show(2)

+--------+------+--------------------+------+----+-----+-----+-------+--------+-----+-----+------+-----------+---------+
|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|   Fare|Embarked| Mark|Title|Mother|Family_size|Family_id|
+--------+------+--------------------+------+----+-----+-----+-------+--------+-----+-----+------+-----------+---------+
|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|   7.25|       S|train|   Mr|  null|          2|     null|
|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|71.2833|       C|train|  Mrs|  null|          2|     null|
+--------+------+--------------------+------+----+-----+-----+-------+--------+-----+-----+------+-----------+---------+
only showing top 2 rows



In [111]:
pipeline = Pipeline(stages = indexers)
df_indexed = pipeline.fit(df).transform(df)
from functools import reduce
from pyspark.sql import DataFrame
to_remove = categorical + ['Name_index','Sex_index',
                           'Embarked_index',
                           'Title_index','Mother_index',
                           'Family_id_index']
df_vec=reduce(DataFrame.drop,to_remove,df_indexed)

In [100]:
from pyspark.ml.classification import LogisticRegression
 
# regPara: lasso regularisation parameter (L1)
lr = LogisticRegression(maxIter = 100, regParam = 0.05, labelCol='index').fit(train)

Py4JJavaError: An error occurred while calling o3059.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 262.0 failed 1 times, most recent failure: Lost task 0.0 in stage 262.0 (TID 1879, localhost, executor driver): org.apache.spark.SparkException: Failed to execute user defined function($anonfun$5: (string) => double)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:215)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1038)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: StringIndexer encountered NULL value. To handle or skip NULLS, try setting StringIndexer.handleInvalid.
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$5.apply(StringIndexer.scala:213)
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$5.apply(StringIndexer.scala:208)
	... 23 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2119)
	at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:1026)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.reduce(RDD.scala:1008)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1128)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:517)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:487)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:278)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function($anonfun$5: (string) => double)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:215)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1038)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more
Caused by: org.apache.spark.SparkException: StringIndexer encountered NULL value. To handle or skip NULLS, try setting StringIndexer.handleInvalid.
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$5.apply(StringIndexer.scala:213)
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$5.apply(StringIndexer.scala:208)
	... 23 more


In [114]:
sc.stop()