In [502]:
import pyspark
import findspark
import time
import os.path
from itertools import chain
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.ml.feature import StringIndexer,IndexToString, VectorIndexer, VectorAssembler, OneHotEncoder, SQLTransformer
from pyspark.ml.linalg import DenseVector
from pyspark.sql.types import StringType,StructField,IntegerType,DoubleType
from pyspark.ml.linalg import Vectors
from pyspark.sql import SQLContext, Row
from pyspark.sql import functions as F
from pyspark.sql.functions import concat,translate,lit,col,isnan,count,when,split,explode,ltrim,create_map

In [195]:
#initialise spark
findspark.init()
sc = pyspark.SparkContext(appName='Classifier')
sql = pyspark.SQLContext(sc)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=Classifier, master=local[*]) created by __init__ at <ipython-input-26-8f4850d1539f>:3 

In [537]:
def create_dataframes(directory,schema_train=None,schema_test=None):
    """
    Creates dataframes from directory
    Must be named 'train' or 'test'. 
    Returns only train if test N/A
    
    Inputs: String, schema defaults to false
    and will infer from input .csv else will apply
    specified schema/schemas
    
    Returns: Dataframes/Dataframe
    
    """
    inferSchema = True if schema==None else False
    schema_test = schema_train if schema_test==None else schema_test
    
    if os.path.exists(directory):
        train = directory+"/train.csv"
        if os.path.exists(train):
            df_train = sql.read.csv(train, 
                         header = True,
                         inferSchema = inferSchema,
                         schema=schema)
        else:
            raise ValueError("train.csv not found in %s" % directory)
        
        test = directory+"/test.csv"
        if os.path.exists(test):
            df_test = sql.read.csv(test, 
                         header = True,
                         inferSchema = inferSchema,
                         schema=schema_test)
            
            return df_train,df_test
        
        return df_train
        
    else:
        raise ValueError("%s does not exist" % directory)   

In [538]:
#specify schema
schema= StructType([
    StructField("PassengerId",IntegerType(),True),
    StructField("Survived",StringType(),True),
    StructField("Pclass",StringType(),True),
    StructField("Name",StringType(),True),
    StructField("Sex",StringType(),True),
    StructField("Age",DoubleType(),True),
    StructField("SibSp",DoubleType(),True),
    StructField("Parch",DoubleType(),True),
    StructField("Ticket",StringType(),True),
    StructField("Fare",DoubleType(),True),
    StructField("Cabin",StringType(),True),
    StructField("Embarked",StringType(),True)])

schema_test= StructType([
    StructField("PassengerId",IntegerType(),True),
    StructField("Pclass",StringType(),True),
    StructField("Name",StringType(),True),
    StructField("Sex",StringType(),True),
    StructField("Age",DoubleType(),True),
    StructField("SibSp",DoubleType(),True),
    StructField("Parch",DoubleType(),True),
    StructField("Ticket",StringType(),True),
    StructField("Fare",DoubleType(),True),
    StructField("Cabin",StringType(),True),
    StructField("Embarked",StringType(),True)])



df_train,df_test= create_dataframes('./data',schema_train=schema,schema_test=schema_test)

In [539]:
print(df_train.schema.fields)

print("'''''''''''''''''''")
print(df_test.schema.fields)

[StructField(PassengerId,IntegerType,true), StructField(Survived,StringType,true), StructField(Pclass,StringType,true), StructField(Name,StringType,true), StructField(Sex,StringType,true), StructField(Age,DoubleType,true), StructField(SibSp,DoubleType,true), StructField(Parch,DoubleType,true), StructField(Ticket,StringType,true), StructField(Fare,DoubleType,true), StructField(Cabin,StringType,true), StructField(Embarked,StringType,true)]
'''''''''''''''''''
[StructField(PassengerId,IntegerType,true), StructField(Pclass,StringType,true), StructField(Name,StringType,true), StructField(Sex,StringType,true), StructField(Age,DoubleType,true), StructField(SibSp,DoubleType,true), StructField(Parch,DoubleType,true), StructField(Ticket,StringType,true), StructField(Fare,DoubleType,true), StructField(Cabin,StringType,true), StructField(Embarked,StringType,true)]


In [540]:
df_train.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|  1.0|  0.0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|  1.0|  0.0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|  0.0|  0.0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|  1.0|  0.0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|  0.0|  0.0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|  0.0|  0.0|      

In [521]:
# combine train and test
df_train = df_train.withColumn('Mark',lit('train'))
df_test  = df_test.withColumn('Mark',lit('test'))
#drop survived to append train/test for cleaning
df = df_train.drop('Survived')
df = df.unionAll(df_test)

AnalysisException: "Union can only be performed on tables with the same number of columns, but the first table has 12 columns and the second table has 13 columns;;\n'Union\n:- Project [PassengerId#95275, Pclass#95277, Name#95278, Sex#95279, Age#95280, SibSp#95281, Parch#95282, Ticket#95283, Fare#95284, Cabin#95285, Embarked#95286, Mark#95325]\n:  +- Project [PassengerId#95275, Survived#95276, Pclass#95277, Name#95278, Sex#95279, Age#95280, SibSp#95281, Parch#95282, Ticket#95283, Fare#95284, Cabin#95285, Embarked#95286, train AS Mark#95325]\n:     +- Relation[PassengerId#95275,Survived#95276,Pclass#95277,Name#95278,Sex#95279,Age#95280,SibSp#95281,Parch#95282,Ticket#95283,Fare#95284,Cabin#95285,Embarked#95286] csv\n+- Project [PassengerId#95300, Survived#95301, Pclass#95302, Name#95303, Sex#95304, Age#95305, SibSp#95306, Parch#95307, Ticket#95308, Fare#95309, Cabin#95310, Embarked#95311, test AS Mark#95340]\n   +- Relation[PassengerId#95300,Survived#95301,Pclass#95302,Name#95303,Sex#95304,Age#95305,SibSp#95306,Parch#95307,Ticket#95308,Fare#95309,Cabin#95310,Embarked#95311] csv\n"

In [511]:
df_train.schema.fields

[StructField(PassengerId,IntegerType,true),
 StructField(Survived,IntegerType,true),
 StructField(Pclass,IntegerType,true),
 StructField(Name,StringType,true),
 StructField(Sex,StringType,true),
 StructField(Age,DoubleType,true),
 StructField(SibSp,IntegerType,true),
 StructField(Parch,IntegerType,true),
 StructField(Ticket,StringType,true),
 StructField(Fare,DoubleType,true),
 StructField(Cabin,StringType,true),
 StructField(Embarked,StringType,true),
 StructField(Mark,StringType,false)]

In [447]:
#missing values by column
for column in df.columns:
    missing = df.where(df[column].isNull()).count()
    print("Missing values for %s : %s" % (column,missing))

Missing values for PassengerId : 0
Missing values for Pclass : 0
Missing values for Name : 0
Missing values for Sex : 0
Missing values for Age : 263
Missing values for SibSp : 0
Missing values for Parch : 0
Missing values for Ticket : 0
Missing values for Fare : 1
Missing values for Cabin : 1014
Missing values for Embarked : 2
Missing values for Mark : 0


In [448]:
#cabin has a high amount of missing values so I will remove it 
df = df.drop('Cabin')

In [449]:
#fill missing values with the mean
def fill_null_with_mean(df):
    """
    Replaces null numeric values with
    mean value
    Replaces categorical string values
    with mode
    input: spark dataframe
    returns: spark dataframe
    
    """
    
    x = df.cache()
    
    for column in df.schema.fields:
        dtype = "%s" % column.dataType
        if dtype != "StringType":
            mean = df.groupBy().mean(column.name).first()[0]
            x = x.na.fill({column.name:mean})
        else:
            counts = df.groupBy(column.name).count()
            mode = counts.join(
            counts.agg(F.max("count").alias("max_")),
            col("count") == col("max_")
            ).limit(1).select(column.name)
            x = x.na.fill({column.name:mode.first()[0]})     
    return x

df = fill_null_with_mean(df)

The cleaning method above could be much improved to replace missing values than with the mean but for this notebook I wanted something quick

In [450]:
#remove spaces
spaceDeleteUDF = F.udf(lambda s: s.replace(" ", ""),StringType())
df=df.withColumn('Name',spaceDeleteUDF(df["Name"]))

In [451]:
#Title cleanse 
df = df.withColumn('Surname',split('Name',',')[0])
df = df.withColumn('name_split',F.trim(split('Name',',')[1]))
df = df.withColumn('Title',split('name_split','\\.')[0])
title_dictionary = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Sir",
    "Don":        "Sir",
    "Sir" :       "Sir",
    "Dr":         "Mr",
    "Rev":        "Mr",
    "theCountess":"Lady",
    "Dona":       "Lady",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Lady"
}

#x = df['Title'].map(Title_Dictionary)
mapping_expr = create_map([lit(x) for x in chain(*title_dictionary.items())])

df = df.withColumn("Title", mapping_expr.getItem(col("Title")))

In [452]:
# create binary column 'Mother'
df = df.withColumn('Mother',when((df['Sex'] =='female')&
                                (df['Age'] > 18)&
                                (df['Parch'] > 0)
                                 ,'True').otherwise('False'))

#create a family size column
df = df.withColumn('Family_size',(df['SibSp'] + df['Parch'] + 1))

# create a family id column
df = df.withColumn('Family_id',when(df['Family_size']>2,
                                   (concat(df['Surname'],
                                    df['Family_size']))).otherwise('None'))

In [454]:
for column in ['Title','Mother','Family_size','Family_id','Surname']:
    missing = df.where(df[column].isNull()).count()
    print("Missing values for %s : %s" % (column,missing))

Missing values for Title : 0
Missing values for Mother : 0
Missing values for Family_size : 0
Missing values for Family_id : 0
Missing values for Surname : 0


In [455]:
#drop columns 
df = df.drop('PassengerId','Ticket','Surname','Name','name_split')

In [471]:
#split back into train and test 
train = df.where(df['Mark']=='train')
test  = df.where(df['Mark']=='test')

#append 'Survived' back on training data
# since there is no common column between these two dataframes add row_index so that it can be joined
train=train.withColumn('row_index', F.monotonically_increasing_id())
survived = df_train.select('Survived')
survived = survived.withColumn('row_index', F.monotonically_increasing_id())
train = train.join(survived, on=["row_index"]).sort("row_index").drop("row_index")

In [481]:
def split_on_column_types(df):
    """
    Create array of numeric and string
    
    """
    
    string = []
    numeric = []
    
    for col in df.schema.fields:
        x = "%s" % col.dataType
        if x == "StringType":
            string.append(col.name)
        else:
            numeric.append(col.name)
            
            
    return string,numeric

categorical,numeric = split_on_column_types(train)
numeric.remove('Pclass')
categorical.remove('Mark')
categorical.extend(['Pclass','Survived'])
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in categorical]
encoders = [OneHotEncoder(inputCol=column+"_index",outputCol=column+"_vec") for column in categorical]

In [482]:
index_categorical = [x + "_index" for x in categorical]
all_columns = index_categorical + numeric
assembler = VectorAssembler(inputCols=all_columns,outputCol='features')

In [483]:
rf = RandomForestClassifier(labelCol="Survived_index",
                            featuresCol="features",
                            numTrees=100,
                            maxBins=97
                           )

In [484]:
labelConverter = IndexToString(inputCol='prediction',
                               outputCol='predictedLabel')

In [485]:
stages = indexers + [assembler,rf]
pipeline = Pipeline(stages = stages)
df_indexed = pipeline.fit(train).transform(train)
df_indexed.schema.fields

[StructField(Pclass,IntegerType,true),
 StructField(Sex,StringType,false),
 StructField(Age,DoubleType,false),
 StructField(SibSp,IntegerType,true),
 StructField(Parch,IntegerType,true),
 StructField(Fare,DoubleType,false),
 StructField(Embarked,StringType,false),
 StructField(Mark,StringType,false),
 StructField(Title,StringType,true),
 StructField(Mother,StringType,false),
 StructField(Family_size,IntegerType,true),
 StructField(Family_id,StringType,true),
 StructField(Survived,IntegerType,true),
 StructField(Sex_index,DoubleType,true),
 StructField(Embarked_index,DoubleType,true),
 StructField(Title_index,DoubleType,true),
 StructField(Mother_index,DoubleType,true),
 StructField(Family_id_index,DoubleType,true),
 StructField(Pclass_index,DoubleType,true),
 StructField(Survived_index,DoubleType,true),
 StructField(features,VectorUDT,true),
 StructField(rawPrediction,VectorUDT,true),
 StructField(probability,VectorUDT,true),
 StructField(prediction,DoubleType,true)]

In [40]:
# #scale numeric columns
# from pyspark.ml.feature import StandardScaler
# scalers = [StandardScaler(inputCol=column, outputCol=column+"_index"
#                          ,withStd=False,withMean=False
#                          ).fit(df) for column in numeric]


In [541]:
sc.stop()