In [115]:
import pyspark
import findspark
import time
import os.path
from itertools import chain
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler, OneHotEncoder, SQLTransformer
from pyspark.ml.linalg import DenseVector
from pyspark.ml.linalg import Vectors
from pyspark.sql import SQLContext, Row
from pyspark.sql import functions as F
from pyspark.sql.functions import concat,translate,lit,col,isnan,count,when,split,explode,ltrim,create_map

In [116]:
#initialise spark
findspark.init()
sc = pyspark.SparkContext(appName='Classifier')
sql = pyspark.SQLContext(sc)

In [117]:
def create_dataframes(directory):
    """
    Creates dataframes from directory
    Must be named 'train' or 'test'. 
    Returns only train if test N/A
    
    Inputs: String 
    
    Returns: Dataframes/Dataframe
    
    """
    if os.path.exists(directory):
        train = directory+"/train.csv"
        if os.path.exists(train):
            df_train = sql.read.csv(train, 
                         header = True,
                         inferSchema = True)
        else:
            raise ValueError("train.csv not found in %s" % directory)
        
        test = directory+"/test.csv"
        if os.path.exists(test):
            df_test = sql.read.csv(train, 
                         header = True,
                         inferSchema = True)
            return df_train,df_test
        
        return df_train
        
    else:
        raise ValueError("%s does not exist" % directory)   

In [118]:
df_train,df_test= create_dataframes('./data')

In [119]:
# combine train and test
df_train = df_train.withColumn('Mark',lit('train'))
df_test  = df_test.withColumn('Mark',lit('test'))
df = df_train.unionAll(df_test)

In [127]:
#missing values by column
for column in df.columns:
    missing = df.where(df[column].isNull()).count()
    print("Missing values for %s : %s" % (column,missing))

Missing values for PassengerId : 0
Missing values for Survived : 0
Missing values for Pclass : 0
Missing values for Name : 0
Missing values for Sex : 0
Missing values for Age : 0
Missing values for SibSp : 0
Missing values for Parch : 0
Missing values for Ticket : 0
Missing values for Fare : 0
Missing values for Embarked : 0
Missing values for Mark : 0


In [121]:
#cabin has a high amount of missing values so I will remove it 
df = df.drop('Cabin')

In [128]:
#fill missing values with the mean
def fill_null_with_mean(df):
    """
    Replaces null numeric values with
    mean value
    Replaces categorical string values
    with mode
    input: spark dataframe
    returns: spark dataframe
    
    """
    
    x = df.cache()
    
    for column in df.schema.fields:
        dtype = "%s" % column.dataType
        if dtype != "StringType":
            mean = df.groupBy().mean(column.name).first()[0]
            x = x.na.fill({column.name:mean})
        else:
            counts = df.groupBy(column.name).count()
            mode = counts.join(
            counts.agg(F.max("count").alias("max_")),
            col("count") == col("max_")
            ).limit(1).select(column.name)
            x = x.na.fill({column.name:mode.first()[0]})     
    return x

df = fill_null_with_mean(df)

In [129]:
#Title cleanse 
df = df.withColumn('Surname',split('Name',', ')[0])
df = df.withColumn('name_split',split('Name',', ')[1])
df = df.withColumn('Title',ltrim(split('name_split','. ')[0]))
df = df.drop('name_split')
title_dictionary = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Sir",
    "Don":        "Sir",
    "Sir" :       "Sir",
    "Dr":         "Mr",
    "Rev":        "Mr",
    "the Countess":"Lady",
    "Dona":       "Lady",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Lady"
}

#x = df['Title'].map(Title_Dictionary)
mapping_expr = create_map([lit(x) for x in chain(*title_dictionary.items())])

df = df.withColumn("Title", mapping_expr.getItem(col("Title")))

In [130]:
# create binary column 'Mother'
df = df.withColumn('Mother',when((df['Sex'] =='female')&
                                (df['Age'] > 18)&
                                (df['Parch'] > 0)
                                 ,'Mother').otherwise('null'))

#create a family size column
df = df.withColumn('Family_size',(df['SibSp'] + df['Parch'] + 1))

# create a family id column
df = df.withColumn('Family_id',when(df['Family_size']>2,
                                   (concat(df['Surname'],
                                    df['Family_size']))).otherwise('null'))

In [131]:
df.select('Title','Mother','Family_size','Family_id').show(10)

+------+------+-----------+---------+
| Title|Mother|Family_size|Family_id|
+------+------+-----------+---------+
|    Mr|  null|          2|     null|
|   Mrs|  null|          2|     null|
|  Miss|  null|          1|     null|
|   Mrs|  null|          2|     null|
|    Mr|  null|          1|     null|
|    Mr|  null|          1|     null|
|    Mr|  null|          1|     null|
|Master|  null|          5| Palsson5|
|   Mrs|Mother|          3| Johnson3|
|   Mrs|  null|          2|     null|
+------+------+-----------+---------+
only showing top 10 rows



In [133]:
#drop columns 
df = df.drop('PassengerId','Ticket','Surname')

In [135]:
print(df.columns)

['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Mark', 'Title', 'Mother', 'Family_size', 'Family_id']


In [195]:
def split_on_column_types(df):
    """
    Create array of numeric and string
    
    """
    
    string = []
    numeric = []
    
    for col in df.schema.fields:
        x = "%s" % col.dataType
        if x == "StringType":
            string.append(col.name)
        else:
            numeric.append(col.name)
            
            
    return string,numeric

categorical,numeric = split_on_column_types(df)
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in categorical]
encoders = [OneHotEncoder(inputCol=column+"_index",outputCol=column+"_vec") for column in categorical]

In [200]:
indexers.extend(encoders)
print(indexers)

[StringIndexer_414aa69545b37ad78400, StringIndexer_4c0094e49443a1d9fe42, StringIndexer_46c28b2da26c0ba06928, StringIndexer_4295adafd12a1dcf5516, StringIndexer_414f9e1760e230db70f8, StringIndexer_4fd8b8d9566eeb3a9faf, StringIndexer_45e3bc2a297e0a3726ab, OneHotEncoder_4cf18fc8f3eb1288e92e, OneHotEncoder_4f21a1db5489811826fa, OneHotEncoder_461ba57f94e3fd7a0cbf, OneHotEncoder_40a1b9c5161d8d1dc48f, OneHotEncoder_4eb9858eafe645703a53, OneHotEncoder_409e98159d523315de7e, OneHotEncoder_42dab6cb7adefbf0f5fc]


In [201]:
#scale numeric columns
from pyspark.ml.feature import StandardScaler
scalers = [StandardScaler(inputCol=column, outputCol=column+"_index"
                         ,withStd=False,withMean=False
                         ).fit(df) for column in numeric]


IllegalArgumentException: 'requirement failed: Column Survived must be of type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 but was actually IntegerType.'

In [193]:
df.show(2)

+--------+------+--------------------+------+----+-----+-----+-------+--------+-----+-----+------+-----------+---------+
|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|   Fare|Embarked| Mark|Title|Mother|Family_size|Family_id|
+--------+------+--------------------+------+----+-----+-----+-------+--------+-----+-----+------+-----------+---------+
|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|   7.25|       S|train|   Mr|  null|          2|     null|
|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|71.2833|       C|train|  Mrs|  null|          2|     null|
+--------+------+--------------------+------+----+-----+-----+-------+--------+-----+-----+------+-----------+---------+
only showing top 2 rows



In [202]:
pipeline = Pipeline(stages = indexers)
df_indexed = pipeline.fit(df).transform(df)
from functools import reduce
from pyspark.sql import DataFrame
df_vec=reduce(DataFrame.drop,categorical,df_indexed)

In [203]:
df_vec.show(2)

+--------+------+----+-----+-----+-------+-----------+----------+---------+--------------+----------+-----------+------------+---------------+-----------------+-------------+-------------+-------------+-------------+-------------+--------------+
|Survived|Pclass| Age|SibSp|Parch|   Fare|Family_size|Name_index|Sex_index|Embarked_index|Mark_index|Title_index|Mother_index|Family_id_index|         Name_vec|      Sex_vec| Embarked_vec|     Mark_vec|    Title_vec|   Mother_vec| Family_id_vec|
+--------+------+----+-----+-----+-------+-----------+----------+---------+--------------+----------+-----------+------------+---------------+-----------------+-------------+-------------+-------------+-------------+-------------+--------------+
|       0|     3|22.0|    1|    0|   7.25|          2|     329.0|      0.0|           0.0|       0.0|        0.0|         0.0|            0.0|(890,[329],[1.0])|(1,[0],[1.0])|(2,[0],[1.0])|(1,[0],[1.0])|(6,[0],[1.0])|(1,[0],[1.0])|(95,[0],[1.0])|
|       1|     1

In [114]:
sc.stop()