In [288]:
import pyspark
import findspark
import time
import os.path
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler, OneHotEncoder, SQLTransformer
from pyspark.ml.linalg import DenseVector
from pyspark.ml.linalg import Vectors
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import translate,lit,col,isnan,count,when,split,explode,ltrim

In [212]:
#initialise spark
findspark.init()
sc = pyspark.SparkContext(appName='Classifier')
sql = pyspark.SQLContext(sc)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=Classifier, master=local[*]) created by __init__ at <ipython-input-158-8f4850d1539f>:3 

In [213]:
def create_dataframes(directory):
    """
    Creates dataframes from directory
    Must be named 'train' or 'test'. 
    Returns only train if test N/A
    
    Inputs: String 
    
    Returns: Dataframes/Dataframe
    
    """
    if os.path.exists(directory):
        train = directory+"/train.csv"
        if os.path.exists(train):
            df_train = sql.read.csv(train, 
                         header = True,
                         inferSchema = True)
        else:
            raise ValueError("train.csv not found in %s" % directory)
        
        test = directory+"/test.csv"
        if os.path.exists(test):
            df_test = sql.read.csv(train, 
                         header = True,
                         inferSchema = True)
            return df_train,df_test
        
        return df_train
        
    else:
        raise ValueError("%s does not exist" % directory)   

In [214]:
df_train,df_test= create_dataframes('./data')

In [215]:
# combine train and test
df_train = df_train.withColumn('Mark',lit('train'))
df_test  = df_test.withColumn('Mark',lit('test'))
df = df_train.unionAll(df_test)

In [216]:
#missing values by column
for col in df.columns:
    missing = df.where(df[col].isNull()).count()
    print("Missing values for %s : %s" % (col,missing))

Missing values for PassengerId : 0
Missing values for Survived : 0
Missing values for Pclass : 0
Missing values for Name : 0
Missing values for Sex : 0
Missing values for Age : 354
Missing values for SibSp : 0
Missing values for Parch : 0
Missing values for Ticket : 0
Missing values for Fare : 0
Missing values for Cabin : 1374
Missing values for Embarked : 4
Missing values for Mark : 0


In [217]:
#cabin has a high amount of missing values so I will remove it 
df = df.drop('Cabin')

In [218]:
#fill missing values with the mean
def fill_null_with_mean(df):
    """
    Replaces null numeric values with
    mean value
    
    input: spark dataframe
    returns: spark dataframe
    
    """
    
    x = df.cache()
    
    for col in df.schema.fields:
        dtype = "%s" % col.dataType
        if dtype != "StringType":
            mean = df.groupBy().mean(col.name).first()[0]
            x = x.na.fill({col.name:mean})
            
    return x

df = fill_null_with_mean(df)

In [219]:
#replace categorical with mode

In [289]:
#Title cleanse 
df = df.withColumn('name_split',split('Name',', ')[1])
df= df.withColumn('Title',ltrim(split('name_split','. ')[0]))
df.drop('name_split')
Title_Dictionary = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Sir",
    "Don":        "Sir",
    "Sir" :       "Sir",
    "Dr":         "Mr",
    "Rev":        "Mr",
    "the Countess":"Lady",
    "Dona":       "Lady",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Lady"
}

#x = df['Title'].map(Title_Dictionary)
df.withColumn('test',translate(Title_Dictionary)('Title'))

TypeError: translate() missing 2 required positional arguments: 'matching' and 'replace'

In [283]:
x.select('Title').show()

+------+
| Title|
+------+
|    Mr|
|   Mrs|
|  Miss|
|   Mrs|
|    Mr|
|    Mr|
|    Mr|
|Master|
|   Mrs|
|   Mrs|
|  Miss|
|  Miss|
|    Mr|
|    Mr|
|  Miss|
|   Mrs|
|Master|
|    Mr|
|   Mrs|
|   Mrs|
+------+
only showing top 20 rows



In [279]:
x.show()

+-----------+--------+------+--------------------+------+-----------------+-----+-----+----------------+-------+--------+-----+--------------------+------+
|PassengerId|Survived|Pclass|                Name|   Sex|              Age|SibSp|Parch|          Ticket|   Fare|Embarked| Mark|          name_split| Title|
+-----------+--------+------+--------------------+------+-----------------+-----+-----+----------------+-------+--------+-----+--------------------+------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|             22.0|    1|    0|       A/5 21171|   7.25|       S|train|     Mr. Owen Harris|    Mr|
|          2|       1|     1|Cumings, Mrs. Joh...|female|             38.0|    1|    0|        PC 17599|71.2833|       C|train|Mrs. John Bradley...|   Mrs|
|          3|       1|     3|Heikkinen, Miss. ...|female|             26.0|    0|    0|STON/O2. 3101282|  7.925|       S|train|         Miss. Laina|  Miss|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|       

In [156]:
sc.stop()