In [1]:
import findspark
findspark.init("/Users/jean/spark-2.4.4-bin-hadoop2.7")
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('titanic').getOrCreate()

In [2]:
df = spark.read.csv("titanic.csv", inferSchema=True, header=True)

In [3]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [4]:
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [5]:
my_cols = df.select(['Survived','Pclass','Age','SibSp','Parch','Fare', 'Embarked', 'Sex'])

In [6]:
final_data = my_cols.na.drop()

In [7]:
from pyspark.ml.feature import (VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer)

In [8]:
gendex_indexer = StringIndexer(inputCol="Sex", outputCol="Sex_index")
gender_enconder = OneHotEncoder(inputCol="Sex_index", outputCol="Sex_vect")

In [9]:
embark_indexer = StringIndexer(inputCol="Embarked", outputCol="Embarked_index")
embark_enconder = OneHotEncoder(inputCol="Embarked_index", outputCol="Embarked_vect")

In [10]:
assembler = (VectorAssembler(inputCols=['Pclass','Age','SibSp','Parch','Fare', 'Embarked_vect', "Sex_vect"],
                             outputCol="features"))

In [11]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.pipeline import Pipeline

In [12]:
log = LogisticRegression(featuresCol="features", labelCol="Survived")

In [13]:
pipeline = (Pipeline(stages=[gendex_indexer, embark_indexer,
                             gender_enconder, embark_enconder,
                             assembler, log]))

In [14]:
train_data, test_data = final_data.randomSplit([0.7,0.3])

In [15]:
fit_model = pipeline.fit(train_data)

In [16]:
results = fit_model.transform(test_data)

In [17]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [18]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="Survived")

In [19]:
my_eval.evaluate(results)

0.7911585365853658

In [20]:
train_data.columns

['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Sex']

In [21]:
test_data.columns

['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Sex']