### Dataset 'titanic.csv'
### Requirement:
* Read data
* Pre-process data.
* Using the columns 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', build a model (using a Pipeline) to predict whether a passenger on the Titanic survived ('Survived') or not
* Evaluate this model

In [None]:
# findspark.init() is called before the pyspark imports in the following cell.
# NOTE(review): presumably this makes the local Spark installation importable —
# confirm against the findspark documentation.
import findspark
findspark.init()

In [None]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

from pyspark.sql.functions import corr

from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Create the Spark session for this notebook (or reuse an existing one).
spark = (
    SparkSession.builder
    .appName('titanic')
    .getOrCreate()
)

In [None]:
# Load data.
# header=True: the first CSV row supplies the column names ('Survived',
# 'Pclass', 'Sex', ...) that later cells select by name. With header=False
# the columns would be named _c0, _c1, ... and the header row would be read
# as a data record.
df = spark.read.csv("../../Data/titanic.csv", header=True, inferSchema=True)

In [None]:
# Number of rows in the loaded DataFrame (displayed as the cell output).
df.count()

In [None]:
# Print each column with the type Spark assigned to it
# (the load requested inferSchema=True).
df.printSchema()

In [None]:
# List the column names. The DataFrame was loaded into `df`;
# no variable called `data` exists in this notebook.
df.columns

In [None]:
# Keep only the label ('Survived') and the columns used as predictors.
# Selects from `df`, the DataFrame loaded earlier (`data` was never defined).
my_cols = df.select([
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
])

In [None]:
# Discard every row that has a null in any of the selected columns.
my_final_data = my_cols.na.drop(how='any')

### Working with Categorical Columns

In [None]:
# 'Sex' -> category index ('SexIndex') -> one-hot vector ('SexVec').
gender_indexer = StringIndexer(
    inputCol='Sex',
    outputCol='SexIndex',
)
gender_encoder = OneHotEncoder(
    inputCol='SexIndex',
    outputCol='SexVec',
)

In [None]:
# 'Embarked' -> category index ('EmbarkIndex') -> one-hot vector ('EmbarkVec').
embark_indexer = StringIndexer(
    inputCol='Embarked',
    outputCol='EmbarkIndex',
)
embark_encoder = OneHotEncoder(
    inputCol='EmbarkIndex',
    outputCol='EmbarkVec',
)

In [None]:
# Combine the predictor columns into a single 'features' vector.
# 'Survived' is the label and is deliberately NOT an input here: including
# it would leak the target into the features and make the evaluation
# meaningless.
assembler = VectorAssembler(
    inputCols=['Pclass', 'SexVec', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkVec'],
    outputCol='features',
)

### Pipelines

In [None]:
# Import class for creating a pipeline
from pyspark.ml import Pipeline

In [None]:
# Logistic regression on the assembled 'features' vector, predicting 'Survived'.
# The keyword is `featuresCol` (plural); `featureCol` is not a parameter of
# LogisticRegression and is rejected when the estimator is constructed.
log_reg_titanic = LogisticRegression(featuresCol='features', labelCol='Survived')

In [None]:
# Chain the preprocessing stages and the classifier; stages run in list order.
pipeline = Pipeline(
    stages=[
        gender_indexer,
        embark_indexer,
        gender_encoder,
        embark_encoder,
        assembler,
        log_reg_titanic,
    ]
)

In [None]:
# Random split with weights 0.7 (train) and 0.3 (test); no seed is set.
(train_titanic_data,
 test_titanic_data) = my_final_data.randomSplit([0.7, 0.3])

In [None]:
# Fit every pipeline stage on the training split.
fit_model = pipeline.fit(dataset=train_titanic_data)

In [None]:
# Apply the fitted pipeline to the held-out test split.
results = fit_model.transform(dataset=test_titanic_data)

In [None]:
# Evaluator for the binary 'Survived' label.
# Score on the model's 'rawPrediction' column (continuous scores) rather than
# the thresholded 0/1 'prediction' column: an ROC curve built from hard labels
# has a single operating point and misrepresents the model's ranking quality.
my_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='Survived')

In [None]:
# Show actual vs. predicted labels. The transformed test set is stored in
# `results`; `result` (singular) is not defined anywhere in this notebook.
results.select('Survived', 'prediction').show()

In [None]:
# Evaluate the predictions on the test set. The transformed test set is
# stored in `results`; `result` (singular) is not defined in this notebook.
AUC = my_eval.evaluate(results)
AUC