# Logistic Regression - Pipeline

  <strong>Requirement:</strong>
  - Read data
  - Pre-process data
  - Using the columns 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', build a model (using a Pipeline) to predict whether a passenger on the Titanic survived or not
  - Evaluate this model

In [None]:
import findspark
# Called before the pyspark imports in the following cells; presumably this
# makes the local Spark installation importable - confirm against findspark docs.
findspark.init()

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('myproj')
    .getOrCreate()
)

In [None]:
# Load the Titanic passenger CSV: the first row supplies the column names and
# the column types are inferred from the data.
data = spark.read.csv(
    'titanic.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Number of rows loaded from the CSV.
data.count()

891

In [None]:
# Print each column's name, inferred type and nullability.
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [None]:
# List the column names available for selection.
data.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [None]:
# Keep only the label ('Survived') and the columns used as predictors.
my_cols = data.select(
    'Survived', 'Pclass', 'Sex', 'Age',
    'SibSp', 'Parch', 'Fare', 'Embarked',
)

In [None]:
# Discard every row that has a missing value in any of the selected columns.
my_final_data = my_cols.dropna()

### Working with Categorical Columns
Let's break this down into multiple steps to make it all clear.

In [None]:
from pyspark.ml.feature import (VectorAssembler, VectorIndexer,
                                OneHotEncoder, StringIndexer)

In [None]:
# 'Sex' is a string column: index it numerically, then one-hot encode the index.
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')

In [None]:
# Same two-step encoding for the string column 'Embarked'.
embarked_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkedIndex')
embarked_encoder = OneHotEncoder(inputCol='EmbarkedIndex', outputCol='EmbarkedVec')

In [None]:
# Combine the numeric columns and the two encoded vectors into a single
# 'features' vector column.
assembler = VectorAssembler(
    inputCols=[
        'Pclass', 'SexVec', 'Age', 'SibSp',
        'Parch', 'Fare', 'EmbarkedVec',
    ],
    outputCol='features',
)

In [None]:
from pyspark.ml.classification import LogisticRegression



```
# This is formatted as code
```

### Pipeline
Let's see an example of how to use pipelines (we'll get a lot more practice with these later!)

In [None]:
from pyspark.ml import Pipeline

In [None]:
# Classifier that reads the assembled 'features' column and uses 'Survived'
# as the label.
log_reg_titanic = LogisticRegression(
    labelCol='Survived',
    featuresCol='features',
)

In [None]:
# Chain the stages in execution order: index the string columns, one-hot
# encode the indices, assemble the feature vector, then the classifier.
pipeline = Pipeline(stages=[
    gender_indexer,
    embarked_indexer,
    gender_encoder,
    embarked_encoder,
    assembler,
    log_reg_titanic,
])

In [None]:
# 70/30 train/test split. A fixed seed keeps the split, and therefore the
# metric computed on the test set, repeatable across notebook runs.
train_titanic_data, test_titanic_data = my_final_data.randomSplit(
    [0.7, 0.3], seed=42)

In [None]:
# Fit the whole pipeline on the training split only.
fit_model = pipeline.fit(train_titanic_data)

In [None]:
# Apply the fitted pipeline to the held-out test split; the result includes
# the 'prediction' column inspected in a later cell.
results = fit_model.transform(test_titanic_data)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Evaluate on the model's continuous scores ('rawPrediction', the column
# LogisticRegression emits by default) rather than the thresholded 0/1
# 'prediction' column: hard labels collapse the ROC curve to a single
# operating point and misstate the area under it.
my_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                        labelCol='Survived',
                                        metricName='areaUnderROC')

In [None]:
# Compare the true label with the predicted class for the first rows.
results.select('Survived', 'prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [None]:
# Score the test-set predictions with the evaluator's metric
# (areaUnderROC unless metricName is overridden, per the PySpark docs).
AUC = my_eval.evaluate(results)

In [None]:
# Display the metric value computed on the test split.
AUC

0.7850388382303276