In [1]:
import pandas as pd

In [2]:
sc

# Predicting infant survival

Read the csv file:

In [3]:
finalBirthsPath = './data/births_final.csv'

In [4]:
births_final = spark.read.csv(finalBirthsPath, sep = ',', header = True, inferSchema = True)

In [5]:
births_final.show(5)

+----------------------+----------------+-------------------+---------+----------------+-----------------+------------+-------------+------------+-------------+------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
|INFANT_ALIVE_AT_REPORT|MOTHER_AGE_YEARS|FATHER_COMBINED_AGE|CIG_1_TRI|MOTHER_HEIGHT_IN|MOTHER_PRE_WEIGHT|DIABETES_PRE|DIABETES_GEST|HYP_TENS_PRE|HYP_TENS_GEST|PREV_BIRTH_PRETERM|BIRTH_PLACE_6|BIRTH_PLACE_3|BIRTH_PLACE_5|BIRTH_PLACE_9|BIRTH_PLACE_4|BIRTH_PLACE_7|BIRTH_PLACE_2|
+----------------------+----------------+-------------------+---------+----------------+-----------------+------------+-------------+------------+-------------+------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
|                     0|              29|                 99|        0|              99|              999|           0|            0|           0|            0|      

In [6]:
births_final.printSchema()

root
 |-- INFANT_ALIVE_AT_REPORT: integer (nullable = true)
 |-- MOTHER_AGE_YEARS: integer (nullable = true)
 |-- FATHER_COMBINED_AGE: integer (nullable = true)
 |-- CIG_1_TRI: integer (nullable = true)
 |-- MOTHER_HEIGHT_IN: integer (nullable = true)
 |-- MOTHER_PRE_WEIGHT: integer (nullable = true)
 |-- DIABETES_PRE: integer (nullable = true)
 |-- DIABETES_GEST: integer (nullable = true)
 |-- HYP_TENS_PRE: integer (nullable = true)
 |-- HYP_TENS_GEST: integer (nullable = true)
 |-- PREV_BIRTH_PRETERM: integer (nullable = true)
 |-- BIRTH_PLACE_6: integer (nullable = true)
 |-- BIRTH_PLACE_3: integer (nullable = true)
 |-- BIRTH_PLACE_5: integer (nullable = true)
 |-- BIRTH_PLACE_9: integer (nullable = true)
 |-- BIRTH_PLACE_4: integer (nullable = true)
 |-- BIRTH_PLACE_7: integer (nullable = true)
 |-- BIRTH_PLACE_2: integer (nullable = true)



## Create Dense Vector 

In [7]:
import pyspark.mllib.linalg as ln
import pyspark.mllib.feature as ft
import pyspark.mllib.regression as reg

In [8]:
# Define the `input_data` 
births_final = births_final.rdd.map(lambda x: reg.LabeledPoint(x[0], ln.Vectors.dense(x[1:])))

## Splitting into training and test sets


- Training set: 70%;
- Test set: 30%.

In [9]:
births_train, births_test = births_final.randomSplit([0.7, 0.3], seed = 42)

## Predicting Infant Survival


 In this section, we will build two models: a linear classifier — the logistic regression, and a non-linear one— a random forest.

- Logistic regression:

In [10]:
from pyspark.mllib.classification\
    import LogisticRegressionWithLBFGS

In [11]:
LR_Model = LogisticRegressionWithLBFGS \
 .train(births_train, iterations=20)

Having trained the model using the births_train dataset, let's use the model to
predict the classes for our testing set:

In [12]:
LR_results = (
 births_test.map(lambda row: row.label) \
 .zip(LR_Model \
 .predict(births_test\
 .map(lambda row: row.features)))
 ).map(lambda row: (row[0], row[1] * 1.0))

The preceding snippet creates an RDD where each element is a tuple, with the first
element being the actual label and the second one, the model's prediction.

MLlib provides an evaluation metric for classification and regression. Let's check
how well or how bad our model performed:

In [13]:
LR_results.take(5)

[(0.0, 1.0), (0.0, 0.0), (0.0, 1.0), (0.0, 1.0), (0.0, 1.0)]

In [14]:
import pyspark.mllib.evaluation as ev

In [15]:
LR_evaluation = ev.BinaryClassificationMetrics(LR_results)

In [16]:
print('Area under PR: {0:.2f}'\
      .format(LR_evaluation.areaUnderPR))
print('Area under ROC: {0:.2f}'\
     .format(LR_evaluation.areaUnderROC))

Area under PR: 0.79
Area under ROC: 0.62


Other evaluation metrics (After creating a Pandas DataFrame):

In [17]:
y_results = pd.DataFrame(LR_results.collect(),\
             columns=['y_test', 'y_pred'])

In [18]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

Accuracy, F1-score, Classification report and Confusion matrix:

In [19]:
print('Accuracy score: {}\nF1 score: {}\n\n'\
      .format(accuracy_score(y_results['y_test'], y_results['y_pred']),
             f1_score(y_results['y_test'], y_results['y_pred'])))
print(classification_report(y_results['y_test'],\
                            y_results['y_pred']))
print('Confusion matrix:')
print(confusion_matrix(y_results['y_test'],\
                            y_results['y_pred']))

Accuracy score: 0.6091277143908723
F1 score: 0.6797732481003499


              precision    recall  f1-score   support

         0.0       0.67      0.40      0.50      6635
         1.0       0.59      0.81      0.68      6950

   micro avg       0.61      0.61      0.61     13585
   macro avg       0.63      0.60      0.59     13585
weighted avg       0.63      0.61      0.59     13585

Confusion matrix:
[[2639 3996]
 [1314 5636]]


- Random Forest:

In [20]:
from pyspark.mllib.tree import RandomForest

In [27]:
RF_model = RandomForest\
    .trainClassifier(data=births_train,
                    numClasses=2,
                    categoricalFeaturesInfo={},
                    numTrees=5,
                    featureSubsetStrategy='all',
                    seed=42)

In [28]:
RF_results = (
    births_test.map(lambda row: row.label) \
    .zip(RF_model \
    .predict(births_test\
    .map(lambda row: row.features)))
    ).map(lambda row: (row[0], row[1] * 1.0))

In [29]:
RF_evaluation = ev.BinaryClassificationMetrics(RF_results)

In [30]:
print('Area under PR: {0:.2f}' \
 .format(RF_evaluation.areaUnderPR))
print('Area under ROC: {0:.2f}' \
 .format(RF_evaluation.areaUnderROC))

Area under PR: 0.77
Area under ROC: 0.61


Other metrics (<i>sklearn</i>):

In [31]:
y_results_rf = pd.DataFrame(RF_results.collect(),\
             columns=['y_test', 'y_pred'])

In [32]:
print('Accuracy score: {}\nF1 score: {}\n\n'\
      .format(accuracy_score(y_results_rf['y_test'], y_results_rf['y_pred']),
             f1_score(y_results_rf['y_test'], y_results_rf['y_pred'])))
print(classification_report(y_results_rf['y_test'],\
                            y_results_rf['y_pred']))
print('Confusion matrix:')
print(confusion_matrix(y_results_rf['y_test'],\
                            y_results_rf['y_pred']))

Accuracy score: 0.5964666912035334
F1 score: 0.6574606348412897


              precision    recall  f1-score   support

         0.0       0.67      0.41      0.51      6929
         1.0       0.56      0.79      0.66      6656

   micro avg       0.60      0.60      0.60     13585
   macro avg       0.62      0.60      0.58     13585
weighted avg       0.62      0.60      0.58     13585

Confusion matrix:
[[2842 4087]
 [1395 5261]]
