In [1]:
sc

# Introducing MLlib

Even though MLlib (the RDD-based `pyspark.mllib` package) is currently in maintenance mode
and is not actively being developed, it is still good to know how to use it. Also, for now
it is the only package available to train models while streaming data.

The MLlib package in Spark operates strictly on RDDs.

In [2]:
import pyspark.sql.types as typ
import pyspark.sql.functions as fn

In [3]:
# Relative path of the gzip-compressed births CSV that is read with spark.read.csv below.
birthFilePath = './data/births_train.csv.gz'

In [4]:
# (column name, Spark SQL type) pairs describing the births CSV.
# The list is consumed positionally when the read schema is built, so the order
# of the entries matters and must not be rearranged.
labels = [
    ('INFANT_ALIVE_AT_REPORT', typ.StringType()),
    ('BIRTH_YEAR', typ.IntegerType()),
    ('BIRTH_MONTH', typ.IntegerType()),
    ('BIRTH_PLACE', typ.StringType()),
    ('MOTHER_AGE_YEARS', typ.IntegerType()),
    ('MOTHER_RACE_6CODE', typ.StringType()),
    ('MOTHER_EDUCATION', typ.StringType()),
    ('FATHER_COMBINED_AGE', typ.IntegerType()),
    ('FATHER_EDUCATION', typ.StringType()),
    ('MONTH_PRECARE_RECODE', typ.StringType()),
    ('CIG_BEFORE', typ.IntegerType()),
    ('CIG_1_TRI', typ.IntegerType()),
    ('CIG_2_TRI', typ.IntegerType()),
    ('CIG_3_TRI', typ.IntegerType()),
    ('MOTHER_HEIGHT_IN', typ.IntegerType()),
    ('MOTHER_BMI_RECODE', typ.IntegerType()),
    ('MOTHER_PRE_WEIGHT', typ.IntegerType()),
    ('MOTHER_DELIVERY_WEIGHT', typ.IntegerType()),
    ('MOTHER_WEIGHT_GAIN', typ.IntegerType()),
    ('DIABETES_PRE', typ.StringType()),
    ('DIABETES_GEST', typ.StringType()),
    ('HYP_TENS_PRE', typ.StringType()),
    ('HYP_TENS_GEST', typ.StringType()),
    ('PREV_BIRTH_PRETERM', typ.StringType()),
    ('NO_RISK', typ.StringType()),
    ('NO_INFECTIONS_REPORTED', typ.StringType()),
    ('LABOR_IND', typ.StringType()),
    ('LABOR_AUGM', typ.StringType()),
    ('STEROIDS', typ.StringType()),
    ('ANTIBIOTICS', typ.StringType()),
    ('ANESTHESIA', typ.StringType()),
    ('DELIV_METHOD_RECODE_COMB', typ.StringType()),
    ('ATTENDANT_BIRTH', typ.StringType()),
    ('APGAR_5', typ.IntegerType()),
    ('APGAR_5_RECODE', typ.StringType()),
    ('APGAR_10', typ.IntegerType()),
    ('APGAR_10_RECODE', typ.StringType()),
    ('INFANT_SEX', typ.StringType()),
    ('OBSTETRIC_GESTATION_WEEKS', typ.IntegerType()),
    ('INFANT_WEIGHT_GRAMS', typ.IntegerType()),
    ('INFANT_ASSIST_VENTI', typ.StringType()),
    ('INFANT_ASSIST_VENTI_6HRS', typ.StringType()),
    ('INFANT_NICU_ADMISSION', typ.StringType()),
    ('INFANT_SURFACANT', typ.StringType()),
    ('INFANT_ANTIBIOTICS', typ.StringType()),
    ('INFANT_SEIZURES', typ.StringType()),
    ('INFANT_NO_ABNORMALITIES', typ.StringType()),
    ('INFANT_ANCEPHALY', typ.StringType()),
    ('INFANT_MENINGOMYELOCELE', typ.StringType()),
    ('INFANT_LIMB_REDUCTION', typ.StringType()),
    ('INFANT_DOWN_SYNDROME', typ.StringType()),
    ('INFANT_SUSPECTED_CHROMOSOMAL_DISORDER', typ.StringType()),
    ('INFANT_NO_CONGENITAL_ANOMALIES_CHECKED', typ.StringType()),
    ('INFANT_BREASTFED', typ.StringType())
]

# Build the explicit read schema: one StructField per (name, type) pair in
# `labels`, each declared with nullable=False (the third argument).
schema = typ.StructType(
    [typ.StructField(name, dtype, False) for name, dtype in labels]
)

In [5]:
# Load the births CSV with the explicit schema; the first line of the file is a
# header row and is skipped rather than parsed as data.
# NOTE: the printSchema() output further down reports the columns that come
# straight from the file as `nullable = true`, so the nullable=False flag set in
# the schema is not retained after the read.
df_birth = (
    spark.read
    .schema(schema)
    .csv(birthFilePath, header=True)
)

In [6]:
# Lookup table for the Yes/No/Unknown flags: 'Y' maps to 1, while both 'N' and
# 'U' map to 0.
recode_dict = {'YNU': dict(Y=1, N=0, U=0)}

We will drop all of the features that relate to the infant and will
try to predict the infant's chances of surviving only based on the features related to
its mother, father, and the place of birth:

In [7]:
# Columns kept for modelling: the label, the place of birth, and the features
# describing the mother and the father. Grouped by theme; the concatenation
# order is the resulting column order.
selected_features = (
    ['INFANT_ALIVE_AT_REPORT', 'BIRTH_PLACE']
    + ['MOTHER_AGE_YEARS', 'FATHER_COMBINED_AGE']
    + ['CIG_BEFORE', 'CIG_1_TRI', 'CIG_2_TRI', 'CIG_3_TRI']
    + ['MOTHER_HEIGHT_IN', 'MOTHER_PRE_WEIGHT']
    + ['MOTHER_DELIVERY_WEIGHT', 'MOTHER_WEIGHT_GAIN']
    + ['DIABETES_PRE', 'DIABETES_GEST']
    + ['HYP_TENS_PRE', 'HYP_TENS_GEST']
    + ['PREV_BIRTH_PRETERM']
)
# Project the raw frame down to the selected columns only.
births_trimmed = df_birth.select(*selected_features)

Number of cigarettes smoked:
- 0 means the mother smoked no cigarettes before or during the pregnancy;
- a value between 1 and 97 is the actual number of cigarettes smoked;
- 98 indicates 98 or more.

A value of 99 marks the unknown; we will assume the unknown is 0 and recode accordingly.

In [8]:
# Replace every cigarette count above 98 with 0 and leave all other values
# untouched.
cig_columns = ['CIG_BEFORE', 'CIG_1_TRI', 'CIG_2_TRI', 'CIG_3_TRI']

for col in cig_columns:
    is_unknown = births_trimmed[col] > 98
    recoded = fn.when(is_unknown, fn.lit(0)).otherwise(births_trimmed[col])
    births_trimmed = births_trimmed.withColumn(col, recoded)

Now we will focus on correcting the Yes/No/Unknown features. First, we will
figure out which these are with the following snippet:

In [9]:
# Find the string columns that encode a Yes/No/Unknown flag.
#
# A column qualifies when *all* of its distinct non-null values are drawn from
# {'Y', 'N', 'U'}. Looking only at the first row returned by distinct() is not
# reliable: Spark gives no ordering guarantee for that result, so the answer
# could change from run to run, and a column holding other values as well could
# be picked up by accident.
ynu_values = {'Y', 'N', 'U'}

cols = [(col.name, col.dataType) for col in births_trimmed.schema]
YNU_cols = []

for name, dtype in cols:
    if dtype != typ.StringType():
        continue

    # A genuine Y/N/U column has at most len(ynu_values) + 1 distinct values
    # (the three codes plus null). Fetching one more row than that is enough to
    # prove a column is NOT a flag column, which keeps the collect() small even
    # for high-cardinality strings. One Spark job per column instead of three.
    rows = (
        births_trimmed
        .select(name)
        .distinct()
        .limit(len(ynu_values) + 2)
        .collect()
    )
    distinct_values = {row[0] for row in rows} - {None}

    # Non-empty guard: an all-null column is not a Y/N/U column.
    if distinct_values and distinct_values <= ynu_values:
        YNU_cols.append(name)

In [10]:
YNU_cols

['INFANT_ALIVE_AT_REPORT',
 'DIABETES_PRE',
 'DIABETES_GEST',
 'HYP_TENS_PRE',
 'HYP_TENS_GEST',
 'PREV_BIRTH_PRETERM']

In [11]:
# Turn each Yes/No/Unknown column into a 0/1 indicator: 'Y' becomes 1 and
# everything else falls through to 0.
for col in YNU_cols:
    is_yes = births_trimmed[col] == 'Y'
    indicator = fn.when(is_yes, fn.lit(1)).otherwise(fn.lit(0))
    births_trimmed = births_trimmed.withColumn(col, indicator)

In [12]:
births_trimmed.show(5)

+----------------------+-----------+----------------+-------------------+----------+---------+---------+---------+----------------+-----------------+----------------------+------------------+------------+-------------+------------+-------------+------------------+
|INFANT_ALIVE_AT_REPORT|BIRTH_PLACE|MOTHER_AGE_YEARS|FATHER_COMBINED_AGE|CIG_BEFORE|CIG_1_TRI|CIG_2_TRI|CIG_3_TRI|MOTHER_HEIGHT_IN|MOTHER_PRE_WEIGHT|MOTHER_DELIVERY_WEIGHT|MOTHER_WEIGHT_GAIN|DIABETES_PRE|DIABETES_GEST|HYP_TENS_PRE|HYP_TENS_GEST|PREV_BIRTH_PRETERM|
+----------------------+-----------+----------------+-------------------+----------+---------+---------+---------+----------------+-----------------+----------------------+------------------+------------+-------------+------------+-------------+------------------+
|                     0|          1|              29|                 99|         0|        0|        0|        0|              99|              999|                   999|                99|           0| 

## Exploratory Data Analysis

Numeric variables:

In [13]:
# Numeric features, grouped by theme (ages, cigarette counts, height/weight).
# The concatenation order is the column order used by describe() and by the
# correlation matrix.
numeric_cols = (
    ['MOTHER_AGE_YEARS', 'FATHER_COMBINED_AGE']
    + ['CIG_BEFORE', 'CIG_1_TRI', 'CIG_2_TRI', 'CIG_3_TRI']
    + ['MOTHER_HEIGHT_IN', 'MOTHER_PRE_WEIGHT']
    + ['MOTHER_DELIVERY_WEIGHT', 'MOTHER_WEIGHT_GAIN']
)

In [14]:
births_trimmed.select(numeric_cols).describe().show()

+-------+------------------+-------------------+-----------------+------------------+-----------------+------------------+------------------+------------------+----------------------+------------------+
|summary|  MOTHER_AGE_YEARS|FATHER_COMBINED_AGE|       CIG_BEFORE|         CIG_1_TRI|        CIG_2_TRI|         CIG_3_TRI|  MOTHER_HEIGHT_IN| MOTHER_PRE_WEIGHT|MOTHER_DELIVERY_WEIGHT|MOTHER_WEIGHT_GAIN|
+-------+------------------+-------------------+-----------------+------------------+-----------------+------------------+------------------+------------------+----------------------+------------------+
|  count|             45429|              45429|            45429|             45429|            45429|             45429|             45429|             45429|                 45429|             45429|
|   mean|28.298421713002707|  44.54975896453807|1.427986528428977|0.9057430275815008|0.702480794206344|0.5800259745977239|   65.120891941271|214.49840410310594|    223.62609786700125| 30.7

Categorical variables:

In [15]:
# Everything that is not listed as numeric is treated as categorical; the
# original column order of births_trimmed is preserved.
categorical_cols = [
    name for name in births_trimmed.columns
    if name not in numeric_cols
]

In [16]:
categorical_cols

['INFANT_ALIVE_AT_REPORT',
 'BIRTH_PLACE',
 'DIABETES_PRE',
 'DIABETES_GEST',
 'HYP_TENS_PRE',
 'HYP_TENS_GEST',
 'PREV_BIRTH_PRETERM']

In [17]:
# Frequency table per categorical column.
# cube() on a single column returns one row per level plus a grand-total row
# whose key is null - that is the `null | 45429` line in each table below, not
# a count of missing values.
for col in categorical_cols:
    level_counts = births_trimmed.cube(col).count()
    level_counts.show()

+----------------------+-----+
|INFANT_ALIVE_AT_REPORT|count|
+----------------------+-----+
|                     1|23349|
|                  null|45429|
|                     0|22080|
+----------------------+-----+

+-----------+-----+
|BIRTH_PLACE|count|
+-----------+-----+
|          4|  327|
|          7|   91|
|          6|   11|
|          3|  224|
|       null|45429|
|          9|    8|
|          1|44558|
|          5|   74|
|          2|  136|
+-----------+-----+

+------------+-----+
|DIABETES_PRE|count|
+------------+-----+
|           1|  548|
|        null|45429|
|           0|44881|
+------------+-----+

+-------------+-----+
|DIABETES_GEST|count|
+-------------+-----+
|            1| 1978|
|         null|45429|
|            0|43451|
+-------------+-----+

+------------+-----+
|HYP_TENS_PRE|count|
+------------+-----+
|           1| 1081|
|        null|45429|
|           0|44348|
+------------+-----+

+-------------+-----+
|HYP_TENS_GEST|count|
+-------------+-----+
|   

## Correlations

Correlations help to identify collinear numeric features and handle them
appropriately. 

In [18]:
from pyspark.mllib.stat import Statistics
import pandas as pd

In [19]:
# Pearson correlation matrix of the numeric features.
# Statistics.corr works on an RDD of vectors, so each Row is first turned into
# a plain tuple; the resulting matrix is then labelled on both axes with the
# numeric column names.
columns = numeric_cols
numeric_rows = births_trimmed.select(*numeric_cols).rdd
features = numeric_rows.map(lambda row: tuple(row))
corr_mat = Statistics.corr(features, method="pearson")
corr_df = pd.DataFrame(corr_mat, index=columns, columns=columns)

In [20]:
corr_df

Unnamed: 0,MOTHER_AGE_YEARS,FATHER_COMBINED_AGE,CIG_BEFORE,CIG_1_TRI,CIG_2_TRI,CIG_3_TRI,MOTHER_HEIGHT_IN,MOTHER_PRE_WEIGHT,MOTHER_DELIVERY_WEIGHT,MOTHER_WEIGHT_GAIN
MOTHER_AGE_YEARS,1.0,-0.035203,-0.064101,-0.045254,-0.033569,-0.02732,0.041911,0.02852,0.022333,0.014624
FATHER_COMBINED_AGE,-0.035203,1.0,0.087993,0.094362,0.091438,0.076141,0.086203,0.1279,0.097506,0.035482
CIG_BEFORE,-0.064101,0.087993,1.0,0.825531,0.722135,0.623034,-0.010871,-0.026424,-0.004672,-0.011881
CIG_1_TRI,-0.045254,0.094362,0.825531,1.0,0.865457,0.75992,-0.006381,-0.012328,-0.001128,-0.014818
CIG_2_TRI,-0.033569,0.091438,0.722135,0.865457,1.0,0.893076,-0.002765,-0.006062,0.00146,-0.014359
CIG_3_TRI,-0.02732,0.076141,0.623034,0.75992,0.893076,1.0,-0.000938,-0.003776,0.004836,-0.006379
MOTHER_HEIGHT_IN,0.041911,0.086203,-0.010871,-0.006381,-0.002765,-0.000938,1.0,0.45257,0.474217,0.331764
MOTHER_PRE_WEIGHT,0.02852,0.1279,-0.026424,-0.012328,-0.006062,-0.003776,0.45257,1.0,0.53597,0.649941
MOTHER_DELIVERY_WEIGHT,0.022333,0.097506,-0.004672,-0.001128,0.00146,0.004836,0.474217,0.53597,1.0,0.596929
MOTHER_WEIGHT_GAIN,0.014624,0.035482,-0.011881,-0.014818,-0.014359,-0.006379,0.331764,0.649941,0.596929,1.0


As you can see, the 'CIG_...' features are highly correlated, so we can drop most of
them. Since we want to predict the survival chances of an infant as soon as possible,
we will keep only 'CIG_1_TRI'. As expected, the weight features are also correlated
with one another (coefficients of roughly 0.5-0.65), so we will keep only 'MOTHER_PRE_WEIGHT':

In [21]:
# Final feature set after the correlation check: of the cigarette counts only
# CIG_1_TRI is kept, and of the weight features only MOTHER_PRE_WEIGHT.
# Grouped by theme; the concatenation order is the resulting column order.
features_to_keep = (
    ['INFANT_ALIVE_AT_REPORT', 'BIRTH_PLACE']
    + ['MOTHER_AGE_YEARS', 'FATHER_COMBINED_AGE']
    + ['CIG_1_TRI']
    + ['MOTHER_HEIGHT_IN', 'MOTHER_PRE_WEIGHT']
    + ['DIABETES_PRE', 'DIABETES_GEST']
    + ['HYP_TENS_PRE', 'HYP_TENS_GEST']
    + ['PREV_BIRTH_PRETERM']
)

In [22]:
# Keep only the retained features, in the order given by features_to_keep.
births_transformed = births_trimmed.select(features_to_keep)

Check if all variables are ready:

In [24]:
births_transformed.printSchema()

root
 |-- INFANT_ALIVE_AT_REPORT: integer (nullable = false)
 |-- BIRTH_PLACE: string (nullable = true)
 |-- MOTHER_AGE_YEARS: integer (nullable = true)
 |-- FATHER_COMBINED_AGE: integer (nullable = true)
 |-- CIG_1_TRI: integer (nullable = true)
 |-- MOTHER_HEIGHT_IN: integer (nullable = true)
 |-- MOTHER_PRE_WEIGHT: integer (nullable = true)
 |-- DIABETES_PRE: integer (nullable = false)
 |-- DIABETES_GEST: integer (nullable = false)
 |-- HYP_TENS_PRE: integer (nullable = false)
 |-- HYP_TENS_GEST: integer (nullable = false)
 |-- PREV_BIRTH_PRETERM: integer (nullable = false)



## Creating the final dataset

BIRTH_PLACE is still a string. Since it is not an ordinal variable, we have to create dummy variables referring to its values.

In [26]:
# BIRTH_PLACE was read as a string; convert it to an integer code in place
# (same column name, same position).
births_transformed = births_transformed.withColumn(
    "BIRTH_PLACE",
    fn.col("BIRTH_PLACE").cast(typ.IntegerType()),
)

In [32]:
# Distinct BIRTH_PLACE codes, in ascending order.
# distinct() gives no ordering guarantee, and the order of this list decides
# both the order of the dummy columns and which one is exprs[0]. Sorting makes
# the generated column set reproducible from run to run. Null is skipped: it
# cannot be sorted against integers, and an equality test against null never
# matches, so its dummy column would be all zeros anyway.
categories_birth = sorted(
    row[0]
    for row in births_transformed.select("BIRTH_PLACE").distinct().collect()
    if row[0] is not None
)

# One 0/1 indicator expression per code, named BIRTH_PLACE_<code>.
exprs = [
    fn.when(fn.col("BIRTH_PLACE") == category, 1)
    .otherwise(0)
    .alias('BIRTH_PLACE_' + str(category))
    for category in categories_birth
]

We exclude one of the dummy variables to avoid the <i>Dummy variable trap</i>.

In [45]:
# Append the dummy columns to the existing ones. exprs[1:] skips the first
# expression in `exprs`, so that category has no column of its own and acts as
# the implicit baseline (all remaining dummies are 0 for it).
# NOTE: re-running this cell on its own appends the same columns again.
births_transformed = births_transformed.select('*', *exprs[1:])

Finally, we exclude the BIRTH_PLACE variable:

In [50]:
# The original BIRTH_PLACE code is no longer needed once the dummy columns
# exist; every other column is kept in its current order.
births_final = births_transformed.drop('BIRTH_PLACE')

Create a CSV file with all data to use later:

In [58]:
# Persist the final dataset (with a header row) for later use.
# mode='overwrite' replaces the output of a previous run; with the default save
# mode the write raises as soon as the path already exists, which makes the
# notebook fail on every run after the first.
births_final.write.csv('./data/births_final.csv', header=True, mode='overwrite')