# Introduction
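This notebook trains and cross-validates three classifiers (logistic regression, random forest and gradient-boosted trees) on the summary features computed by the `03_feature_engineering` notebook, then saves the best-scoring model.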

# Libraries

In [3]:
import pyspark.sql.functions as f

from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.feature import VectorAssembler, StandardScaler, Normalizer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

import numpy as np

# Data

Load the data and labels containing the summary features for training:

In [6]:
# Labels are keyed by bookingID; they are attached to the summary features below.
label_df = spark.read.parquet('dbfs:/msh/grab/data/label.parquet')

# Left join keeps every row of the feature table, whether or not a label matches.
train_df = (
    spark.read.parquet('dbfs:/msh/grab/data/features_valid_summary.parquet')
    .join(label_df, on=['bookingID'], how='left')
)

display(train_df)

bookingID,trip_length,mean_Speed,sd_Speed,min_Speed,max_Speed,mean_mag_acc,sd_mag_acc,min_mag_acc,max_mag_acc,mean_mag_gyro,sd_mag_gyro,min_mag_gyro,max_mag_gyro,Speed_quantiles_0,mag_acc_quantiles_0,mag_gyro_quantiles_0,Speed_quantiles_1,mag_acc_quantiles_1,mag_gyro_quantiles_1,Speed_quantiles_2,mag_acc_quantiles_2,mag_gyro_quantiles_2,Speed_quantiles_3,mag_acc_quantiles_3,mag_gyro_quantiles_3,Speed_quantiles_4,mag_acc_quantiles_4,mag_gyro_quantiles_4,label
1030792151136,1603.0,14.862942765527585,6.797953591405691,0.32110122,25.214682,9.943056988416776,0.9843915474530084,4.308710817491772,16.859894585414715,0.0805733048325839,0.0661263211058812,0.00033848991871423775,0.5972424523416325,4.36959,8.468746595984339,0.0121900158404003,8.907368,9.401708510133178,0.0360547294901825,15.346053,9.949168134488575,0.0652902712283898,21.219063,10.456675929801335,0.103559177781611,24.515537,11.559472330712712,0.1968506852315648,0
1125281431735,2540.0,8.752515809461885,5.337300086897193,0.27703157,23.360136,9.8770982695034,0.485769867902285,7.8596431414000945,15.022379838624422,0.0515153679203586,0.0868924941915411,0.000765180618865624,0.7376963727607299,1.0653919,9.154616093045592,0.0040941213497167,4.4294543,9.646897500689462,0.0137229896018775,8.478179,9.852513436892073,0.0270971665402912,12.170646,10.087294689812198,0.0489676799833785,18.379423,10.679506025472351,0.2040743770811765,1
1133871366236,1304.0,19.154742669274608,5.859948872165777,0.55805635,26.053034,9.904921671463883,0.7990869543290869,6.3461444874767095,14.37671663032197,0.1393655267595005,0.1101920412260098,0.0019198251277578,0.8984207625021929,6.150825,8.756190952259916,0.0264494568255214,17.531254,9.46101156073323,0.0673046741974821,21.343855,9.87434780919104,0.1121720645817284,23.173452,10.28602173528608,0.1756743768288079,24.46182,11.235556791133574,0.3356458427486318,0
1159641170054,1439.0,14.764780104791978,9.376679853244744,0.0500173270702362,28.25498580932617,9.906467882086517,0.4500003800599738,7.776839038612129,12.127667124903471,0.0771233910406961,0.073360932155819,0.0010831082046748,0.7734178114099859,0.22828109562397,9.18016321038499,0.0107551762271457,5.591987609863281,9.709562729605953,0.0271849781947053,15.424273490905762,9.866645902071086,0.0591947049487019,23.98752212524414,10.137274036744143,0.1028320338987925,26.23044776916504,10.671712860751828,0.1997907406012569,1
1211180777517,1679.0,4.771441405954892,4.376222979554796,0.0279656257480382,16.977375030517578,9.790380067507114,0.5854900593925125,5.026376665818665,13.466223628091,0.0775962694774012,0.0788858099635934,0.0015160551339517,0.5641418281465412,0.1645251363515854,8.926855339260472,0.0236760980936618,2.236533164978028,9.550132858188418,0.0379165655102141,2.747764110565185,9.76792485863514,0.0476509992418661,7.738240718841553,10.034411755478438,0.0859381405159161,13.856679916381836,10.713573795493827,0.2413723172264284,1
1228360646715,1289.0,10.388828020078712,6.911426778578952,0.0368379727005958,23.786556243896484,9.945654867895508,0.6221742634734229,7.174647985045568,15.52425186261044,0.1617608910640562,0.1536726715097639,0.0005522433606530815,1.2105769386589718,0.136703535914421,9.008243268424026,0.0092020342338319,3.405956983566284,9.659436935669486,0.0516054317183665,10.68129825592041,9.86505693887842,0.1227407419577085,16.491731643676758,10.266540889048628,0.2231354588712382,20.82914924621582,10.974619936969372,0.4578494290428183,0
1279900254316,1528.0,3.6195460717246783,2.221900117027084,0.1700000017881393,9.390000343322754,11.198731215939466,5.540173218293301,0.4540041357656462,83.28288662996448,0.6879455235162257,0.8323649887123413,0.0020829695485746,9.71538273532383,0.2300000041723251,5.094503288015166,0.0289281310487117,2.2300000190734863,9.056834915521744,0.1636344768769614,3.069999933242798,9.863139468030436,0.4467578790714034,5.389999866485596,12.241650794194294,0.887924832467014,7.769999980926514,19.624143195523416,2.208131420771179,1
1305670058007,1519.0,11.57783588171892,5.432974915498057,0.36926958,21.748781,9.79167406530143,0.8182005754490398,7.268042420681573,16.12818475371637,0.142386543368149,0.1084974326087748,0.0051872411182818,0.8653070966137607,2.3342981,8.453483118494065,0.0248311267733888,7.6026297,9.319936100110125,0.0647750201419266,11.482221,9.773153505054855,0.1160086248598467,15.810855,10.256579632873716,0.1893363178661896,20.106169,11.114308015566072,0.3432845088514549,1
1331439861928,1349.0,8.184182273638607,7.90360012743168,0.0071905837394297,28.32790184020996,9.856010869210294,0.693038812054238,4.513492409583255,14.925993108762372,0.1456675801734549,0.153986325942747,0.0008482580287448734,0.9252165862580466,0.0564892254769802,8.868571552434101,0.014950962647875,1.7864867448806765,9.642415429527242,0.0357249385022357,5.153595924377441,9.799125431523024,0.0849848636871649,12.9419584274292,10.050988112525552,0.2073712288070997,24.82777214050293,10.839842108701031,0.4701941177459874,1
1357209665603,1400.0,12.148970582478626,7.615044201968413,0.1599999964237213,27.200000762939453,9.747399555248991,0.7347030308425884,6.558654555021978,14.493183088839864,0.1275596269051301,0.1059206909571803,0.0010738014128071,0.7575469505649375,0.2000000029802322,8.608147311473813,0.0322920489794913,5.03000020980835,9.44953365954446,0.0513754023706518,13.329999923706056,9.688710178355109,0.0938897077286974,18.06999969482422,10.0751359886979,0.165055887931492,23.71999931335449,10.972951010415516,0.343420426242649,0


# Models

Shared pipeline stages (feature assembly and standardisation) and the three candidate classifiers:

In [9]:
# Feature columns are every column except the join key and the target.
# Selecting by name rather than by position (`columns[1:-1]`) keeps this correct
# even if the column order of `train_df` changes.
non_feature_columns = {'bookingID', 'label'}
input_columns = [c for c in train_df.columns if c not in non_feature_columns]

# Pack the feature columns into a single vector column, then standardise it
# (centre with withMean=True, scale with withStd=True).
vec_assembler = VectorAssembler(inputCols=input_columns, outputCol='features')
scaler = StandardScaler(inputCol='features', outputCol='features_std', withStd=True, withMean=True)

# Candidate classifiers; each one reads the standardised feature vector.
lr = LogisticRegression(featuresCol='features_std')
rf = RandomForestClassifier(featuresCol='features_std')
gbt = GBTClassifier(featuresCol='features_std')

Logistic Regression:

In [11]:
%%time
# Assemble -> standardise -> logistic regression.
lr_pipeline = Pipeline(stages=[vec_assembler, scaler, lr])

# Grid over regularisation strength, iteration budget and elastic-net mixing.
lr_param_grid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.01])
    .addGrid(lr.maxIter, [10, 50, 100, 150])
    .addGrid(lr.elasticNetParam, [0, 0.1, 0.01])
    .build()
)

# 10-fold cross-validation scored by area under the ROC curve.
lr_evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
lr_crossval = CrossValidator(
    estimator=lr_pipeline,
    estimatorParamMaps=lr_param_grid,
    evaluator=lr_evaluator,
    numFolds=10,
)

cv_lr = lr_crossval.fit(train_df)

# avgMetrics has one fold-averaged AUC per grid point; keep the highest.
auc_lr = max(cv_lr.avgMetrics)

print(f'Best AUC from logistic regression: {auc_lr:.4f}')

Random Forest:

In [13]:
%%time
# Assemble -> standardise -> random forest.
rf_pipeline = Pipeline(stages=[vec_assembler, scaler, rf])

# Grid over tree depth and forest size.
rf_param_grid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [1, 5, 10, 30])
    .addGrid(rf.numTrees, [10, 20, 50, 100])
    .build()
)

# 10-fold cross-validation scored by area under the ROC curve.
rf_evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
rf_crossval = CrossValidator(
    estimator=rf_pipeline,
    estimatorParamMaps=rf_param_grid,
    evaluator=rf_evaluator,
    numFolds=10,
)

cv_rf = rf_crossval.fit(train_df)

# avgMetrics has one fold-averaged AUC per grid point; keep the highest.
auc_rf = max(cv_rf.avgMetrics)

print(f'Best AUC from random forest: {auc_rf:.4f}')

Gradient-boosted trees (GBT):

In [15]:
%%time
# Assemble -> standardise -> gradient-boosted trees.
# The final stage must be `gbt`: the grid below is keyed on `gbt.maxDepth`, and a
# param only takes effect on the estimator instance that owns it. With `rf` as the
# last stage the grid would be ignored and a default random forest would be
# cross-validated and reported as GBT.
pipeline = Pipeline(stages=[vec_assembler, scaler, gbt])

# Grid over tree depth only.
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [5, 10, 20])
    .build()
)

# 10-fold cross-validation scored by area under the ROC curve.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(metricName='areaUnderROC'),
                          numFolds=10)

cv_gbt = crossval.fit(train_df)

# avgMetrics has one fold-averaged AUC per grid point; keep the highest.
auc_gbt = max(cv_gbt.avgMetrics)

print(f'Best AUC from GBT: {auc_gbt:.4f}')

Pick the best model:

In [17]:
# Pair each best cross-validated AUC with the CrossValidatorModel that produced it.
candidates = [
    (auc_gbt, cv_gbt),
    (auc_lr, cv_lr),
    (auc_rf, cv_rf),
]
aucs = [auc for auc, _ in candidates]
cvs = [cv for _, cv in candidates]

# np.argmax returns the first index of the maximum, so ties resolve in list order.
best_idx = np.argmax(aucs)
best_cv = cvs[best_idx]

Save the best model:

In [19]:
%fs
mkdirs /msh/grab/model/

In [20]:
# Persist the winning fitted pipeline, replacing any model already at this path.
best_pipeline_model = best_cv.bestModel
best_pipeline_model.write().overwrite().save('dbfs:/msh/grab/model/best_model')

In [21]:
%fs
ls /msh/grab/model/

path,name,size
dbfs:/msh/grab/model/best_model/,best_model/,0
