# MVP

## Minimum Viable Project

Taking my features and running the toy sample data through the pyspark classifiers to see which looks most promising.

From this notebook, selected the GBTree model as the one to train/tune with more data.

In [63]:
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [64]:
from itertools import combinations

In [65]:
%load_ext watermark
%watermark -iv

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
pyspark 2.4.3



In [66]:
# Comment these out to run on a cluster. Also, adjust memory to size of your laptop
pyspark.sql.SparkSession.builder.config('spark.driver.memory', '8g')
pyspark.sql.SparkSession.builder.config('spark.sql.shuffle.paritions', 5)

<pyspark.sql.session.SparkSession.Builder at 0x10def3d68>

In [67]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Load the (feature-engineered) 'train_sample' 

In [68]:
df = spark.read.parquet('../data/model/train_samplef.parquet')

In [69]:
columns = [ dt[0] for dt in df.dtypes ]

columns.remove('ip')
columns.remove('channel')
columns.remove('app')
columns.remove('device')
columns.remove('os')
columns.remove('click_time')
columns.remove('attributed_time')
columns.remove('doy')

df = df[[columns]]

columns.remove('is_attributed')

In [70]:
df.count()

100000

In [71]:
df.groupby('is_attributed').count().show()

+-------------+-----+
|is_attributed|count|
+-------------+-----+
|            1|  227|
|            0|99773|
+-------------+-----+



In [72]:
vec_assembler = VectorAssembler(inputCols=columns, outputCol='features')
vf = vec_assembler.transform(df)
vf = vf[['is_attributed', 'features']]
vf.show(2)

+-------------+--------------------+
|is_attributed|            features|
+-------------+--------------------+
|            0|[1.0,0.9004319731...|
|            0|[1.0,0.2181837572...|
+-------------+--------------------+
only showing top 2 rows



In [73]:
evaluator = BinaryClassificationEvaluator(labelCol = 'is_attributed')

# Random Forest

In [55]:
rfc = RandomForestClassifier(
    labelCol = 'is_attributed',
)

# Preparting for future hyperparameter tuning
pg = ParamGridBuilder(
       ).addGrid(
                rfc.numTrees, [20]
       ).addGrid(
                rfc.maxDepth, [5, 7]
       ).addGrid(
                rfc.minInstancesPerNode, [32]
       ).addGrid(
                rfc.maxBins, [128]
       ).addGrid(
                rfc.subsamplingRate, [.8 ]
       ).addGrid(
                rfc.featureSubsetStrategy, [ '.75' ]
       ).build(
       )

tvs = TrainValidationSplit(
        estimator = rfc,
        estimatorParamMaps = pg,
        evaluator = evaluator,
        trainRatio = .8
    )

tvs_model = tvs.fit(vf)
results = tvs_model.transform(vf)
evaluator.evaluate(results)

0.9376391457065689

In [56]:
params = tvs_model.bestModel.extractParamMap()
for k in params.keys():
    print(k.name, params[k])

cacheNodeIds False
checkpointInterval 10
featureSubsetStrategy .75
featuresCol features
impurity gini
labelCol is_attributed
maxBins 128
maxDepth 5
maxMemoryInMB 256
minInfoGain 0.0
minInstancesPerNode 32
numTrees 20
predictionCol prediction
probabilityCol probability
rawPredictionCol rawPrediction
seed -2595380024694023377
subsamplingRate 0.8


# GBT Classifier

In [57]:
gbtc = GBTClassifier(
    labelCol = 'is_attributed',
)

# Preparting for future hyperparameter tuning
pg = ParamGridBuilder(
       ).addGrid(
                gbtc.maxDepth, [8]
       ).addGrid(
                gbtc.minInstancesPerNode, [32]
       ).addGrid(
                gbtc.maxIter, [10]
       ).addGrid(
                gbtc.subsamplingRate, [.8 ]
       ).addGrid(
                gbtc.featureSubsetStrategy, ['.5', '.7', '.9']
       ).build(
       )

tvs = TrainValidationSplit(
        estimator = gbtc,
        estimatorParamMaps = pg,
        evaluator = evaluator,
        trainRatio = .8
    )

tvs_model = tvs.fit(vf)
results = tvs_model.transform(vf)
evaluator.evaluate(results)

0.994050326841048

In [58]:
params = tvs_model.bestModel.extractParamMap()
for k in params.keys():
    print(k.name, params[k])

cacheNodeIds False
checkpointInterval 10
featureSubsetStrategy .5
featuresCol features
labelCol is_attributed
lossType logistic
maxBins 32
maxDepth 8
maxIter 10
maxMemoryInMB 256
minInfoGain 0.0
minInstancesPerNode 32
predictionCol prediction
seed -5542985121037825445
stepSize 0.1
subsamplingRate 0.8


In [59]:
# MLP Classifier

In [74]:
mlpc = MultilayerPerceptronClassifier(
    labelCol = 'is_attributed',
)

pg = ParamGridBuilder(
       ).addGrid(
            mlpc.solver, ['l-bfgs']
       ).addGrid(
            mlpc.layers, 
                [ (21,10,2), (21,15,2), (21,20,2) ]
       ).build(
       )

tvs = TrainValidationSplit(
        estimator = mlpc,
        estimatorParamMaps = pg,
        evaluator = evaluator,
        trainRatio = .8
    )

tvs_model = tvs.fit(vf)
results = tvs_model.transform(vf)
evaluator.evaluate(results)

0.9689324060765065

In [76]:
tvs_model.bestModel.layers

[21, 15, 2]

In [28]:
# barrier so I don't hit return too many times and kill my spark session :-)
assert(0)

AssertionError: 

In [77]:
spark.stop()