# GBTree Model

Tuning and testing a GBTree model (most promising after MVP step)

In [26]:
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from itertools import combinations

%load_ext watermark

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


In [27]:
%watermark -iv

pyspark 2.4.3



In [28]:
pyspark.sql.SparkSession.builder.config('spark.driver.memory', '8g')
pyspark.sql.SparkSession.builder.config('spark.sql.shuffle.paritions', 5)

<pyspark.sql.session.SparkSession.Builder at 0x10b4dbf28>

In [29]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Load data and do some class rebalancing

In [30]:
# Load 10 percent of class 0, and all of class 1 
class0 = spark.read.parquet('../data/model/train0_10pctf.parquet')
class1 = spark.read.parquet('../data/model/train1f.parquet')

class0.count(), class1.count()

(18444704, 456846)

In [31]:
# downsample majority, bootstrap minority
resam0 = class0.sample(fraction=.033, withReplacement=False)
resam1 = class1.sample(fraction=1., withReplacement=True)
resam0.count(), resam1.count()

(607891, 457005)

In [32]:
df = resam1.unionAll(resam0)

# Assemble for Spark

In [33]:
columns = [ dt[0] for dt in df.dtypes ]

# these aren't predictors or targets
columns.remove('index')
columns.remove('ip')
columns.remove('channel')
columns.remove('app')
columns.remove('device')
columns.remove('os')
columns.remove('click_time')
columns.remove('attributed_time')
columns.remove('doy')

df = df[[columns]]

columns.remove('is_attributed')

vec_assembler = VectorAssembler(inputCols=columns, outputCol='features')
vf = vec_assembler.transform(df)
vf = vf[['is_attributed', 'features']]

In [34]:
evaluator = BinaryClassificationEvaluator(labelCol = 'is_attributed')

# GBT Classifier

In [35]:
gbtc = GBTClassifier(
    labelCol = 'is_attributed',
)

pg = ParamGridBuilder(
       ).addGrid(
                gbtc.maxDepth, [8]
       ).addGrid( 
                gbtc.minInstancesPerNode, [16] 
       ).addGrid(
                gbtc.maxIter, [15]
       ).addGrid(
                gbtc.stepSize, [ .7 ]
       ).addGrid(
                gbtc.subsamplingRate, [ .9 ]
       ).addGrid(
                gbtc.featureSubsetStrategy, [ '13' ]
       ).build(
       )

tvs = TrainValidationSplit(
        estimator = gbtc,
        estimatorParamMaps = pg,
        evaluator = evaluator,
        trainRatio = .8
    )

In [36]:
tvs_model = tvs.fit(vf)

results = tvs_model.transform(vf)
evaluator.evaluate(results)

0.9748328509595653

In [37]:
# since we downsampled, running on the full training set is testing with new data 
full = spark.read.parquet('../data/model/trainf.parquet')

fullvf = vec_assembler.transform(full)
fullvf = fullvf[['is_attributed', 'features']]

In [38]:
gbt_model = tvs_model.bestModel
results = gbt_model.transform(fullvf)
evaluator.evaluate(results)

0.9740937381488518

In [39]:
# looks like it generalized OK so far

In [40]:
# confusion matrix
cm = results.groupby(['is_attributed','prediction']).count().collect()
cm

[Row(is_attributed=1, prediction=0.0, count=53071),
 Row(is_attributed=0, prediction=0.0, count=179173626),
 Row(is_attributed=1, prediction=1.0, count=403775),
 Row(is_attributed=0, prediction=1.0, count=5273418)]

# Second model with just top 10 most important features

In [41]:
filist = sorted(list(zip(tvs_model.bestModel.featureImportances,columns)))
filist

[(0.0013476903384891969, 'tgt_device_pct'),
 (0.0014580234138776291, 'device_pct'),
 (0.010024484902413755, 'tgt_device_channel'),
 (0.01608814016932691, 'device_channel'),
 (0.016826250113733888, 'tgt_app_pct'),
 (0.01751087810005689, 'tgt_os_pct'),
 (0.017569781087630767, 'os_pct'),
 (0.017691586003965257, 'device_os'),
 (0.020372078239269755, 'tgt_device_os'),
 (0.020448219420355865, 'tgt_device_app'),
 (0.022272101207732594, 'channel_pct'),
 (0.023659318440610207, 'os_app'),
 (0.03072809073544542, 'os_channel'),
 (0.042281924453536096, 'channel_app'),
 (0.04264979607987805, 'device_app'),
 (0.05386578186496301, 'tgt_os_app'),
 (0.05681795921365693, 'tgt_channel_app'),
 (0.05897792377674138, 'tgt_os_channel'),
 (0.10872088517503842, 'tgt_channel_pct'),
 (0.146796432474551, 'ip_pct'),
 (0.27389265478872715, 'app_pct')]

In [42]:
top_cols =  [ t[1] for t in filist[-10:]]
top_cols

['os_app',
 'os_channel',
 'channel_app',
 'device_app',
 'tgt_os_app',
 'tgt_channel_app',
 'tgt_os_channel',
 'tgt_channel_pct',
 'ip_pct',
 'app_pct']

# Assemble most important features for Spark

In [43]:
top_assembler = VectorAssembler(inputCols=top_cols, outputCol='features')
tvf = top_assembler.transform(df)
tvf = tvf[['is_attributed', 'features']]

In [44]:
# Train model

In [45]:
gbtc = GBTClassifier(
    labelCol = 'is_attributed',
)

pg = ParamGridBuilder(
       ).addGrid(
                gbtc.maxDepth, [8]
       ).addGrid( 
                gbtc.minInstancesPerNode, [16] 
       ).addGrid(
                gbtc.maxIter, [15]
       ).addGrid(
                gbtc.stepSize, [ .7  ]
       ).addGrid(
                gbtc.subsamplingRate, [ .9 ]
       ).addGrid(
                gbtc.featureSubsetStrategy, [ '9' ]
       ).build(
       )

tvs = TrainValidationSplit(
        estimator = gbtc,
        estimatorParamMaps = pg,
        evaluator = evaluator,
        trainRatio = .8
    )

tvs_model = tvs.fit(tvf)
results = tvs_model.transform(tvf)
evaluator.evaluate(results)

0.9746221678309072

# Second model on the full dataset

In [46]:
tvf_full = top_assembler.transform(full)
gbt_model = tvs_model.bestModel
results = gbt_model.transform(tvf_full)
evaluator.evaluate(results)

0.973878306276408

In [47]:
# confusion matrix
cm = results.groupby(['is_attributed','prediction']).count().collect()
cm

[Row(is_attributed=1, prediction=0.0, count=53385),
 Row(is_attributed=0, prediction=0.0, count=179295637),
 Row(is_attributed=1, prediction=1.0, count=403461),
 Row(is_attributed=0, prediction=1.0, count=5151407)]

In [48]:
# results are roughly comparable 

In [49]:
# barrier so I don't hit return too many times and kill my spark session :-)
assert(0)

AssertionError: 

In [50]:
spark.stop()