# Test Submission

The holdout is held by Kaggle, which wants answers uploaded in a particular format.

Using the best hyperparameters from the GBTree notebook:
1. Create three test sets with different random seeds
2. Train three models 
3. Run the kaggle test set through the three models
4. Choose the median answer as the final answer
5. Upload to kaggle to see the result

In [1]:
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import GBTClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator

import pandas as pd
import numpy as np

In [2]:
%load_ext watermark
%watermark -iv

numpy   1.16.2
pandas  0.24.2
pyspark 2.4.3



In [3]:
# Comment these out to run on a cluster. Also, adjust memory to size of your laptop
pyspark.sql.SparkSession.builder.config('spark.driver.memory', '8g')
pyspark.sql.SparkSession.builder.config('spark.sql.shuffle.paritions', 5)

<pyspark.sql.session.SparkSession.Builder at 0x10e87fc18>

In [4]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Load data and do some class rebalancing

In [5]:
# Load 10 percent of class 0, and all of class 1 
class0 = spark.read.parquet('../data/model/train0_10pctf.parquet')
class1 = spark.read.parquet('../data/model/train1f.parquet')
test = spark.read.parquet('../data/model/testf.parquet')

class0.count(), class1.count(), test.count()

(18444704, 456846, 18790469)

In [6]:
# downsample majority, bootstrap minority
resam0_a = class0.sample(fraction=.033, withReplacement=False, seed=111)
resam1_a = class1.sample(fraction=1., withReplacement=True, seed=111)
df_a = resam0_a.unionAll(resam1_a)

resam0_b = class0.sample(fraction=.033, withReplacement=False, seed=222)
resam1_b = class1.sample(fraction=1., withReplacement=True, seed=222)
df_b = resam0_b.unionAll(resam1_b)

resam0_c = class0.sample(fraction=.033, withReplacement=False, seed=333)
resam1_c = class1.sample(fraction=1., withReplacement=True, seed=333)
df_c = resam0_c.unionAll(resam1_c)

# Assemble for Spark 

In [7]:
columns = [ dt[0] for dt in df_a.dtypes ]

columns.remove('index')
columns.remove('ip')
columns.remove('channel')
columns.remove('app')
columns.remove('device')
columns.remove('os')
columns.remove('click_time')
columns.remove('attributed_time')
columns.remove('doy')
columns.remove('is_attributed')

vec_assembler = VectorAssembler(inputCols=columns, outputCol='features')

In [8]:
vf_a = vec_assembler.transform(df_a)
vf_a = vf_a[['is_attributed', 'features']]

vf_b = vec_assembler.transform(df_b)
vf_b = vf_b[['is_attributed', 'features']]

vf_c = vec_assembler.transform(df_c)
vf_c = vf_c[['is_attributed', 'features']]

In [9]:
vf_test = vec_assembler.transform(test)
vf_test = vf_test[['click_id', 'features']]

In [11]:
# Assemble for Spark - top 10 features version
top_cols = ['os_app',
 'os_channel',
 'channel_app',
 'device_app',
 'tgt_os_app',
 'tgt_channel_app',
 'tgt_os_channel',
 'tgt_channel_pct',
 'ip_pct',
 'app_pct']
top_assembler = VectorAssembler(inputCols=top_cols, outputCol = 'features')

topv_a = top_assembler.transform(df_a)
topv_a = topv_a[['is_attributed','features']]

topv_b = top_assembler.transform(df_b)
topv_b = topv_b[['is_attributed','features']]

topv_c = top_assembler.transform(df_c)
topv_c = topv_c[['is_attributed','features']]

topv_test = top_assembler.transform(test)
topv_test = topv_test[['click_id', 'features']]

# GBT Classifier -- all features

In [12]:
gbtc = GBTClassifier(
    labelCol = 'is_attributed',
    maxDepth = 8,
    minInstancesPerNode = 16,
    maxIter = 15,
    stepSize = 0.7,
    subsamplingRate = .9,
    featureSubsetStrategy = '13'
)

In [13]:
model_a = gbtc.fit(vf_a)
model_b = gbtc.fit(vf_b)
model_c = gbtc.fit(vf_c)

In [None]:
## save models for possible stacking later

In [15]:
model_a.save('../data/model/model_a.model')
model_b.save('../data/model/model_b.model')
model_c.save('../data/model/model_c.model')

In [None]:
# Run test data through the models

In [16]:
results_a = model_a.transform(vf_test)
results_b = model_b.transform(vf_test)
results_c = model_c.transform(vf_test)

In [None]:
# Create csv to submit to kaggle

In [30]:
# get the median probability of class 1
# from three probability vectors 
def get_median(a, b, c):
    a = a[1]
    b = b[1]
    c = c[1]
    if a > b and a < c:
        return float(a)
    if b > a and b < c:
        return float(b)
    return float(c)

udf_get_median = F.udf(get_median, T.FloatType())

def merge_results(a, b, c):

    a.createOrReplaceTempView('a_result')
    b.createOrReplaceTempView('b_result')
    c.createOrReplaceTempView('c_result')

    join = spark.sql("""
    
    SELECT  a_result.click_id    AS click_id, 
            a_result.probability AS a_prob, 
            b_result.probability AS b_prob, 
            c_result.probability AS c_prob
    FROM
             a_result 
        JOIN b_result 
          ON a_result.click_id = b_result.click_id
        JOIN c_result
          ON a_result.click_id = c_result.click_id
    """)
    
    join.show(5)

    result = join.select('click_id',
         udf_get_median('a_prob','b_prob','c_prob').alias('is_attributed'))

    result.show(5)
    return result

In [22]:
result = merge_results(results_a, results_b, results_c)

+--------+--------------------+--------------------+--------------------+
|click_id|              a_prob|              b_prob|              c_prob|
+--------+--------------------+--------------------+--------------------+
|     148|[0.94539376085098...|[0.94779208087944...|[0.94379030731756...|
|     463|[0.94176715437250...|[0.94205247058463...|[0.94349557916544...|
|     471|[0.97981014861923...|[0.98209989159970...|[0.98021108158768...|
|     496|[0.96949655694921...|[0.97259814020590...|[0.97057270775433...|
|     833|[0.98114177498547...|[0.97768255943114...|[0.97394636623424...|
+--------+--------------------+--------------------+--------------------+
only showing top 5 rows

+--------+-------------+
|click_id|is_attributed|
+--------+-------------+
|     148|   0.05460624|
|     463|   0.05650442|
|     471|  0.019788919|
|     496|  0.029427292|
|     833|  0.022317441|
+--------+-------------+
only showing top 5 rows



In [23]:
result.write.csv('../data/kaggle/submit_full.csv')

# GBT Classifier - Top 10 features

In [24]:
gbtc = GBTClassifier(
    labelCol = 'is_attributed',
    maxDepth = 8,
    minInstancesPerNode = 16,
    maxIter = 15,
    stepSize = 0.7,
    subsamplingRate = .9,
    featureSubsetStrategy = '9'
)

In [25]:
model_a = gbtc.fit(topv_a)
model_b = gbtc.fit(topv_b)
model_c = gbtc.fit(topv_c)

In [26]:
## save models for possible stacking later

In [27]:
model_a.save('../data/model/topf_model_a.model')
model_b.save('../data/model/topf_model_b.model')
model_c.save('../data/model/topf_model_c.model')

In [None]:
# Run test data through the models

In [28]:
results_a = model_a.transform(topv_test)
results_b = model_b.transform(topv_test)
results_c = model_c.transform(topv_test)

In [32]:
# Create csv to submit to kaggle

+--------+-------------+
|click_id|is_attributed|
+--------+-------------+
|     148|   0.06027116|
|     463|  0.057876226|
|     471|  0.020789148|
|     496|   0.02852715|
|     833|  0.022072932|
+--------+-------------+
only showing top 5 rows



In [None]:
result = merge_results(results_a, results_b, results_c)
result.show(5)

In [33]:
result.write.csv('../data/kaggle/submit_topfeat.csv')

In [144]:
# barrier so I don't hit return too many times and kill my spark session :-)
assert(0)

AssertionError: 

In [34]:
spark.stop()