# Setup

In [1]:
!pip install pyspark



In [112]:
from pyspark.ml.classification import LinearSVC, GBTClassifier, LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.ml.linalg import VectorUDT, Vectors
from pyspark.sql import types as t
import numpy as np

In [3]:
spark = (
    SparkSession.builder.config("spark.executor.memory", "4g")
    .config("spark.executor.cores", "2")
    .config("spark.cores.max", "2")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Read the dataset(s)

Make sure to upload the train datasets generated by the `make-dataset.sh` script before proceeding.
Either train using X4 or X3,X2

In [None]:
!unzip X4.zip
df = spark.read.parquet('./X4.parquet')

In [8]:
!unzip X2.zip
!y
!unzip X3.zip
!y
df1 = spark.read.parquet('./X2.parquet')
df2 = spark.read.parquet('./X3.parquet')
df = df1.unionAll(df2)
df.toPandas()

Archive:  X2.zip
replace X2.parquet/.part-00001-865d672c-b16b-4632-b255-74ba3c1c9779-c000.snappy.parquet.crc? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C
/bin/bash: y: command not found
Archive:  X3.zip
replace X3.parquet/part-00000-d48b5338-63f5-48aa-b777-5ec06012bc6b-c000.snappy.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C
/bin/bash: y: command not found


Unnamed: 0,features,label
0,"[0.32666666666666666, 1.0, 1.0, 1.0, 0.2407407...",1
1,"[0.5026178010471204, 1.0, 1.0, 0.0, 0.0, 0.0, ...",1
2,"[0.17842323651452285, 1.0, 1.0, 1.0, 0.3611111...",1
3,"[0.7969348659003832, 1.0, 1.0, 1.0, 1.0, 0.0, ...",1
4,"[0.4024896265560166, 1.0, 1.0, 1.0, 0.24074074...",1
...,...,...
230533,"[0.2574850299401198, 1.0, 1.0, 0.0, 0.21428571...",0
230534,"[0.2857142857142857, 1.0, 1.0, 0.0, 0.28205128...",0
230535,"[0.2544378698224852, 1.0, 0.0, 0.0, 0.07999999...",0
230536,"[0.2544378698224852, 1.0, 1.0, 0.0, 0.21428571...",0


In [104]:
def stratified_split_train_test(df, frac, label,seed=42):
    fractions = df.select(label).distinct().withColumn("fraction", f.lit(frac)).rdd.collectAsMap()
    df_frac = df.stat.sampleBy(label, fractions, seed)
    df_remaining = df.exceptAll(df_frac)
    return df_frac, df_remaining

train_set, test_set = stratified_split_train_test(df, 0.8, 'label')

def with_weights(df, column="label"):
    w_zero = 1 / df.filter(f.col(column) == 0).count()
    w_one = 0.8 / df.filter(f.col(column) == 1).count()
    return df.withColumn("weight", f.when(f.col(column) == 0, w_zero).otherwise(w_one))

train_set = with_weights(train_set)
train_set_base, train_set_comb = stratified_split_train_test(train_set, 0.6, 'label', seed=5476)

# Train the model with ensemble stacking

In [105]:
model_svc = LinearSVC(weightCol='weight', maxIter=100, predictionCol='prediction_svc')
paramGrid = ParamGridBuilder()\
    .addGrid(model_svc.threshold, [1.65,1.7])\
    .build()
cvs = CrossValidator(estimator=model_svc,
                     estimatorParamMaps=paramGrid,
                     evaluator=BinaryClassificationEvaluator(weightCol='weight',metricName='areaUnderPR'),
                     numFolds=4)
estimator_svc = cvs.fit(train_set_base)

In [106]:
model_gbt = GBTClassifier(weightCol='weight', maxBins=4, maxDepth=5, predictionCol='prediction_gbt', lossType='logistic', maxIter=100)
paramGrid = ParamGridBuilder().build()
"""
cvs = TrainValidationSplit(estimator=model_gbt,
                           estimatorParamMaps=paramGrid,
                     evaluator=BinaryClassificationEvaluator(weightCol='weight', metricName='areaUnderPR'),
                     seed=1234,
                     trainRatio=0.8
                    )
"""
estimator_gbt = model_gbt.fit(train_set_base)

In [107]:
train_set_comb.toPandas()

Unnamed: 0,features,label,weight
0,"[0.9907407407407407, 1.0, 1.0, 1.0, 0.99521531...",1,0.000145
1,"[0.9907407407407407, 1.0, 1.0, 1.0, 0.99521531...",1,0.000145
2,"[0.9907407407407407, 1.0, 1.0, 1.0, 0.99521531...",1,0.000145
3,"[0.9907407407407407, 1.0, 1.0, 1.0, 0.99521531...",1,0.000145
4,"[0.7761194029850746, 1.0, 1.0, 1.0, 0.87234042...",1,0.000145
...,...,...,...
73987,"[0.24271844660194175, 1.0, 0.0, 0.5, 0.25, 0.7...",0,0.000006
73988,"(0.2272727272727273, 0.0, 0.0, 0.0, 0.125, 0.4...",0,0.000006
73989,"(0.1843575418994413, 0.0, 1.0, 0.0, 0.07142857...",0,0.000006
73990,"(0.348314606741573, 0.0, 0.0, 0.0, 0.117647058...",0,0.000006


In [119]:
interm = estimator_svc.transform(train_set_comb).drop('rawPrediction').drop('probability')
interm.show()

+--------------------+-----+--------------------+--------------+
|            features|label|              weight|prediction_svc|
+--------------------+-----+--------------------+--------------+
|[0.99074074074074...|    1|1.453224341507720...|           1.0|
|[0.99074074074074...|    1|1.453224341507720...|           1.0|
|[0.99074074074074...|    1|1.453224341507720...|           1.0|
|[0.99074074074074...|    1|1.453224341507720...|           1.0|
|[0.77611940298507...|    1|1.453224341507720...|           0.0|
|[0.76543209876543...|    1|1.453224341507720...|           1.0|
|[0.93162393162393...|    1|1.453224341507720...|           0.0|
|[0.93162393162393...|    1|1.453224341507720...|           0.0|
|[0.42145593869731...|    0|5.589152572686929...|           0.0|
|[0.25941422594142...|    0|5.589152572686929...|           0.0|
|[0.19130434782608...|    0|5.589152572686929...|           0.0|
|(61,[0,2,9,13,17,...|    0|5.589152572686929...|           0.0|
|(61,[0,2,9,13,17,...|   

In [120]:
interm = estimator_gbt.transform(interm).drop('rawPrediction').drop('probability')
interm.show()

+--------------------+-----+--------------------+--------------+--------------+
|            features|label|              weight|prediction_svc|prediction_gbt|
+--------------------+-----+--------------------+--------------+--------------+
|[0.99074074074074...|    1|1.453224341507720...|           1.0|           1.0|
|[0.99074074074074...|    1|1.453224341507720...|           1.0|           1.0|
|[0.99074074074074...|    1|1.453224341507720...|           1.0|           1.0|
|[0.99074074074074...|    1|1.453224341507720...|           1.0|           1.0|
|[0.77611940298507...|    1|1.453224341507720...|           0.0|           1.0|
|[0.76543209876543...|    1|1.453224341507720...|           1.0|           1.0|
|[0.93162393162393...|    1|1.453224341507720...|           0.0|           1.0|
|[0.93162393162393...|    1|1.453224341507720...|           0.0|           1.0|
|[0.42145593869731...|    0|5.589152572686929...|           0.0|           0.0|
|[0.25941422594142...|    0|5.5891525726

In [121]:
interm = interm.drop('features').select('label', 'weight','prediction_svc', 'prediction_gbt')
interm.show()

+-----+--------------------+--------------+--------------+
|label|              weight|prediction_svc|prediction_gbt|
+-----+--------------------+--------------+--------------+
|    1|1.453224341507720...|           1.0|           1.0|
|    1|1.453224341507720...|           1.0|           1.0|
|    1|1.453224341507720...|           1.0|           1.0|
|    1|1.453224341507720...|           1.0|           1.0|
|    1|1.453224341507720...|           0.0|           1.0|
|    1|1.453224341507720...|           1.0|           1.0|
|    1|1.453224341507720...|           0.0|           1.0|
|    1|1.453224341507720...|           0.0|           1.0|
|    0|5.589152572686929...|           0.0|           0.0|
|    0|5.589152572686929...|           0.0|           0.0|
|    0|5.589152572686929...|           0.0|           0.0|
|    0|5.589152572686929...|           0.0|           0.0|
|    0|5.589152572686929...|           0.0|           0.0|
|    0|5.589152572686929...|           0.0|           0.

In [123]:
interm = interm.withColumn('prediction_svc', f.col('prediction_svc').cast(t.IntegerType()))
interm = interm.withColumn('prediction_gbt', f.col('prediction_gbt').cast(t.IntegerType()))

@f.udf(returnType=VectorUDT())
def toVec(p1,p2):
    l = [p1,p2]
    return Vectors.dense(l)

interm = interm.withColumn('features', toVec(f.col('prediction_svc'), f.col('prediction_gbt')))
interm.show()

+-----+--------------------+--------------+--------------+---------+
|label|              weight|prediction_svc|prediction_gbt| features|
+-----+--------------------+--------------+--------------+---------+
|    1|1.453224341507720...|             1|             1|[1.0,1.0]|
|    1|1.453224341507720...|             1|             1|[1.0,1.0]|
|    1|1.453224341507720...|             1|             1|[1.0,1.0]|
|    1|1.453224341507720...|             1|             1|[1.0,1.0]|
|    1|1.453224341507720...|             0|             1|[0.0,1.0]|
|    1|1.453224341507720...|             1|             1|[1.0,1.0]|
|    1|1.453224341507720...|             0|             1|[0.0,1.0]|
|    1|1.453224341507720...|             0|             1|[0.0,1.0]|
|    0|5.589152572686929...|             0|             0|[0.0,0.0]|
|    0|5.589152572686929...|             0|             0|[0.0,0.0]|
|    0|5.589152572686929...|             0|             0|[0.0,0.0]|
|    0|5.589152572686929...|      

In [124]:
interm.groupby('features').count().show()

+---------+-----+
| features|count|
+---------+-----+
|[0.0,0.0]|68324|
|[1.0,0.0]|  129|
|[0.0,1.0]| 3422|
|[1.0,1.0]| 2117|
+---------+-----+



In [None]:
model_lr = LogisticRegression(predictionCol='prediction', featuresCol='features',weightCol='weight', maxIter=100)
paramGrid = ParamGridBuilder()\
    .addGrid(model_lr.regParam, [0.5,0.25])\
    .addGrid(model_lr.threshold, [0.9])\
    .build()
cvs = CrossValidator(estimator=model_lr,
                     estimatorParamMaps=paramGrid,
                     evaluator=BinaryClassificationEvaluator(metricName='areaUnderPR'),
                     numFolds=4)
estimator_lr = cvs.fit(interm)

# Prediction and evaluation

In [None]:
prediction = estimator_svc.transform(test_set).drop('rawPrediction').drop('probability')
prediction = estimator_gbt.transform(prediction).drop('rawPrediction').drop('probability')
prediction = prediction.drop('features')
prediction = prediction.withColumn('features', toVec(f.col('prediction_svc'), f.col('prediction_gbt')))
prediction = estimator_lr.transform(prediction)
prediction.toPandas()

In [None]:
accuracy = prediction.filter(f.col('label') == f.col('prediction')).count() / prediction.count()
print("Accuracy: ", accuracy)

tp = prediction.filter("label==1 AND prediction==1").count() 
p = tp / prediction.filter('prediction==1').count()
r = tp / prediction.filter('label == 1').count()
f1 = 2 * p * r / (p + r)
print("F1 score: ", f1)

In [None]:
estimator_svc.save("model-notebooks_1")
estimator_gbt.save("model-notebooks_2")
estimator_lr.save("model-notebooks_comb")

In [None]:
estimator_svc.save("model-products_1")
estimator_gbt.save("model-products_2")
estimator_lr.save("model-products_3")

In [None]:
!zip -r models.zip model-notebooks_1 model-notebooks_2 model-notebooks_3# model-products

In [None]:
!zip -r models.zip model-products_1 model-products_2 model-products_3