# Setup

In [1]:
!pip install pyspark



In [8]:
from pyspark.ml.classification import LinearSVC, GBTClassifier, LogisticRegression, LinearSVCModel
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.ml.linalg import VectorUDT, Vectors
from pyspark.sql import types as t
import numpy as np

In [None]:
spark = (
    SparkSession.builder.config("spark.executor.memory", "4g")
    .config("spark.executor.cores", "2")
    .config("spark.cores.max", "2")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Read the dataset(s)

Make sure to upload the train datasets generated by the `make-dataset.sh` script before proceeding.
Either train using X4 or X3,X2

In [26]:
!unzip X4.zip

Archive:  X4.zip
replace X4.parquet/.part-00000-34b7377e-848c-4866-bfc8-33e858f2cd38-c000.snappy.parquet.crc? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [38]:
df = spark.read.parquet('./X4.parquet')

In [2]:
df1 = spark.read.parquet('./X2.parquet')
df2 = spark.read.parquet('./X3.parquet')
df = df1.unionAll(df2)

In [None]:
df.toPandas()

In [3]:
def stratified_split_train_test(df, frac, label,seed=42):
    fractions = df.select(label).distinct().withColumn("fraction", f.lit(frac)).rdd.collectAsMap()
    df_frac = df.stat.sampleBy(label, fractions, seed)
    df_remaining = df.exceptAll(df_frac)
    return df_frac, df_remaining

train_set, test_set = stratified_split_train_test(df, 0.8, 'label')

def with_weights(df, column="label"):
    w_zero = 1 / df.filter(f.col(column) == 0).count()
    w_one = 0.5 / df.filter(f.col(column) == 1).count()
    return df.withColumn("weight", f.when(f.col(column) == 0, w_zero).otherwise(w_one))

train_set = with_weights(train_set)
train_set_base, train_set_comb = stratified_split_train_test(train_set, 0.6, 'label', seed=5476)

# Train the model with ensemble stacking

In [7]:
model_svc = LinearSVC(weightCol='weight', maxIter=10000, predictionCol='prediction_svc')
paramGrid = ParamGridBuilder()\
    .addGrid(model_svc.threshold, [1.3,1.4,1.5,1.55,1.6,1.65,1.7,1.75,1.8])\
    .build()
cvs = CrossValidator(estimator=model_svc,
                     estimatorParamMaps=paramGrid,
                     evaluator=BinaryClassificationEvaluator(weightCol='weight',metricName='areaUnderPR'),
                     numFolds=4)
estimator_svc = cvs.fit(train_set_base)

In [8]:
estimator_svc.bestModel.save("model-notebooks_1")

In [4]:
model_gbt = GBTClassifier(weightCol='weight', maxIter=100, predictionCol='prediction_gbt', lossType='logistic')
paramGrid = ParamGridBuilder()\
    .addGrid(model_gbt.maxDepth, [4,6,8])\
    .addGrid(model_gbt.maxBins, [4,6,8])\
    .build()

cvs = CrossValidator(estimator=model_gbt,
                           estimatorParamMaps=paramGrid,
                     evaluator=BinaryClassificationEvaluator(weightCol='weight', metricName='areaUnderPR'),
                     numFolds=4,
                    )

estimator_gbt = cvs.fit(train_set_base)

In [5]:
estimator_gbt.bestModel.save("model-notebooks_2")

In [9]:
estimator_svc = LinearSVCModel.load("model-notebooks_1")

In [10]:
interm = estimator_svc.transform(train_set_comb).drop('rawPrediction').drop('probability')
interm = estimator_gbt.transform(interm).drop('rawPrediction').drop('probability')
interm.groupby('prediction_svc', 'prediction_gbt').count().show()

+--------------+--------------+-----+
|prediction_svc|prediction_gbt|count|
+--------------+--------------+-----+
|           1.0|           1.0| 1146|
|           0.0|           1.0| 1777|
|           1.0|           0.0|  346|
|           0.0|           0.0|70723|
+--------------+--------------+-----+



In [11]:
interm = interm.drop('features').select('label', 'weight','prediction_svc', 'prediction_gbt')
interm.show()

+-----+--------------------+--------------+--------------+
|label|              weight|prediction_svc|prediction_gbt|
+-----+--------------------+--------------+--------------+
|    1|9.082652134423252E-5|           0.0|           1.0|
|    1|9.082652134423252E-5|           0.0|           1.0|
|    1|9.082652134423252E-5|           0.0|           1.0|
|    1|9.082652134423252E-5|           1.0|           1.0|
|    1|9.082652134423252E-5|           0.0|           1.0|
|    0|5.589152572686929...|           0.0|           0.0|
|    0|5.589152572686929...|           0.0|           0.0|
|    0|5.589152572686929...|           0.0|           0.0|
|    0|5.589152572686929...|           0.0|           0.0|
|    0|5.589152572686929...|           0.0|           0.0|
|    0|5.589152572686929...|           0.0|           0.0|
|    0|5.589152572686929...|           0.0|           0.0|
|    0|5.589152572686929...|           0.0|           0.0|
|    0|5.589152572686929...|           0.0|           0.

In [12]:
interm = interm.withColumn('prediction_svc', f.col('prediction_svc').cast(t.IntegerType()))
interm = interm.withColumn('prediction_gbt', f.col('prediction_gbt').cast(t.IntegerType()))

@f.udf(returnType=VectorUDT())
def toVec(p1,p2):
    l = [p1,p2]
    return Vectors.dense(l)

interm = interm.withColumn('features', toVec(f.col('prediction_svc'), f.col('prediction_gbt')))
interm.show()

+-----+--------------------+--------------+--------------+---------+
|label|              weight|prediction_svc|prediction_gbt| features|
+-----+--------------------+--------------+--------------+---------+
|    1|9.082652134423252E-5|             0|             1|[0.0,1.0]|
|    1|9.082652134423252E-5|             0|             1|[0.0,1.0]|
|    1|9.082652134423252E-5|             0|             1|[0.0,1.0]|
|    1|9.082652134423252E-5|             1|             1|[1.0,1.0]|
|    1|9.082652134423252E-5|             0|             1|[0.0,1.0]|
|    0|5.589152572686929...|             0|             0|[0.0,0.0]|
|    0|5.589152572686929...|             0|             0|[0.0,0.0]|
|    0|5.589152572686929...|             0|             0|[0.0,0.0]|
|    0|5.589152572686929...|             0|             0|[0.0,0.0]|
|    0|5.589152572686929...|             0|             0|[0.0,0.0]|
|    0|5.589152572686929...|             0|             0|[0.0,0.0]|
|    0|5.589152572686929...|      

In [13]:
model_lr = LogisticRegression(predictionCol='prediction', featuresCol='features',weightCol='weight', maxIter=10000)
paramGrid = ParamGridBuilder()\
    .addGrid(model_lr.regParam, [0.5,0.4,0.3,0.2,0.1])\
    .addGrid(model_lr.threshold, [0.1,0.2,0.3,0.4,0.45,0.5,0.55,0.60,0.65,0.7,0.75,0.8,0.9])\
    .build()
cvs = CrossValidator(estimator=model_lr,
                     estimatorParamMaps=paramGrid,
                     evaluator=BinaryClassificationEvaluator(metricName='areaUnderPR'),
                     numFolds=4)
estimator_lr = cvs.fit(interm)

In [17]:
estimator_lr.bestModel.save("model-notebooks_comb")

# Prediction and evaluation

In [15]:
prediction = estimator_svc.transform(test_set).drop('rawPrediction').drop('probability')
prediction = estimator_gbt.transform(prediction).drop('rawPrediction').drop('probability')
prediction = prediction.drop('features')
prediction = prediction.withColumn('features', toVec(f.col('prediction_svc'), f.col('prediction_gbt')))
prediction = estimator_lr.transform(prediction)
prediction.toPandas()

Unnamed: 0,label,prediction_svc,prediction_gbt,features,rawPrediction,probability,prediction
0,1,1.0,1.0,"[1.0, 1.0]","[-0.6748328389596705, 0.6748328389596705]","[0.33741553338800373, 0.6625844666119962]",1.0
1,1,1.0,1.0,"[1.0, 1.0]","[-0.6748328389596705, 0.6748328389596705]","[0.33741553338800373, 0.6625844666119962]",1.0
2,1,1.0,1.0,"[1.0, 1.0]","[-0.6748328389596705, 0.6748328389596705]","[0.33741553338800373, 0.6625844666119962]",1.0
3,1,0.0,1.0,"[0.0, 1.0]","[0.07165219653988708, -0.07165219653988708]","[0.5179053892124255, 0.48209461078757454]",1.0
4,1,0.0,1.0,"[0.0, 1.0]","[0.07165219653988708, -0.07165219653988708]","[0.5179053892124255, 0.48209461078757454]",1.0
...,...,...,...,...,...,...,...
46110,0,0.0,0.0,"[0.0, 0.0]","[1.2889522710181065, -1.2889522710181065]","[0.7839697973822849, 0.2160302026177151]",1.0
46111,0,0.0,0.0,"[0.0, 0.0]","[1.2889522710181065, -1.2889522710181065]","[0.7839697973822849, 0.2160302026177151]",1.0
46112,0,0.0,0.0,"[0.0, 0.0]","[1.2889522710181065, -1.2889522710181065]","[0.7839697973822849, 0.2160302026177151]",1.0
46113,0,0.0,0.0,"[0.0, 0.0]","[1.2889522710181065, -1.2889522710181065]","[0.7839697973822849, 0.2160302026177151]",1.0


In [20]:
accuracy = prediction.filter(f.col('label') == f.col('prediction_gbt')).count() / prediction.count()
print("Accuracy: ", accuracy)

tp = prediction.filter("label==1 AND prediction_gbt==1").count()
p = tp / prediction.filter('prediction_gbt==1').count()
r = tp / prediction.filter('label == 1').count()
f1 = 2 * p * r / (p + r)
print("precision", p)
print("recall", r)
print("F1 score: ", f1)

Accuracy:  0.9871842133795945
precision 0.6985539488320356
recall 0.9624521072796934
F1 score:  0.8095391556558169


In [None]:
estimator_svc.bestModel.save("model-notebooks_1")
estimator_gbt.bestModel.save("model-notebooks_2")
estimator_lr.bestModel.save("model-notebooks_comb")

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45389)
Traceback (most recent call last):
  File "/home/intx/.local/lib/python3.8/site-packages/py4j/java_gateway.py", line 1207, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/intx/.local/lib/python3.8/site-packages/py4j/java_gateway.py", line 1033, in send_command
    response = connection.send_command(command)
  File "/home/intx/.local/lib/python3.8/site-packages/py4j/java_gateway.py", line 1211, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while receiving

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/intx/.local/lib/python3.8/site-packages/py4j/java_gateway.py", line 977,

In [48]:
estimator_svc.bestModel.save("model-products_1")
estimator_gbt.bestModel.save("model-products_2")
estimator_lr.bestModel.save("model-products_comb")

In [None]:
!zip -r models.zip model-notebooks_1 model-notebooks_2 model-notebooks_comb # model-products

In [49]:
!zip -r models.zip model-products_1 model-products_2 model-products_comb

updating: model-products_1/ (stored 0%)
updating: model-products_1/metadata/ (stored 0%)
updating: model-products_1/metadata/.part-00000.crc (stored 0%)
updating: model-products_1/metadata/part-00000 (deflated 43%)
updating: model-products_1/metadata/._SUCCESS.crc (stored 0%)
updating: model-products_1/metadata/_SUCCESS (stored 0%)
updating: model-products_2/ (stored 0%)
updating: model-products_2/metadata/ (stored 0%)
updating: model-products_2/metadata/.part-00000.crc (stored 0%)
updating: model-products_2/metadata/part-00000 (deflated 46%)
updating: model-products_2/metadata/._SUCCESS.crc (stored 0%)
updating: model-products_2/metadata/_SUCCESS (stored 0%)
updating: model-products_comb/ (stored 0%)
updating: model-products_comb/metadata/ (stored 0%)
updating: model-products_comb/metadata/.part-00000.crc (stored 0%)
updating: model-products_comb/metadata/part-00000 (deflated 48%)
updating: model-products_comb/metadata/._SUCCESS.crc (stored 0%)
updating: model-product