# Setup

In [1]:
!pip install pyspark



In [1]:
from pyspark.ml.classification import LinearSVC, GBTClassifier, LogisticRegression, LinearSVCModel
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.ml.linalg import VectorUDT, Vectors
from pyspark.sql import types as t
import numpy as np

In [2]:
spark = (
    SparkSession.builder.config("spark.executor.memory", "4g")
    .config("spark.executor.cores", "2")
    .config("spark.cores.max", "2")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Read the dataset(s)

Make sure to upload the train datasets generated by the `make-dataset.sh` script before proceeding.
Either train using X4 or X3,X2

In [26]:
!unzip X4.zip

Archive:  X4.zip
replace X4.parquet/.part-00000-34b7377e-848c-4866-bfc8-33e858f2cd38-c000.snappy.parquet.crc? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [23]:
df = spark.read.parquet('./X4.parquet')

In [11]:
df1 = spark.read.parquet('./X2.parquet')
df2 = spark.read.parquet('./X3.parquet')
df = df1.unionAll(df2)

In [24]:
df.toPandas()

Unnamed: 0,features,label
0,"(0.20588235294117652, 0.0, 0.0, 0.180557401558...",0
1,"(0.18627450980392157, 0.0, 0.0, 0.174839178165...",0
2,"(0.20588235294117652, 0.0, 0.0, 0.162868725979...",0
3,"(0.17647058823529416, 0.0, 0.0, 0.209617185285...",0
4,"(0.17647058823529416, 0.0, 0.0, 0.501001152954...",0
...,...,...
696385,"[0.5227272727272727, 0.6666666666666666, 0.435...",0
696386,"[0.09375, 0.3333333333333333, 0.13710206615031...",0
696387,"[0.5681818181818181, 0.5, 0.5566483580341014, ...",0
696388,"[0.5641025641025641, 0.5, 0.7188372351526294, ...",0


In [25]:
def stratified_split_train_test(df, frac, label,seed=42):
    fractions = df.select(label).distinct().withColumn("fraction", f.lit(frac)).rdd.collectAsMap()
    df_frac = df.stat.sampleBy(label, fractions, seed)
    df_remaining = df.exceptAll(df_frac)
    return df_frac, df_remaining

train_set, test_set = stratified_split_train_test(df, 0.8, 'label')

def with_weights(df, column="label"):
    w_zero = 1 / df.filter(f.col(column) == 0).count()
    w_one = 0.5 / df.filter(f.col(column) == 1).count()
    return df.withColumn("weight", f.when(f.col(column) == 0, w_zero).otherwise(w_one))

train_set = with_weights(train_set)


# Train the model gbt

In [17]:
model_gbt = GBTClassifier(weightCol='weight', maxIter=200, predictionCol='prediction', lossType='logistic')
paramGrid = ParamGridBuilder()\
    .addGrid(model_gbt.maxDepth, [3,5,7])\
    .addGrid(model_gbt.maxBins, [24])\
    .build()

cvs = CrossValidator(estimator=model_gbt,
                           estimatorParamMaps=paramGrid,
                     evaluator=BinaryClassificationEvaluator(weightCol='weight', metricName='areaUnderPR'),
                     numFolds=4,
                    )

estimator_gbt = cvs.fit(train_set)

In [21]:
estimator_gbt.bestModel.save("model-notebooks")

+--------------+--------------+-----+
|prediction_svc|prediction_gbt|count|
+--------------+--------------+-----+
|           1.0|           1.0| 1146|
|           0.0|           1.0| 1777|
|           1.0|           0.0|  346|
|           0.0|           0.0|70723|
+--------------+--------------+-----+



# Prediction and evaluation

In [19]:
prediction = estimator_gbt.transform(test_set).drop('rawPrediction').drop('probability')
prediction.toPandas()

Unnamed: 0,features,label,prediction
0,"[0.9236111111111112, 1.0, 1.0, 1.0, 0.26556016...",1,1.0
1,"[0.7943262411347518, 1.0, 1.0, 1.0, 0.88016528...",1,1.0
2,"[0.7943262411347518, 1.0, 1.0, 1.0, 0.88016528...",1,1.0
3,"[0.9907407407407407, 1.0, 1.0, 1.0, 0.99521531...",1,1.0
4,"[0.962962962962963, 1.0, 1.0, 1.0, 0.980861244...",1,1.0
...,...,...,...
46110,"[0.34444444444444444, 0.0, 1.0, 1.0, 1.0, 0.54...",0,0.0
46111,"[0.18888888888888888, 0.0, 1.0, 0.0, 0.375, 0....",0,0.0
46112,"(0.1785714285714286, 0.0, 0.0, 0.0, 0.17948717...",0,0.0
46113,"[0.12195121951219512, 0.0, 1.0, 0.0, 0.3589743...",0,0.0


In [20]:
accuracy = prediction.filter(f.col('label') == f.col('prediction')).count() / prediction.count()
print("Accuracy: ", accuracy)

tp = prediction.filter("label==1 AND prediction==1").count()
p = tp / prediction.filter('prediction==1').count()
r = tp / prediction.filter('label == 1').count()
f1 = 2 * p * r / (p + r)
print("precision", p)
print("recall", r)
print("F1 score: ", f1)

Accuracy:  0.9941884419386317
precision 0.8360337005832793
recall 0.9885057471264368
F1 score:  0.9058988764044943


In [22]:
!zip -r -9 models.zip model-notebooks

updating: model-notebooks/ (stored 0%)
updating: model-notebooks/data/ (stored 0%)
updating: model-notebooks/data/._SUCCESS.crc (stored 0%)
updating: model-notebooks/data/_SUCCESS (stored 0%)
updating: model-notebooks/treesMetadata/ (stored 0%)
updating: model-notebooks/treesMetadata/._SUCCESS.crc (stored 0%)
updating: model-notebooks/treesMetadata/_SUCCESS (stored 0%)
updating: model-notebooks/metadata/ (stored 0%)
updating: model-notebooks/metadata/.part-00000.crc (stored 0%)
updating: model-notebooks/metadata/part-00000 (deflated 46%)
updating: model-notebooks/metadata/._SUCCESS.crc (stored 0%)
updating: model-notebooks/metadata/_SUCCESS (stored 0%)
  adding: model-notebooks/data/part-00003-f4325da5-1a9d-47d5-915f-be1a524b0c0d-c000.snappy.parquet (deflated 6%)
  adding: model-notebooks/data/.part-00003-f4325da5-1a9d-47d5-915f-be1a524b0c0d-c000.snappy.parquet.crc (stored 0%)
  adding: model-notebooks/data/.part-00002-f4325da5-1a9d-47d5-915f-be1a524b0c0d-c000.snappy.parq

In [10]:
!zip -r -9 models.zip model-products

updating: model-products/ (stored 0%)
updating: model-products/data/ (stored 0%)
updating: model-products/data/._SUCCESS.crc (stored 0%)
updating: model-products/data/_SUCCESS (stored 0%)
updating: model-products/metadata/ (stored 0%)
updating: model-products/metadata/part-00000 (deflated 46%)
updating: model-products/metadata/._SUCCESS.crc (stored 0%)
updating: model-products/metadata/.part-00000.crc (stored 0%)
updating: model-products/metadata/_SUCCESS (stored 0%)
updating: model-products/treesMetadata/ (stored 0%)
updating: model-products/treesMetadata/._SUCCESS.crc (stored 0%)
updating: model-products/treesMetadata/_SUCCESS (stored 0%)
  adding: model-products/data/.part-00002-b0ab9031-40b6-4139-a95a-26e4dc857b3c-c000.snappy.parquet.crc (stored 0%)
  adding: model-products/data/part-00000-b0ab9031-40b6-4139-a95a-26e4dc857b3c-c000.snappy.parquet (deflated 10%)
  adding: model-products/data/part-00001-b0ab9031-40b6-4139-a95a-26e4dc857b3c-c000.snappy.parquet (deflated 9