# Setup

In [None]:
!pip install pyspark

In [None]:
from pyspark.ml.classification import LinearSVC, LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t
import numpy as np

In [None]:
# Create (or reuse, via getOrCreate) the Spark session used by the cells below.
# Requested resources: 4g of memory and 2 cores per executor, 8g for the
# driver. `spark.cores.max` limits the total cores the application may use;
# per the Spark configuration docs it applies to standalone/Mesos deployments
# — NOTE(review): confirm it has any effect in the environment this runs in.
spark = (
    SparkSession.builder.config("spark.executor.memory", "4g")
    .config("spark.executor.cores", "2")
    .config("spark.cores.max", "2")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Read the dataset(s)

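Make sure to upload the training datasets generated by the `make-dataset.sh` script before proceeding. Note that both loading cells below assign `df`, so whichever cell is run last determines the dataset used for training.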

In [None]:
# Extract the uploaded X4 archive (expected to contain X4.parquet) and load it
# as the working DataFrame `df`.
!unzip X4.zip
df = spark.read.parquet('./X4.parquet')

In [None]:
# Extract the uploaded X2 and X3 archives and load each parquet file.
!unzip X2.zip
!unzip X3.zip
df1 = spark.read.parquet('./X2.parquet')
df2 = spark.read.parquet('./X3.parquet')
# Merge both datasets into `df` (rebinding it if an earlier cell already set
# it) and drop duplicate rows. union() matches columns by position, not by
# name — assumes both files share the same column order; TODO confirm.
df = df1.union(df2).distinct()

In [None]:
# Randomly split the data into roughly 80% training / 20% test rows. The seed
# is fixed so that re-running the notebook yields the same split (and thus
# comparable metrics); it matches the default seed of the stratified split
# helper used in the hyperparameter section.
train_set, test_set = df.randomSplit([0.8, 0.2], seed=42)

In [None]:
def with_weights(df, column="label"):
    """Return ``df`` with an extra ``weight`` column that balances two classes.

    Rows whose ``column`` equals 0 receive ``1 / (number of rows equal to 0)``;
    every other row receives ``1 / (number of rows equal to 1)``. Each class
    therefore contributes the same total weight.

    Args:
        df: Spark DataFrame containing ``column``.
        column: Name of the binary (0/1) label column.

    Returns:
        A new DataFrame with the additional ``weight`` column.

    Raises:
        ZeroDivisionError: If ``df`` has no rows for one of the two classes,
            so its inverse-frequency weight is undefined.
    """
    # Count both classes in a single aggregation instead of two separate
    # filter-and-count jobs. when() without otherwise() yields NULL for
    # non-matching rows, and count() ignores NULLs.
    counts = df.agg(
        f.count(f.when(f.col(column) == 0, True)).alias("n_zero"),
        f.count(f.when(f.col(column) == 1, True)).alias("n_one"),
    ).first()
    n_zero, n_one = counts["n_zero"], counts["n_one"]

    if n_zero == 0 or n_one == 0:
        # Same exception type as the bare division would raise, but with a
        # message that says which data problem caused it.
        raise ZeroDivisionError(
            f"cannot compute class weights: column {column!r} has "
            f"{n_zero} rows equal to 0 and {n_one} rows equal to 1"
        )

    w_zero = 1 / n_zero
    w_one = 1 / n_one
    return df.withColumn("weight", f.when(f.col(column) == 0, w_zero).otherwise(w_one))

train_set = with_weights(train_set)

# Train the model

In [None]:
# Configure a linear SVM that reads the 'features' and 'label' columns, weighs
# each row by the 'weight' column, and runs at most 100 iterations.
# threshold=1.1: per the LinearSVC docs the threshold is applied to the raw
# linear prediction (not a probability), so values above 1 are valid.
# Naming note: `model` is the unfitted estimator and `estimator` is the fitted
# model; the names are kept because later cells refer to them.
model = LinearSVC(threshold=1.1, featuresCol='features', labelCol='label', weightCol='weight', maxIter=100)
estimator = model.fit(train_set)

# Prediction and evaluation

In [None]:
# Score the held-out rows with the fitted model.
prediction = estimator.transform(test_set)

# Accuracy: share of rows whose predicted class equals the label.
# Reported as 0.0 for an empty test set instead of dividing by zero.
n_rows = prediction.count()
n_correct = prediction.filter(f.col('label') == f.col('prediction')).count()
accuracy = n_correct / n_rows if n_rows else 0.0
print("Accuracy: ", accuracy)

# Precision / recall / F1 for the positive class (label 1). A high decision
# threshold can leave no predicted positives at all, so every ratio is guarded
# and falls back to 0.0 when its denominator is zero.
tp = prediction.filter("label==1 AND prediction==1").count()
n_predicted_pos = prediction.filter('prediction==1').count()
n_actual_pos = prediction.filter('label == 1').count()
p = tp / n_predicted_pos if n_predicted_pos else 0.0
r = tp / n_actual_pos if n_actual_pos else 0.0
f1 = 2 * p * r / (p + r) if (p + r) else 0.0
print("F1 score: ", f1)

In [None]:
# Persist the fitted model to disk. Only one target directory is active at a
# time; the commented-out line is the alternative target.
# NOTE(review): the zip cell that follows bundles both "model-notebooks" and
# "model-products" — presumably this notebook is run once per dataset with the
# corresponding line enabled; confirm.
#estimator.save("model-notebooks")
estimator.save("model-products")

In [None]:
# Bundle the two saved model directories into a single models.zip archive.
!zip -r models.zip model-notebooks model-products

# Extra: hyperparameter optimization

In [None]:
# Candidate decision thresholds: 22 evenly spaced values from 0.5 to 1.6.
ths = np.linspace(0.5, 1.6, num=22)
model = LinearSVC(featuresCol='features', labelCol='label', weightCol='weight', maxIter=100)
param_grid = ParamGridBuilder().addGrid(model.threshold, ths).build()

# The evaluator scores the thresholded 'prediction' column rather than the raw
# margin — presumably so that the metric actually depends on the threshold
# being tuned. Its weight column must be the same 'weight' column the
# classifier uses; the previous name 'weights' does not match it.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='prediction',
    labelCol='label',
    weightCol='weight',
    metricName="areaUnderPR",
)

# 4-fold cross-validation over the threshold grid.
cvs_2 = CrossValidator(
    estimator=model,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=4,
)

In [None]:
def stratified_split_train_test(df, frac, label, seed=42):
    """Split ``df`` into a stratified sample and the rows left over.

    Every distinct value found in the ``label`` column is sampled with the
    same fraction ``frac``.

    Args:
        df: Spark DataFrame to split.
        frac: Sampling fraction applied to each label value.
        label: Name of the column to stratify on.
        seed: Seed passed to the sampler.

    Returns:
        A ``(sample, remainder)`` tuple, where ``remainder`` is ``df`` minus
        the sampled rows (duplicates preserved, via ``exceptAll``).
    """
    # Map each distinct label value to the requested sampling fraction.
    distinct_labels = df.select(label).distinct().collect()
    per_class_fraction = {row[0]: frac for row in distinct_labels}

    sampled = df.stat.sampleBy(label, per_class_fraction, seed)
    leftover = df.exceptAll(sampled)
    return sampled, leftover

In [None]:

# Stratified 80/20 split (by 'label') feeding the cross-validated grid search.
# This rebinds `test_set`, replacing the earlier random split's test set.
# NOTE(review): `label_df` is not defined in any cell visible here — confirm
# where it comes from, and that it carries the 'weight' column the classifier
# and evaluator expect.
training_set, test_set = stratified_split_train_test(df=label_df,label='label',frac=0.8,seed=42)

In [None]:
# Run the cross-validated threshold search on the stratified training split.
estimator_2 = cvs_2.fit(training_set)

In [None]:
# Score the held-out split with the fitted cross-validator model, keeping only
# the two columns needed to compare labels against predictions.
prediction_2 = estimator_2.transform(test_set).select('label','prediction')

In [None]:
# Display the decision threshold of the best model selected by the search.
estimator_2.bestModel.getThreshold()