In [7]:
from pyspark.ml.classification import LinearSVC, LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t

In [8]:
spark = (
    SparkSession.builder.config("spark.executor.memory", "4g")
    .config("spark.executor.cores", "2")
    .config("spark.cores.max", "2")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

In [9]:
label_df = spark.read.parquet('./X3.parquet')


def compute_weights(df, column="label"):
    w_zero = 1 / df.filter(f.col(column) == 0).count()
    w_one = 1 / df.filter(f.col(column) == 1).count()
    return df.withColumn("weights", f.when(f.col(column) == 0, w_zero).otherwise(w_one))

label_df = compute_weights(label_df)
label_df.toPandas()

Unnamed: 0,features,label,weights
0,"[0.9418604651162791, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1,0.000399
1,"[0.25, 1.0, 1.0, 0.5, 0.017107309486780742, 0....",1,0.000399
2,"[0.18902439024390238, 1.0, 1.0, 1.0, 0.2941176...",1,0.000399
3,"[0.323170731707317, 1.0, 1.0, 0.5, 0.008812856...",1,0.000399
4,"[0.3597560975609756, 1.0, 1.0, 1.0, 0.00922409...",1,0.000399
...,...,...,...
113227,"[0.2574850299401198, 1.0, 1.0, 0.0, 0.21428571...",0,0.000009
113228,"[0.2857142857142857, 1.0, 1.0, 0.0, 0.28205128...",0,0.000009
113229,"[0.2544378698224852, 1.0, 0.0, 0.0, 0.07999999...",0,0.000009
113230,"[0.2544378698224852, 1.0, 1.0, 0.0, 0.21428571...",0,0.000009


3. Machine Learning Magic Bitch

In [10]:
import numpy as np

ths = np.linspace(0.5,1.6,num=22)
model = LinearSVC(featuresCol='features', labelCol='label', weightCol='weights',maxIter=100)
param_grid = ParamGridBuilder().addGrid(model.threshold, ths).build()
"""
cvs_2 = CrossValidator(estimator=model,
                           estimatorParamMaps=param_grid,
                           evaluator=BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='label', weightCol='weights', metricName="areaUnderPR"),\
                           numFolds=4)
"""
cvs_2 = TrainValidationSplit(
    estimator=model,
    estimatorParamMaps=param_grid,
    evaluator=BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='label', weightCol='weights', metricName="areaUnderPR")
)

In [11]:
def stratified_split_train_test(df, frac, label, seed=42):
    """ stratfied split of a dataframe in train and test set."""
    fractions = df.select(label).distinct().withColumn("fraction", f.lit(frac)).rdd.collectAsMap()
    df_frac = df.stat.sampleBy(label, fractions, seed)
    df_remaining = df.exceptAll(df_frac)
    return df_frac, df_remaining

#training_set, test_set = stratified_split_train_test(df=label_df, frac=0.8, label="label")
#grid_search, hyperpar tuning...
ratio = label_df.filter('label==1').count()/label_df.filter('label==0').count()

zeros_df = label_df.filter('label==0').sample(False, ratio)
label_df = label_df.filter('label==1').unionAll(zeros_df)
training_set, test_set = stratified_split_train_test(df=label_df,label='label',frac=0.8,seed=42)

In [12]:
estimator_2 = cvs_2.fit(training_set)

In [13]:
prediction_2 = estimator_2.transform(test_set).select('label','prediction')

In [None]:
accuracy = prediction_2.filter(f.col('label')==f.col('prediction').cast(t.IntegerType())).count() / prediction_2.count()
print("Accuracy: ", accuracy)
p = prediction_2.filter("label==1 AND prediction==1").count() / prediction_2.filter('prediction==1').count()
r = prediction_2.filter("label==1 AND prediction==1").count() / prediction_2.filter('label == 1').count()
f1 = 2*p*r/(p+r)
print("F1-2 score: ", f1)

In [None]:
estimator_2.save("model-notebooks")
!zip -r model.zip model.model

In [None]:
estimator_2.bestModel.getThreshold()