In [24]:
import numpy as np
from pyspark.conf import SparkConf
from pyspark.ml.classification import LinearSVC, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t

In [25]:
# Assemble the session builder one setting at a time: 4g of memory and 2 cores
# per executor, spark.cores.max set to 2, and 8g of memory for the driver.
session_builder = SparkSession.builder
session_builder = session_builder.config("spark.executor.memory", "4g")
session_builder = session_builder.config("spark.executor.cores", "2")
session_builder = session_builder.config("spark.cores.max", "2")
session_builder = session_builder.config("spark.driver.memory", "8g")
# getOrCreate() hands back an already-running session when there is one.
spark = session_builder.getOrCreate()

In [26]:
# Read the two labelled parquet extracts (X3 first, then X2), stack them and
# drop rows that appear more than once.
# NOTE(review): union() matches columns by position, not by name — assumes both
# files share the same column order; confirm.
label_1, label_2 = (
    spark.read.parquet(path) for path in ('./X3.parquet', './X2.parquet')
)
label_df = label_1.union(label_2).distinct()

def compute_weights(df, column="label"):
    """Attach inverse-class-frequency weights for a binary label column.

    Rows whose ``column`` equals 0 receive ``1 / n_zero``; every other row
    receives ``1 / n_one``, where ``n_zero`` and ``n_one`` are the numbers of
    rows whose label equals 0 and 1 respectively.

    Args:
        df: Spark DataFrame that contains ``column``.
        column: Name of the label column (expected to hold 0 / 1 values).

    Returns:
        ``df`` with a ``"weights"`` column added (or replaced if it existed).

    Raises:
        ZeroDivisionError: If there are no rows with label 0 or no rows with
            label 1. This is the same exception type the bare division would
            raise, but with a message that names the empty class.
    """
    is_zero = f.col(column) == 0
    is_one = f.col(column) == 1

    # Count both classes in one aggregation instead of launching a separate
    # Spark job (and a separate pass over the data) per class.
    counts = df.agg(
        f.sum(f.when(is_zero, 1).otherwise(0)).alias("n_zero"),
        f.sum(f.when(is_one, 1).otherwise(0)).alias("n_one"),
    ).first()

    # sum() over an empty DataFrame yields NULL (None), hence the `or 0`.
    n_zero = counts["n_zero"] or 0
    n_one = counts["n_one"] or 0
    if n_zero == 0 or n_one == 0:
        raise ZeroDivisionError(
            "cannot compute class weights for column %r: "
            "found %d rows with value 0 and %d rows with value 1"
            % (column, n_zero, n_one)
        )

    w_zero = 1 / n_zero
    w_one = 1 / n_one
    # Anything that is not label 0 (including NULL labels) falls through to the
    # weight of class 1.
    return df.withColumn("weights", f.when(is_zero, w_zero).otherwise(w_one))

# Add the per-class "weights" column; the classifier and evaluator further down
# are configured with weightCol='weights'.
label_df = compute_weights(label_df)
# NOTE(review): toPandas() collects every row to the driver just to render a
# preview; a limited preview would be cheaper — confirm the full pull is wanted.
label_df.toPandas()

Unnamed: 0,features,label,weights
0,"[0.23529411764705888, 0.0, 1.0, 0.0, 0.2352941...",0,0.000011
1,"(0.18604651162790697, 0.0, 0.0, 0.0, 0.25, 0.6...",0,0.000011
2,"[0.19999999999999996, 0.0, 1.0, 0.0, 0.2352941...",0,0.000011
3,"[0.21176470588235297, 0.0, 1.0, 0.5, 0.1739130...",0,0.000011
4,"(0.21764705882352942, 0.0, 1.0, 0.0, 0.0, 0.0,...",0,0.000011
...,...,...,...
95691,"[0.7291666666666667, 1.0, 1.0, 0.5, 0.77593360...",0,0.000011
95692,"(0.4305555555555556, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,0.000011
95693,"(0.5811965811965811, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,0.000011
95694,"[0.20710059171597628, 0.0, 1.0, 0.5, 0.1490384...",0,0.000011


3. Model Training and Hyperparameter Tuning

In [29]:
# Candidate decision thresholds for the SVM: 22 evenly spaced values in [0.5, 1.6].
ths = np.linspace(0.5, 1.6, num=22)

# Weighted linear SVM; the "weights" column carries the per-class weights.
model = LinearSVC(featuresCol='features', labelCol='label', weightCol='weights', maxIter=100)

# One parameter map per candidate threshold.
param_grid = ParamGridBuilder().addGrid(model.threshold, ths).build()

# 4-fold cross-validation over the threshold grid.
# The evaluator scores the hard 0/1 `prediction` column rather than the raw
# margin.
# NOTE(review): presumably this is so the metric reacts to the threshold being
# tuned (a ranking metric on the raw margin would not) — confirm this is intended.
# An explicit seed pins the fold assignment so repeated runs select the same
# model; without it the folds differ from one kernel session to the next.
cvs_2 = CrossValidator(
    estimator=model,
    estimatorParamMaps=param_grid,
    evaluator=BinaryClassificationEvaluator(
        rawPredictionCol='prediction',
        labelCol='label',
        weightCol='weights',
        metricName="areaUnderPR",
    ),
    numFolds=4,
    seed=42,
)

In [30]:
def stratified_split_train_test(df, frac, label, seed=42):
    """Stratified split of a DataFrame into a sampled part and the remainder.

    Every distinct value of ``label`` is sampled with the same fraction
    ``frac``; the rows that were not drawn make up the second DataFrame.

    Args:
        df: Spark DataFrame to split.
        frac: Sampling fraction applied to each distinct label value.
        label: Name of the column that defines the strata.
        seed: Seed handed to the sampler.

    Returns:
        Tuple ``(df_frac, df_remaining)``: the sampled rows and ``df`` minus
        those rows.
    """
    # Build {label value: frac} from the distinct values present in the data.
    strata = df.select(label).distinct()
    fractions = strata.withColumn("fraction", f.lit(frac)).rdd.collectAsMap()

    # NOTE(review): the remainder is derived by re-evaluating the sample; this
    # assumes `df` yields the same rows in the same order on every evaluation,
    # otherwise the two parts could overlap — confirm or persist `df` first.
    sampled = df.stat.sampleBy(label, fractions, seed)
    return sampled, df.exceptAll(sampled)

# Sample each label value with fraction 0.8 for training; the rows left over
# become the test set. The training part feeds the grid search /
# hyper-parameter tuning that follows.
training_set, test_set = stratified_split_train_test(
    df=label_df,
    frac=0.8,
    label='label',
    seed=42,
)

In [None]:
# Run the cross-validated search over the threshold grid on the training split.
# This is the expensive step: every candidate threshold is fitted on every fold.
estimator_2 = cvs_2.fit(training_set)

In [None]:
# Score the held-out rows with the fitted cross-validation model and keep only
# the two columns the metrics below need.
prediction_2 = estimator_2.transform(test_set).select('label','prediction')

In [None]:
# Gather every count the metrics need in ONE aggregation. Each separate
# .count() would re-run the model transform over the whole test set.
pred_as_int = f.col('prediction').cast(t.IntegerType())
metric_counts = prediction_2.agg(
    f.count(f.lit(1)).alias("total"),
    f.sum(f.when(f.col('label') == pred_as_int, 1).otherwise(0)).alias("correct"),
    f.sum(f.when(f.expr("label==1 AND prediction==1"), 1).otherwise(0)).alias("true_pos"),
    f.sum(f.when(f.expr('prediction==1'), 1).otherwise(0)).alias("predicted_pos"),
    f.sum(f.when(f.expr('label == 1'), 1).otherwise(0)).alias("actual_pos"),
).first()

# sum() over an empty DataFrame yields NULL (None), hence the `or 0`.
n_total = metric_counts["total"]
n_correct = metric_counts["correct"] or 0
n_true_pos = metric_counts["true_pos"] or 0
n_predicted_pos = metric_counts["predicted_pos"] or 0
n_actual_pos = metric_counts["actual_pos"] or 0

# Accuracy is undefined on an empty test set; report NaN instead of crashing.
accuracy = n_correct / n_total if n_total else float("nan")
print("Accuracy: ", accuracy)

# Precision / recall fall back to 0.0 when their denominator is empty (no
# predicted positives / no actual positives) rather than dividing by zero.
p = n_true_pos / n_predicted_pos if n_predicted_pos else 0.0
r = n_true_pos / n_actual_pos if n_actual_pos else 0.0
# F1 is the harmonic mean of precision and recall; 0.0 when both are 0.
f1 = 2*p*r/(p+r) if (p + r) else 0.0
print("F1-2 score: ", f1)

In [19]:
# NOTE(review): the save call is commented out, so the archive step below relies
# on a "model-notebooks" directory that already exists from an earlier run; on a
# fresh checkout there would be nothing to zip. This cell's execution count is
# also out of order with the cells above — confirm the intended run order.
#estimator_2.save("model-notebooks")
# Bundle the saved model directory into models.zip (recursively).
!zip -r models.zip model-notebooks

updating: model-notebooks/ (stored 0%)
updating: model-notebooks/metadata/ (stored 0%)
updating: model-notebooks/metadata/part-00000 (deflated 79%)
updating: model-notebooks/metadata/.part-00000.crc (stored 0%)
updating: model-notebooks/metadata/._SUCCESS.crc (stored 0%)
updating: model-notebooks/metadata/_SUCCESS (stored 0%)
  adding: model-notebooks/bestModel/ (stored 0%)
  adding: model-notebooks/bestModel/data/ (stored 0%)
  adding: model-notebooks/bestModel/data/.part-00000-111c5853-e320-40e3-875a-a4edf9407f22-c000.snappy.parquet.crc (stored 0%)
  adding: model-notebooks/bestModel/data/._SUCCESS.crc (stored 0%)
  adding: model-notebooks/bestModel/data/part-00000-111c5853-e320-40e3-875a-a4edf9407f22-c000.snappy.parquet (deflated 43%)
  adding: model-notebooks/bestModel/data/_SUCCESS (stored 0%)
  adding: model-notebooks/bestModel/metadata/ (stored 0%)
  adding: model-notebooks/bestModel/metadata/.part-00000.crc (stored 0%)
  adding: model-notebooks/bestModel/metadata/

In [18]:
# Decision threshold of the model selected by the cross-validated search.
estimator_2.bestModel.getThreshold()



1.5476190476190477