In [2]:
from pyspark.ml.classification import LinearSVC, LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t
from pyspark.sql import Window as w
from graphframes import GraphFrame
from pyspark.ml.linalg import DenseVector, SparseVector
from pyspark.ml.linalg import VectorUDT, Vectors
from pyspark.mllib.evaluation import BinaryClassificationMetrics

In [3]:
spark = (
    SparkSession.builder.config("spark.executor.memory", "4g")
    .config("spark.executor.cores", "2")
    .config("spark.cores.max", "2")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

In [110]:
label_df = spark.read.parquet('./dataset.parquet')


def compute_weights(df, column="label"):
    w_zero = 1 / df.filter(f.col(column) == 0).count()
    w_one = 1 / df.filter(f.col(column) == 1).count()
    return df.withColumn("weights", f.when(f.col(column) == 0, w_zero).otherwise(w_one))

label_df = compute_weights(label_df)
label_df.toPandas()

Unnamed: 0,features,label,weights
0,"[0.3006134969325154, 1.0, 0.30508474576271183,...",1,0.000232
1,"[0.5343137254901961, 1.0, 0.13888888888888884,...",1,0.000232
2,"[0.1659751037344398, 1.0, 0.4626865671641791, ...",1,0.000232
3,"[0.7591240875912408, 1.0, 0.6666666666666667, ...",1,0.000232
4,"[0.4024896265560166, 1.0, 0.2457627118644068, ...",1,0.000232
...,...,...,...
117301,"[0.2063037249283668, 1.0, 0.0847457627118644, ...",0,0.000009
117302,"[0.2063037249283668, 1.0, 0.0847457627118644, ...",0,0.000009
117303,"[0.45517241379310347, 1.0, 0.5245901639344263,...",0,0.000009
117304,"[0.20343839541547282, 1.0, 0.0847457627118644,...",0,0.000009


3. Machine Learning Magic Bitch

In [75]:
import numpy as np
"""
ths = np.linspace(0.9,0.99,num=10)
model = LinearSVC(featuresCol='features', labelCol='label', weightCol='weights',maxIter=100)
param_grid = ParamGridBuilder().addGrid(model.threshold, ths).build()
cvs_2 = CrossValidator(estimator=model,
                           estimatorParamMaps=param_grid,
                           evaluator=BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='label', weightCol='weights', metricName="areaUnderPR"),\
                           numFolds=4)
"""

In [111]:
def stratified_split_train_test(df, frac, label, seed=42):
    """ stratfied split of a dataframe in train and test set."""
    fractions = df.select(label).distinct().withColumn("fraction", f.lit(frac)).rdd.collectAsMap()
    df_frac = df.stat.sampleBy(label, fractions, seed)
    df_remaining = df.exceptAll(df_frac)
    return df_frac, df_remaining

#training_set, test_set = stratified_split_train_test(df=label_df, frac=0.8, label="label")
#grid_search, hyperpar tuning...
ratio = label_df.filter('label==1').count()/label_df.filter('label==0').count()

zeros_df = label_df.filter('label==0').sample(False, ratio)
label_df = label_df.filter('label==1').unionAll(zeros_df)
training_set, test_set = stratified_split_train_test(df=label_df,label='label',frac=0.8,seed=42)
model = LinearSVC(featuresCol='features', labelCol='label', weightCol='weights',maxIter=100, threshold=0.99)

0.03808782145448753


In [112]:
estimator_2 = model.fit(training_set)

In [113]:
prediction_2 = estimator_2.transform(test_set).select('label','prediction')

In [114]:
accuracy = prediction_2.filter(f.col('label')==f.col('prediction').cast(t.IntegerType())).count() / prediction_2.count()
print("Accuracy: ", accuracy)
p = prediction_2.filter("label==1 AND prediction==1").count() / prediction_2.filter('prediction==1').count()
r = prediction_2.filter("label==1 AND prediction==1").count() / prediction_2.filter('label == 1').count()
f1 = 2*p*r/(p+r)
print("F1-2 score: ", f1)

Accuracy:  0.8206521739130435
F1-2 score:  0.8419372006386375


In [115]:
estimator_2.save("model.model")
!zip -r model.zip model.model

updating: model.model/ (stored 0%)
updating: model.model/metadata/ (stored 0%)
updating: model.model/metadata/.part-00000.crc (stored 0%)
updating: model.model/metadata/part-00000 (deflated 44%)
updating: model.model/metadata/._SUCCESS.crc (stored 0%)
updating: model.model/metadata/_SUCCESS (stored 0%)
  adding: model.model/data/ (stored 0%)
  adding: model.model/data/part-00000-fe3abaf0-96d4-4c97-b73a-42936d954220-c000.snappy.parquet (deflated 54%)
  adding: model.model/data/.part-00000-fe3abaf0-96d4-4c97-b73a-42936d954220-c000.snappy.parquet.crc (stored 0%)
  adding: model.model/data/._SUCCESS.crc (stored 0%)
  adding: model.model/data/_SUCCESS (stored 0%)


In [90]:
estimator_2.bestModel.getThreshold()

AttributeError: 'LinearSVCModel' object has no attribute 'bestModel'