In [2]:
from pyspark.ml.classification import LinearSVC, LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MultilabelClassificationEvaluator
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t
from pyspark.sql import Window as w
from graphframes import GraphFrame
from pyspark.ml.linalg import DenseVector, SparseVector
from pyspark.ml.linalg import VectorUDT, Vectors
from pyspark.mllib.evaluation import BinaryClassificationMetrics

In [3]:
# Resource settings for the Spark session, applied in the order listed
# before the session is created (or an already-running one is reused).
spark_settings = [
    ("spark.executor.memory", "4g"),
    ("spark.executor.cores", "2"),
    ("spark.cores.max", "2"),
    ("spark.driver.memory", "8g"),
]

session_builder = SparkSession.builder
for setting_name, setting_value in spark_settings:
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

In [25]:
# Load the labelled dataset from the parquet file next to the notebook.
label_df = spark.read.format("parquet").load('./dataset.parquet')


def compute_weights(df, column="label"):
    """Attach an inverse-class-frequency ``weights`` column for a binary label.

    Rows whose ``column`` equals 0 receive ``1 / n_zero``; every other row
    (label 1, but also any other or null value) receives ``1 / n_one``, where
    ``n_zero`` and ``n_one`` are the numbers of rows in ``df`` whose label is
    0 and 1 respectively.

    Args:
        df: Spark DataFrame that contains ``column``.
        column: Name of the label column.

    Returns:
        ``df`` with an additional ``weights`` column.

    Raises:
        ZeroDivisionError: if ``df`` has no row with label 0 or no row with
            label 1, since the inverse frequency is undefined in that case.
    """
    label = f.col(column)
    # when() without otherwise() yields null for non-matching rows and count()
    # skips nulls, so both class sizes are obtained in a single Spark job
    # instead of two separate filter().count() passes.
    class_counts = df.agg(
        f.count(f.when(label == 0, 1)).alias("n_zero"),
        f.count(f.when(label == 1, 1)).alias("n_one"),
    ).first()
    n_zero, n_one = class_counts["n_zero"], class_counts["n_one"]
    if n_zero == 0 or n_one == 0:
        raise ZeroDivisionError(
            f"cannot compute class weights: {n_zero} rows with {column} == 0 "
            f"and {n_one} rows with {column} == 1"
        )
    w_zero = 1 / n_zero
    w_one = 1 / n_one
    return df.withColumn("weights", f.when(label == 0, w_zero).otherwise(w_one))

# Add the per-row class weight (derived from the "label" column) and preview
# the result as a pandas frame.
label_df = compute_weights(label_df, column="label")
label_df.toPandas()

Unnamed: 0,features,label,weights
0,"[0.3006134969325154, 1.0, 0.30508474576271183,...",1,0.000232
1,"[0.5343137254901961, 1.0, 0.13888888888888884,...",1,0.000232
2,"[0.1659751037344398, 1.0, 0.4626865671641791, ...",1,0.000232
3,"[0.7591240875912408, 1.0, 0.6666666666666667, ...",1,0.000232
4,"[0.4024896265560166, 1.0, 0.2457627118644068, ...",1,0.000232
...,...,...,...
117301,"[0.2063037249283668, 1.0, 0.0847457627118644, ...",0,0.000009
117302,"[0.2063037249283668, 1.0, 0.0847457627118644, ...",0,0.000009
117303,"[0.45517241379310347, 1.0, 0.5245901639344263,...",0,0.000009
117304,"[0.20343839541547282, 1.0, 0.0847457627118644,...",0,0.000009


3. Machine Learning Magic

In [26]:
# Linear SVM trained with the per-row class weights from the "weights" column.
model = LinearSVC(featuresCol='features', labelCol='label', weightCol='weights', maxIter=1000)

# The only tuned hyper-parameter is the decision threshold that LinearSVC
# applies to its raw prediction.
param_grid = ParamGridBuilder().addGrid(model.threshold, [0.1, 0.25, 0.5, 0.75]).build()

# Fixed seed so the fold assignment (and therefore the selected threshold)
# is reproducible across runs.
CV_SEED = 42


def _build_cross_validator(metric_name):
    """Return a 4-fold CrossValidator over ``param_grid`` scored by ``metric_name``.

    The evaluator deliberately reads the hard 0/1 ``prediction`` column rather
    than ``rawPrediction``: the threshold only changes the hard prediction, so
    a metric computed on raw scores would be identical for every grid point.
    """
    evaluator = BinaryClassificationEvaluator(
        rawPredictionCol='prediction',
        labelCol='label',
        weightCol='weights',
        metricName=metric_name,
    )
    return CrossValidator(
        estimator=model,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=4,
        seed=CV_SEED,
    )


# cvs_1 selects by area under the ROC curve (the evaluator's default metric),
# cvs_2 by area under the precision-recall curve.
cvs_1 = _build_cross_validator("areaUnderROC")
cvs_2 = _build_cross_validator("areaUnderPR")

In [27]:
def stratified_split_train_test(df, frac, label, seed=42):
    """Split ``df`` into a stratified sample and the rows left over.

    Every distinct value of the ``label`` column is sampled with the same
    fraction ``frac``.

    Args:
        df: Spark DataFrame to split.
        frac: Sampling fraction applied to each distinct label value.
        label: Name of the column to stratify on.
        seed: Seed passed to the sampler.

    Returns:
        ``(sampled, leftover)``: the sampled rows, and ``df`` minus those
        rows (duplicates are respected, as ``exceptAll`` is used).
    """
    distinct_labels = df.select(label).distinct()
    # Map each label value to the (identical) sampling fraction.
    per_label_fraction = (
        distinct_labels
        .withColumn("fraction", f.lit(frac))
        .rdd
        .collectAsMap()
    )
    sampled = df.stat.sampleBy(label, per_label_fraction, seed)
    leftover = df.exceptAll(sampled)
    return sampled, leftover

# Stratified split: the sampled 80% becomes the training set, the remaining
# rows the test set.
training_set, test_set = stratified_split_train_test(df=label_df, frac=0.8, label="label")
# Both splits feed several later Spark actions; cache them so the
# sampleBy/exceptAll lineage is computed once instead of on every action.
training_set = training_set.cache()
test_set = test_set.cache()
# grid search / hyper-parameter tuning happens on training_set
test_set.toPandas()

Unnamed: 0,features,label,weights
0,"[0.75177304964539, 1.0, 0.9030470914127424, 1....",1,0.000232
1,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0000001154722...",1,0.000232
2,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0000001154722...",1,0.000232
3,"[0.9819819819819819, 1.0, 0.9938080495356038, ...",1,0.000232
4,"[0.9819819819819819, 1.0, 0.9938080495356038, ...",1,0.000232
...,...,...,...
23452,"[0.2603305785123967, 0.0, 0.10089020771513357,...",0,0.000009
23453,"[0.2603305785123967, 0.0, 0.10089020771513357,...",0,0.000009
23454,"[0.2603305785123967, 0.0, 0.10089020771513357,...",0,0.000009
23455,"[0.1709401709401709, 0.0, 0.1454005934718101, ...",0,0.000009


In [28]:
# Preview the training split; toPandas() collects the whole split to the driver.
training_set.toPandas()

Unnamed: 0,features,label,weights
0,"[0.3006134969325154, 1.0, 0.30508474576271183,...",1,0.000232
1,"[0.5343137254901961, 1.0, 0.13888888888888884,...",1,0.000232
2,"[0.7591240875912408, 1.0, 0.6666666666666667, ...",1,0.000232
3,"[0.4024896265560166, 1.0, 0.2457627118644068, ...",1,0.000232
4,"[0.4311594202898551, 1.0, 1.0, 0.0, 0.22727272...",1,0.000232
...,...,...,...
93844,"[0.22499999999999998, 1.0, 0.0847457627118644,...",0,0.000009
93845,"[0.2063037249283668, 1.0, 0.0847457627118644, ...",0,0.000009
93846,"[0.2063037249283668, 1.0, 0.0847457627118644, ...",0,0.000009
93847,"[0.45517241379310347, 1.0, 0.5245901639344263,...",0,0.000009


In [None]:
# Run both cross-validated grid searches on the training split:
# estimator_1 comes from cvs_1, estimator_2 from cvs_2.
estimator_1 = cvs_1.fit(dataset=training_set)
estimator_2 = cvs_2.fit(dataset=training_set)

In [29]:
# Score the held-out test split with both fitted models, keeping only the
# columns needed for evaluation.
prediction_1 = estimator_1.transform(test_set).select('label', 'prediction')
prediction_2 = estimator_2.transform(test_set).select('label', 'prediction')

# A cell renders only its last expression, so the first table would be
# computed and then silently dropped; display() shows both distributions.
display(prediction_1.groupby("prediction").count().toPandas())
display(prediction_2.groupby("prediction").count().toPandas())

Unnamed: 0,prediction,count
0,0.0,22047
1,1.0,1410


In [30]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
    return numerator / denominator if denominator else 0.0


def classification_scores(predictions):
    """Compute accuracy, precision, recall and F1 for the positive class.

    Args:
        predictions: Spark DataFrame with a ``label`` column and a
            ``prediction`` column.

    Returns:
        ``(accuracy, precision, recall, f1)`` as floats. A ratio whose
        denominator is zero (e.g. precision when nothing was predicted
        positive) is reported as 0.0 instead of raising ZeroDivisionError.
    """
    label = f.col('label')
    prediction = f.col('prediction')
    # when() without otherwise() is null for non-matching rows and count()
    # skips nulls, so all confusion counts come from one Spark job.
    counts = predictions.agg(
        f.count(f.lit(1)).alias("total"),
        f.count(f.when(label == prediction.cast(t.IntegerType()), 1)).alias("correct"),
        f.count(f.when((label == 1) & (prediction == 1), 1)).alias("true_pos"),
        f.count(f.when(prediction == 1, 1)).alias("pred_pos"),
        f.count(f.when(label == 1, 1)).alias("actual_pos"),
    ).first()

    accuracy = _safe_ratio(counts["correct"], counts["total"])
    precision = _safe_ratio(counts["true_pos"], counts["pred_pos"])
    recall = _safe_ratio(counts["true_pos"], counts["actual_pos"])
    f1 = _safe_ratio(2 * precision * recall, precision + recall)
    return accuracy, precision, recall, f1


# Model selected by area under ROC.
accuracy, p, r, f1 = classification_scores(prediction_1)
print("Accuracy: ", accuracy)
print("F1-1 score: ", f1)

# Model selected by area under PR.
accuracy, p, r, f1 = classification_scores(prediction_2)
print("Accuracy: ", accuracy)
print("F1-2 score: ", f1)

Accuracy:  0.9574114336871723
F1-1 score:  0.5986339895540378
Accuracy:  0.9667476659419363
F1-2 score:  0.648331830477908


In [None]:
estimator_2.save("model.model")
!zip -r model.zip model.model
