In [78]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.sql import SparkSession

In [79]:
# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("Crypto Data")
    .getOrCreate()
)

# Load the categorized CSV: the first row is the header and Spark infers the
# column types. The result is marked for caching because later cells reuse it.
df = spark.read.csv(
    "/home/jovyan/data/data_rework_categorized.csv",
    header=True,
    inferSchema=True,
    encoding='utf-8',
).cache()

In [80]:
# Rename the target column to "label", the column name the classifier and the
# evaluator are configured to read.
# The DataFrame is already marked for caching when it is loaded, so the renamed
# projection is not cached again: a second .cache() would register another
# cache entry for the same rows without saving any recomputation.
df = df.withColumnRenamed("category", "label")

In [81]:
# Work out which columns feed the model: remove the columns that are not used
# as inputs, then keep every remaining column except the target ("label").
# tmp_df is only inspected for its column names here, so it is not cached --
# caching it would just register an extra cache entry that is never read.
tmp_df = df.drop('_c0', 'time', 'symbol', 'delta-round')
feature_cols = [name for name in tmp_df.columns if name != "label"]

In [82]:
# Stage objects for the ML pipeline.

# Pack the individual feature columns into a single vector column.
vec_assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
)

# Rescale the assembled features into the [0.0, 1.0] range.
scaler = MinMaxScaler(
    inputCol='features',
    outputCol='scaled-features',
    min=0.0,
    max=1.0,
)

# Logistic regression trained on the scaled features against "label".
log_reg_model = LogisticRegression(
    featuresCol='scaled-features',
    labelCol='label',
    regParam=0.001,
    maxIter=1000,
)

In [83]:
# Hold out roughly 20% of the rows for evaluation and train on the rest.
# A fixed seed keeps the split -- and therefore every metric computed from it --
# stable across kernel restarts instead of changing on each run.
SPLIT_SEED = 42
train_df, test_df = df.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

In [84]:
# Chain the stages in execution order: assemble -> scale -> classify.
pipeline_stages = [vec_assembler, scaler, log_reg_model]
pipeline = Pipeline(stages=pipeline_stages)

In [85]:
# Fit all pipeline stages on the training split only.
fitted_model = pipeline.fit(dataset=train_df)

In [86]:
# Run the fitted pipeline over the held-out split to obtain its predictions.
prediction = fitted_model.transform(dataset=test_df)

In [87]:
# Binary-classification evaluator. labelCol and metricName are spelled out with
# the library's default values so the metric being reported is explicit.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)

In [88]:
# Score the test-set predictions with the evaluator's configured metric.
AUC = evaluator.evaluate(dataset=prediction)

In [90]:
# Show the evaluation score held in AUC as this cell's output.
AUC

0.6885879872188705

In [98]:
# Count the scored rows that the model assigned to class 1.
is_predicted_positive = prediction['prediction'] == 1
(
    prediction
    .select('rawPrediction', 'prediction', 'label')
    .where(is_predicted_positive)
    .count()
)

522

In [104]:
# Count the scored rows whose true label equals 1.
is_actual_positive = prediction['label'] == 1
prediction.where(is_actual_positive).count()

6472