In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) the Spark session used throughout this notebook.
spark = SparkSession.builder.appName("Crypto Data").getOrCreate()

# Both CSV files are read with identical settings: first row is the header,
# column types are inferred, and the text is decoded as UTF-8.
csv_options = dict(header=True, inferSchema=True, encoding='utf-8')

# df feeds the train/test split; df_predict is the set scored at the end.
# Both are cached because they are reused by later cells.
df = spark.read.csv("/home/jovyan/data/data_train_test.csv", **csv_options).cache()
df_predict = spark.read.csv("/home/jovyan/data/data_predict.csv", **csv_options).cache()

In [3]:
# Build the list of model input columns.
# '_c0', 'time', 'symbol' and 'price+24h-avg' are dropped first; "label" is the
# classifier's label column and is filtered out of the feature list below.
# NOTE(review): '_c0' looks like a saved row index and 'price+24h-avg' like the
# future value the label is derived from — confirm against the data set.
# tmp_df is only inspected for its column names, so it is deliberately not
# cached: caching it would add a cache entry with no benefit.
tmp_df = df.drop('_c0', 'time', 'symbol', 'price+24h-avg')
feature_cols = [name for name in tmp_df.columns if name != "label"]

In [21]:
# Stages used by the pipeline.
# 1) Pack the individual feature columns into a single vector column.
vec_assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
)
# 2) Rescale every feature to the [0.0, 1.0] range.
scaler = MinMaxScaler(
    inputCol='features',
    outputCol='scaled-features',
    min=0.0,
    max=1.0,
)
# 3) Logistic regression on the scaled vector, trained against 'label'.
log_reg_model = LogisticRegression(
    labelCol='label',
    featuresCol='scaled-features',
)

In [22]:
# 70/30 train/test split. The seed is fixed so the same rows land in each split
# on every run; without it the split, and every metric computed from it,
# changes each time the cell is executed.
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

In [23]:
# Chain the stages: assemble the feature vector, scale it, then classify.
pipeline = Pipeline(
    stages=[
        vec_assembler,
        scaler,
        log_reg_model,
    ]
)

In [27]:
%%time
paramGrid = ParamGridBuilder() \
    .addGrid(log_reg_model.regParam, [0.1, 0.01, 0.001]) \
    .addGrid(log_reg_model.maxIter, [1000, 10000]) \
    .addGrid(log_reg_model.elasticNetParam, [0,1]) \
    .build()

# paramGrid = ParamGridBuilder() \
#     .addGrid(log_reg_model.regParam, [0.1, 0.01, 0.001]) \
#     .addGrid(log_reg_model.maxIter, [1000, 10000, 100000]) \
#     .addGrid(log_reg_model.elasticNetParam, [0,1]) \
#     .build()

In [28]:
# 3-fold cross-validation over the whole pipeline, so the scaler and the
# classifier are refit for every fold and every grid point.
# The evaluator is left at its defaults. The seed fixes how rows are assigned
# to folds; without it the selected parameters can differ from run to run.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3,  # use 3+ folds in practice
                          seed=42)

# Run cross-validation on the training split and keep the best parameter set.
cvModel = crossval.fit(train_df)

In [29]:
# Score the held-out test split with the cross-validated model.
prediction = cvModel.transform(test_df)

In [30]:
# Evaluator for the held-out predictions. The label column and metric are the
# library defaults, spelled out so the reported number is unambiguous.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)

In [31]:
# Evaluate the test-split predictions; the evaluator's metric is left at its
# default, which PySpark documents as area under the ROC curve.
AUC = evaluator.evaluate(prediction)
# Bare expression so the notebook displays the value.
AUC

0.5791036627952272

In [32]:
# Apply the cross-validated model to the separate prediction data set.
real_pred = cvModel.transform(df_predict)

In [33]:
# Show the identifying columns next to the model's probability vector and the
# predicted class.
real_pred.select(
    'symbol',
    'time',
    'price-0h-avg',
    'probability',
    'prediction',
).show()

+------+-------------------+------------+--------------------+----------+
|symbol|               time|price-0h-avg|         probability|prediction|
+------+-------------------+------------+--------------------+----------+
|   EOS|2018-06-12 08:00:00|       11.08|[0.06462750925874...|       1.0|
|   LTC|2018-06-12 08:00:00|      107.22|[0.08884317151617...|       1.0|
|   ETH|2018-06-12 08:00:00|      529.94|[0.06284924375329...|       1.0|
|   BCH|2018-06-12 08:00:00|      945.82|[0.06985228733208...|       1.0|
|   VEN|2018-06-12 08:00:00|         3.3|[0.09359866535352...|       1.0|
|  DRGN|2018-06-12 08:00:00|      0.4429|[0.09374802026645...|       1.0|
|  NANO|2018-06-12 08:00:00|        3.14|[0.09368749162919...|       1.0|
|   XLM|2018-06-12 08:00:00|      0.2495|[0.09294354742172...|       1.0|
|   CVC|2018-06-12 08:00:00|      0.2393|[0.09346231433497...|       1.0|
|   XRP|2018-06-12 08:00:00|      0.5897|[0.08973235991389...|       1.0|
|   ADA|2018-06-12 08:00:00|      0.17

In [34]:
# Release everything this Spark session has cached (the DataFrames cached in
# earlier cells) now that the analysis is finished.
spark.catalog.clearCache()