In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


In [2]:
spark = SparkSession.builder.appName("Crypto Data").getOrCreate()
df = spark.read.csv("/home/jovyan/data/data_train_test.csv", inferSchema=True, encoding='utf-8', header=True).cache()
df_predict = spark.read.csv("/home/jovyan/data/data_predict.csv", inferSchema=True, encoding='utf-8', header=True).cache()

In [3]:
# df.printSchema()

In [4]:
# Get temp columns results
tmp_df = df.drop('_c0', 'time', 'symbol', 'price+24h-max', 'price+24h-min', 'btc-price+24h-min').cache()
feature_cols = [col for col in tmp_df.columns if col != "label"]
tmp_df.unpersist()

DataFrame[price-0h: double, price-1h: double, price-2h: double, price-4h: double, price-5h: double, price-6h: double, price-8h: double, price-10h: double, price-12h: double, price-24h: double, price-48h: double, price-96h: double, price-192h: double, price-384h: double, price-768h: double, vol-0h: double, vol-1h: double, vol-2h: double, vol-4h: double, vol-6h: double, vol-8h: double, vol-10h: double, vol-16h: double, vol-24h: double, vol-48h: double, vol-96h: double, vol-192h: double, vol-384h: double, vol-768h: double, btc-price-0h: double, btc-price-1h: double, btc-price-2h: double, btc-price-4h: double, btc-price-5h: double, btc-price-6h: double, btc-price-8h: double, btc-price-10h: double, btc-price-12h: double, btc-price-24h: double, btc-price-48h: double, btc-price-96h: double, btc-price-192h: double, btc-price-384h: double, btc-price-768h: double, btc-vol-0h: double, btc-vol-1h: double, btc-vol-2h: double, btc-vol-4h: double, btc-vol-6h: double, btc-vol-8h: double, btc-vol-10h: 

In [5]:
train_df, test_df = df.randomSplit([0.90,0.1])

In [6]:
# Class Models for the Pipeline
vec_assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
scaler = MinMaxScaler(min=0.0, max=1.0, inputCol='features', outputCol='scaled-features')
log_reg_model = LogisticRegression(featuresCol='scaled-features', labelCol='label')

In [7]:
pipeline = Pipeline(stages=[vec_assembler, scaler, log_reg_model])

In [8]:
# paramGrid = ParamGridBuilder() \
#     .addGrid(log_reg_model.regParam, [0.1,0.005,0.5]) \
#     .addGrid(log_reg_model.maxIter, [100,1000,10000]) \
#     .addGrid(log_reg_model.elasticNetParam, [0, 1]) \
#     .addGrid(log_reg_model.fitIntercept, [True, False]) \
#     .addGrid(log_reg_model.aggregationDepth, [2,3,4]) \
#     .build()
    
paramGrid = ParamGridBuilder() \
    .build()

In [9]:
%%time
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(),
                          numFolds=3)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(train_df)

CPU times: user 130 ms, sys: 20 ms, total: 150 ms
Wall time: 48.7 s


In [10]:
prediction = cvModel.transform(test_df)

In [11]:
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction')

In [12]:
#0.313349279044155
AUC = evaluator.evaluate(prediction)
AUC

0.3874069170432073

In [13]:
predictionAndLabels = prediction.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

Test set accuracy = 0.486031885329823


In [14]:
real_pred = cvModel.transform(df_predict)

In [15]:
# df_predict.show()

```
                         LABEL
        delta <= -10%  =   
-10%  < delta <=  -5%  =   
 -5%  < delta <=   0%  =   
  0%  < delta <=   5%  =   
  5%  < delta <=  10%  =   
 10% <= delta          =   
```

In [16]:
real_pred.select('symbol', 'time', 'price-0h', 'prediction').show(truncate=False)

+------+-------------------+-------------------+----------+
|symbol|time               |price-0h           |prediction|
+------+-------------------+-------------------+----------+
|EOS   |2018-06-17 14:00:00|10.4425            |3.0       |
|LTC   |2018-06-17 14:00:00|95.5225            |3.0       |
|ETH   |2018-06-17 14:00:00|498.3575           |3.0       |
|BCH   |2018-06-17 14:00:00|849.07             |3.0       |
|VEN   |2018-06-17 14:00:00|3.16               |3.0       |
|XLM   |2018-06-17 14:00:00|0.22845            |3.0       |
|CVC   |2018-06-17 14:00:00|0.20279999999999998|3.0       |
|XRP   |2018-06-17 14:00:00|0.5246             |3.0       |
+------+-------------------+-------------------+----------+



In [17]:
spark.catalog.clearCache()