In [1]:
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime as dt
from pyspark.sql import SparkSession
from sklearn import datasets
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
spark = SparkSession.builder.appName("Crypto Data").getOrCreate()
df_categorized = spark.read.csv("/home/jovyan/data/data_train_test.csv", inferSchema=True, encoding='utf-8', header=True).cache()
df_predict = spark.read.csv("/home/jovyan/data/data_predict.csv", inferSchema=True, encoding='utf-8', header=True).cache()

In [3]:
df_categorized = df_categorized.filter(df_categorized['time'] >= dt(2017, 1, 1)).cache()

In [4]:
df_categorized.filter(df_categorized['time'] >= dt(2017,1,1)).groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1| 5233|
|    3|26065|
|    4|11444|
|    2|39205|
+-----+-----+



In [5]:
print(df_categorized.filter(df_categorized['time'] >= dt(2017,1,1)).count())
print(df_categorized.filter(df_categorized['time'] >= dt(2018,1,1)).count())

81947
34808


In [6]:
labels = df_categorized.select('label').toPandas()
features = df_categorized.drop('_c0', 'time', 'symbol', 'label', 'price+6h-high', 'close', 'btc-close', 'volumefrom', 'volumeto', 'btc-volumefrom', 'btc-volumeto').toPandas()

In [7]:
features.columns

Index(['price-0h', 'price-1h', 'price-2h', 'price-4h', 'price-5h', 'price-6h',
       'price-8h', 'price-10h', 'price-12h', 'price-24h', 'price-48h',
       'price-96h', 'price-192h', 'price-384h', 'price-768h', 'fromvol-0h',
       'fromvol-1h', 'fromvol-2h', 'fromvol-4h', 'fromvol-6h', 'fromvol-8h',
       'fromvol-10h', 'fromvol-16h', 'fromvol-24h', 'fromvol-48h',
       'fromvol-96h', 'fromvol-192h', 'fromvol-384h', 'fromvol-768h',
       'btc-volumefrom', 'btc-volumeto', 'btc-price-0h', 'btc-price-1h',
       'btc-price-2h', 'btc-price-4h', 'btc-price-5h', 'btc-price-6h',
       'btc-price-8h', 'btc-price-10h', 'btc-price-12h', 'btc-price-24h',
       'btc-price-48h', 'btc-price-96h', 'btc-price-192h', 'btc-price-384h',
       'btc-price-768h', 'btc-fromvol-0h', 'btc-fromvol-1h', 'btc-fromvol-2h',
       'btc-fromvol-4h', 'btc-fromvol-6h', 'btc-fromvol-8h', 'btc-fromvol-10h',
       'btc-fromvol-16h', 'btc-fromvol-24h', 'btc-fromvol-48h',
       'btc-fromvol-96h', 'btc-fromvol-192

In [8]:
len(features.columns)

60

In [9]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, 
                                                    test_size=0.2, 
                                                    random_state=4567, 
                                                    stratify=labels)

In [10]:
scaler = MinMaxScaler()
standscaler = StandardScaler()
mlpmodel = MLPClassifier(hidden_layer_sizes=(400, 300, 200, 100, 10), solver='adam', activation='relu', learning_rate='adaptive', verbose=True, tol=0.0001)

In [11]:
pipe = Pipeline([
    ('scaler', scaler),
    ('stdscale', standscaler),
    ('model', mlpmodel)
])

In [12]:
param_grid = {
    'model__solver': ['adam'],
    'model__activation': ['relu', 'logistic'],
    'model__hidden_layer_sizes': [(400, 300, 200, 100, 10)],
    'model__verbose': [True]
}
# (100, 40, 40, 40) (400, 300, 200, 100, 10)

In [13]:
grid = GridSearchCV(pipe,cv=2,param_grid=param_grid, verbose=1)

In [14]:
%%time
predictor = grid.fit(X_train, y_train.values.ravel())

Fitting 2 folds for each of 2 candidates, totalling 4 fits
Iteration 1, loss = 1.14645616
Iteration 2, loss = 1.11769938
Iteration 3, loss = 1.10721532
Iteration 4, loss = 1.09875653
Iteration 5, loss = 1.08839185
Iteration 6, loss = 1.07803149
Iteration 7, loss = 1.06570646
Iteration 8, loss = 1.05354429
Iteration 9, loss = 1.03983006
Iteration 10, loss = 1.02284519
Iteration 11, loss = 1.00115783
Iteration 12, loss = 0.98318235
Iteration 13, loss = 0.95929713
Iteration 14, loss = 0.93670174
Iteration 15, loss = 0.91141967
Iteration 16, loss = 0.89319255
Iteration 17, loss = 0.85845948
Iteration 18, loss = 0.83769615
Iteration 19, loss = 0.80903893
Iteration 20, loss = 0.78285564
Iteration 21, loss = 0.75643991
Iteration 22, loss = 0.72886208
Iteration 23, loss = 0.70906640
Iteration 24, loss = 0.68621992
Iteration 25, loss = 0.66455169
Iteration 26, loss = 0.64382755
Iteration 27, loss = 0.62856896
Iteration 28, loss = 0.60012344
Iteration 29, loss = 0.58696752
Iteration 30, loss = 0



Iteration 1, loss = 1.23902877
Iteration 2, loss = 1.16833600
Iteration 3, loss = 1.16788182
Iteration 4, loss = 1.16793294
Iteration 5, loss = 1.16787499
Iteration 6, loss = 1.16785774
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 16.1min finished


Iteration 1, loss = 1.13826965
Iteration 2, loss = 1.11011974
Iteration 3, loss = 1.09655785
Iteration 4, loss = 1.08502269
Iteration 5, loss = 1.07046899
Iteration 6, loss = 1.05407449
Iteration 7, loss = 1.03750628
Iteration 8, loss = 1.01660354
Iteration 9, loss = 0.99599278
Iteration 10, loss = 0.97215811
Iteration 11, loss = 0.94799745
Iteration 12, loss = 0.92339731
Iteration 13, loss = 0.89921703
Iteration 14, loss = 0.87674619
Iteration 15, loss = 0.85381935
Iteration 16, loss = 0.83195934
Iteration 17, loss = 0.80933013
Iteration 18, loss = 0.78923374
Iteration 19, loss = 0.77038056
Iteration 20, loss = 0.75183877
Iteration 21, loss = 0.73081348
Iteration 22, loss = 0.71314527
Iteration 23, loss = 0.70020127
Iteration 24, loss = 0.68424382
Iteration 25, loss = 0.66596422
Iteration 26, loss = 0.65393547
Iteration 27, loss = 0.63940407
Iteration 28, loss = 0.62521786
Iteration 29, loss = 0.61264481
Iteration 30, loss = 0.59885147
Iteration 31, loss = 0.59079939
Iteration 32, los

In [15]:
# title = "Learning Curves MLP"
# plot_learning_curve(grid, title, X_train, y_train.values.ravel(), ylim=(0.0, 1.0), cv=2, n_jobs=1)
# plt.show()

In [16]:
predictor.score(X_test, y_test)

0.53197071384990846

In [17]:
predictor.predict(X_test)

array([2, 2, 3, ..., 2, 3, 4])

In [18]:
predictor.best_params_

{'model__activation': 'relu',
 'model__hidden_layer_sizes': (400, 300, 200, 100, 10),
 'model__solver': 'adam',
 'model__verbose': True}

In [19]:
symbols = df_predict.select('symbol').distinct().collect()

In [20]:
pred_labels = {
    '1': 'decrease more than -4%',
    '2': 'decrease between -4% and 0%',
    '3': 'grow between 0% and 4%',
    '4': 'grow more than 4%'
}

In [21]:
print("At {}, these are the predictions:".format(df_predict.select(df_predict['time']).collect()[0][0]))
print("")
from IPython.core.display import display, HTML
for row in symbols:
    price = df_predict.filter(df_predict['symbol'] == row[0]).select('price-0h').collect()[0][0]
    df_row = df_predict.filter(df_predict['symbol'] == row[0]).drop('_c0', 'time', 'symbol', 'label', 'price+6h-high', '_c0', 'time', 'close', 'btc-close', 'volumefrom', 'volumeto', 'btc-volumefrom', 'btc-volumeto' ).toPandas()
    pred = predictor.predict(df_row)
    print("{} price was at {}$. \t In 6h might {}.".format(row[0], round(price,2), pred_labels[str(pred[0])]))
    

At 2018-07-01 12:00:00, these are the predictions:

EOS price was at 8.08$. 	 In 6h might grow between 0% and 4%.
LTC price was at 80.13$. 	 In 6h might decrease between -4% and 0%.
ETH price was at 454.29$. 	 In 6h might grow between 0% and 4%.
BCH price was at 741.46$. 	 In 6h might grow between 0% and 4%.
VEN price was at 2.63$. 	 In 6h might grow between 0% and 4%.
XLM price was at 0.2$. 	 In 6h might grow between 0% and 4%.
CVC price was at 0.18$. 	 In 6h might decrease between -4% and 0%.
XRP price was at 0.46$. 	 In 6h might decrease between -4% and 0%.
