In [8]:
from main import download_data
# Ticker symbols handled by this notebook (order is preserved).
CRYPTOS = 'BTC ETH EOS LTC XRP BCH ADA XLM CVC'.split()

In [112]:
# Fetch the data for every symbol in CRYPTOS, quoted in USD.
# NOTE(review): download_data is defined in main.py (not shown); judging by the
# captured log it writes /home/jovyan/data/data_raw.csv -- confirm.
download_data(CRYPTOS, currency='USD')

Querying for BTC..................................
Querying for ETH............
Querying for EOS....
Querying for LTC....................
Querying for XRP..............
Querying for BCH...
Querying for ADA.
Querying for XLM.......
Querying for CVC...
Saving file to /home/jovyan/data/data_raw.csv


In [1]:
from datetime import datetime, timedelta
from typing import List, Optional, Tuple

from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [2]:
# Start (or reuse) the Spark session, then load the raw CSV with a header row
# and let Spark infer the column types.
spark = (
    SparkSession.builder
    .appName("Crypto Data")
    .getOrCreate()
)
df = spark.read.csv(
    "/home/jovyan/data/data_raw.csv",
    header=True,
    inferSchema=True,
    encoding='utf-8',
)
df.printSchema()

root
 |-- close: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- open: double (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- volumefrom: double (nullable = true)
 |-- volumeto: double (nullable = true)
 |-- symbol: string (nullable = true)



In [3]:
# Peek at the first three rows without truncating cell contents.
df.show(n=3, truncate=False)

+-----+-----+-----+-----+-------------------+----------+--------+------+
|close|high |low  |open |time               |volumefrom|volumeto|symbol|
+-----+-----+-----+-----+-------------------+----------+--------+------+
|0.061|0.061|0.061|0.061|2010-09-07 06:00:00|0.0       |0.0     |BTC   |
|0.061|0.061|0.061|0.061|2010-09-07 07:00:00|0.0       |0.0     |BTC   |
|0.061|0.061|0.061|0.061|2010-09-07 08:00:00|0.0       |0.0     |BTC   |
+-----+-----+-----+-----+-------------------+----------+--------+------+
only showing top 3 rows



In [4]:
# Sanity check: every coin should end on the same (latest) timestamp.
latest_per_symbol = df.groupBy('symbol').agg(F.max('time'))
latest_per_symbol.show()

+------+-------------------+
|symbol|          max(time)|
+------+-------------------+
|   EOS|2018-06-11 23:00:00|
|   LTC|2018-06-11 23:00:00|
|   ETH|2018-06-11 23:00:00|
|   BCH|2018-06-11 23:00:00|
|   BTC|2018-06-11 23:00:00|
|   XLM|2018-06-11 23:00:00|
|   CVC|2018-06-11 23:00:00|
|   XRP|2018-06-11 23:00:00|
|   ADA|2018-06-11 23:00:00|
+------+-------------------+



In [5]:
# trade_volume is the absolute difference between the two volume columns.
volume_gap = F.abs(F.col('volumefrom') - F.col('volumeto'))
df = df.withColumn("trade_volume", volume_gap)

# Discard the columns that are not used further down.
df = df.drop('volumefrom', 'volumeto', 'close', 'low', 'open')

# Re-check that the latest timestamp per coin is unchanged.
df.groupBy('symbol').agg(F.max('time')).show()

+------+-------------------+
|symbol|          max(time)|
+------+-------------------+
|   EOS|2018-06-11 23:00:00|
|   LTC|2018-06-11 23:00:00|
|   ETH|2018-06-11 23:00:00|
|   BCH|2018-06-11 23:00:00|
|   BTC|2018-06-11 23:00:00|
|   XLM|2018-06-11 23:00:00|
|   CVC|2018-06-11 23:00:00|
|   XRP|2018-06-11 23:00:00|
|   ADA|2018-06-11 23:00:00|
+------+-------------------+



In [6]:
# New columns to derive for the feature DataFrame.
# Each entry maps a new column name to a row offset relative to the current
# row: negative values point to earlier rows, the single positive value (+24)
# to a later row.
# NOTE(review): offsets are treated as hours, which assumes one row per hour
# per symbol -- confirm.
PRICE_DELTAS = {
    'price+24h-avg': 24,
    'price-0h-avg': 0, 'price-1h-avg': -1, 'price-2h-avg': -2,
    'price-4h-avg': -4, 'price-5h-avg': -5, 'price-6h-avg': -6,
    'price-8h-avg': -8, 'price-10h-avg': -10, 'price-12h-avg': -12,
    'price-24h-avg': -24, 'price-48h-avg': -48, 'price-96h-avg': -96,
    'price-192h-avg': -192,
}
VOLUME_DELTAS = {
    'vol-0h-avg': 0, 'vol-1h-avg': -1, 'vol-2h-avg': -2,
    'vol-4h-avg': -4, 'vol-6h-avg': -6, 'vol-8h-avg': -8,
    'vol-10h-avg': -10, 'vol-16h-avg': -16, 'vol-24h-avg': -24,
    'vol-48h-avg': -48, 'vol-96h-avg': -96, 'vol-192h-avg': -192,
}
DELTA_MATCH = {'price': PRICE_DELTAS, 'volume': VOLUME_DELTAS}

In [9]:
# Create one column per entry of DELTA_MATCH, filled with the value of the
# source column at the configured row offset within the same symbol.
# 'price' entries read from 'high', 'volume' entries from 'trade_volume'.
# The window already partitions by symbol, so a single pass covers every coin;
# looping over CRYPTOS would only re-add identical columns and bloat the plan.
# NOTE(review): F.lag counts rows, not hours -- assumes one row per hour per
# symbol with no gaps; confirm.
symb_window = Window.partitionBy("symbol").orderBy("time")
for source_col, deltas in (('high', DELTA_MATCH['price']),
                           ('trade_volume', DELTA_MATCH['volume'])):
    for k, v in deltas.items():
        # DELTA_MATCH stores look-back as negative numbers; lag() wants the
        # opposite sign (a negative lag therefore looks ahead).
        df = df.withColumn(k, F.lag(df[source_col], -v).over(symb_window))

In [10]:
# Remove the rows whose look-back columns are NULL because there is not enough
# history before them (an expected result of the lag operation).
# Filtering on the deepest look-back column of each group is sufficient; it is
# selected by its offset value rather than by its position in the dict, so the
# result does not depend on the order in which DELTA_MATCH entries are written.
for deltas in DELTA_MATCH.values():
    deepest_lag_col = min(deltas, key=deltas.get)
    df = df.filter(df[deepest_lag_col].isNotNull())
# Cache only the final filtered frame; the intermediate one is never reused.
df = df.cache()

In [11]:
# Latest timestamp present in the data (the hour of the extract); used below
# to select the rows to predict on.
# Computed as an aggregate so that only one value is brought to the driver and
# the result does not depend on the row order of the DataFrame.
latest_received_date = df.agg(F.max('time')).first()[0]

In [12]:
# Keep only the BTC rows; they serve as the market benchmark because no
# aggregate total-market series is available.
is_btc = F.col('symbol') == "BTC"
df_btc = df.where(is_btc).cache()

In [13]:
# Drop the columns that are not wanted on the BTC side of the upcoming join.
btc_unused_cols = ['symbol', 'price+24h-avg', 'high', 'trade_volume']
df_btc = df_btc.drop(*btc_unused_cols).cache()

In [14]:
# Prefix every column except the join key 'time' with "btc-" so the names do
# not clash with the per-coin columns after the join.
# All columns are renamed in one step and the result is cached once; caching
# inside a per-column loop would mark every intermediate frame for caching.
# Columns that already carry the prefix are left alone so that re-running this
# cell does not produce "btc-btc-..." names.
btc_column_names = [
    name if name == "time" or name.startswith("btc-") else "btc-{}".format(name)
    for name in df_btc.columns
]
df_btc = df_btc.toDF(*btc_column_names).cache()

In [15]:
# Everything that is not BTC: the left-hand side of the join.
not_btc = F.col('symbol') != "BTC"
df_exbtc = df.where(not_btc).cache()

In [16]:
# Attach the BTC benchmark columns to every coin row sharing the same timestamp.
df_joined = df_exbtc.join(df_btc, on='time', how='inner').cache()

In [17]:
# Release the cached join inputs, then confirm neither is cached any more.
join_inputs = (df_exbtc, df_btc)
for frame in join_inputs:
    frame.unpersist()
for frame in join_inputs:
    print(frame.is_cached)

False
False


In [18]:
# Verify the join: at the latest timestamp every coin must show the same BTC price.
(
    df_joined
    .select('high', 'symbol', 'btc-price-0h-avg', 'time')
    .filter(F.col('time') == latest_received_date)
    .show()
)

+------+------+----------------+-------------------+
|  high|symbol|btc-price-0h-avg|               time|
+------+------+----------------+-------------------+
| 0.177|   ADA|         6813.43|2018-06-11 23:00:00|
|0.5932|   XRP|         6813.43|2018-06-11 23:00:00|
|0.2417|   CVC|         6813.43|2018-06-11 23:00:00|
|0.2475|   XLM|         6813.43|2018-06-11 23:00:00|
|953.13|   BCH|         6813.43|2018-06-11 23:00:00|
|526.99|   ETH|         6813.43|2018-06-11 23:00:00|
|106.16|   LTC|         6813.43|2018-06-11 23:00:00|
| 11.33|   EOS|         6813.43|2018-06-11 23:00:00|
+------+------+----------------+-------------------+



In [19]:
# Drop the raw columns that duplicate 'price-0h-avg' / 'vol-0h-avg'.
# The BTC side needs no clean-up here: its 'high' and 'price+24h-avg' columns
# were dropped before the "btc-" rename, so no such prefixed columns exist.
df_joined = df_joined.drop('high', 'trade_volume')

In [20]:
# Rows at the latest timestamp: the inputs to predict on.
df_predict = df_joined.where(F.col('time') == latest_received_date)

In [21]:
# Relative change between the value 24 rows ahead and the current value,
# expressed as a fraction (0.05 means +5%).
# The columns are taken from df_joined itself rather than from the ancestor
# frame `df`, so the expression cannot silently bind to another DataFrame.
df_joined = df_joined.withColumn(
    '24h-delta',
    (df_joined['price+24h-avg'] / df_joined['price-0h-avg']) - 1,
).cache()

In [22]:
# create a new col with the round delta and eliminate the previous 24h-delta
# df_joined = df_joined.withColumn('delta-round', F.round('24h-delta', 2)).drop('24h-delta').cache()

In [23]:
def categorize_delta(delta, threshold: float = -0.1) -> Optional[int]:
    """
    UDF helper that turns the growth of a crypto currency into a binary label.

    param: delta - the growth being evaluated, as a fraction (e.g. -0.1 for a
                   10% drop); None when it is unknown
    param: threshold - value separating 0 from 1 (default: -0.1)
    return: None when delta is None, 0 when delta < threshold, 1 otherwise
    """
    if delta is None:
        return None
    # An earlier experiment used 0.04 as the cut-off; pass threshold=0.04 to
    # reproduce it.
    return 0 if delta < threshold else 1
    
# Wrap categorize_delta as a Spark UDF that returns an integer column.
categorize_delta_udf = F.udf(f=categorize_delta, returnType=IntegerType())

In [24]:
# Replace the numeric '24h-delta' with its binary 'label' and cache the result
# (a single cache() is enough; chaining it twice adds nothing).
df_categorized = (
    df_joined
    .withColumn('label', categorize_delta_udf(df_joined['24h-delta']))
    .drop('24h-delta')
    .cache()
)
# The joined frame is no longer needed in the cache; show that it is released.
df_joined.unpersist()
df_joined.is_cached

False

In [25]:
# Count the rows that have no label.
unlabeled_rows = (
    df_categorized
    .select('symbol', 'time', 'label')
    .filter(F.col('label').isNull())
)
unlabeled_rows.count()

192

In [26]:
# Remove the rows whose label is NULL (the count above reported 192 of them).
has_label = F.col('label').isNotNull()
df_categorized = df_categorized.where(has_label).cache()

In [27]:
# Bring the labelled rows to the driver as a pandas DataFrame and write them to CSV.
train_test_pdf = df_categorized.toPandas()
train_test_pdf.to_csv('/home/jovyan/data/data_train_test.csv')

In [28]:
# Bring the prediction rows to the driver as a pandas DataFrame and write them to CSV.
predict_pdf = df_predict.toPandas()
predict_pdf.to_csv('/home/jovyan/data/data_predict.csv')

In [29]:
# Both CSV exports are written; drop everything still cached in this Spark session.
spark.catalog.clearCache()