### Install packages

In [1]:
!pip install python-binance

You should consider upgrading via the 'python -m pip install --upgrade pip' command.


### Imports

In [2]:
import pandas as pd
import numpy as np
import os.path
import math
import argparse
import json
from binance.client import Client
from datetime import datetime
from dateutil import parser as par
from sklearn import preprocessing

In [5]:
### Parameters

In [6]:
# Model variables
# Number of consecutive rows that make up one input window; csv_to_dataset
# uses each window to predict the 'open' value of the row that follows it.
history_points = 14

## Util

### CSV To Dataset

In [0]:
def csv_to_dataset(csv_path, test_split=0.9, window=None):
    """Load a kline CSV and build windowed train/test sets for next-open prediction.

    The CSV is split chronologically (no shuffling), scaled with a MinMaxScaler
    fitted on the training part only, and cut into sliding windows of
    ``window`` consecutive rows. The target for each window is the 'open'
    value of the row immediately after it.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file. It must contain the columns 'timestamp', 'open',
        'close', 'close_time', 'quote_av', 'trades', 'tb_base_av',
        'tb_quote_av' and 'ignore'. Every column that is not dropped below is
        used as a model feature.
    test_split : float, optional
        Fraction of the rows (taken from the start) used for training.
    window : int, optional
        Rows per input window. Defaults to the notebook-level
        ``history_points`` setting, read at call time.

    Returns
    -------
    tuple
        ``(ohlcv_train, ohlcv_test, ndov_test, y_train, y_test, y_normaliser,
        tis_normalised_train, tis_normalised_test)`` where

        * ``ohlcv_*`` are the scaled feature windows, shape
          (samples, window, features);
        * ``ndov_test`` are the unscaled next-open targets of the test set,
          shape (samples, 1);
        * ``y_train`` / ``y_test`` are the next-open targets scaled by
          ``y_normaliser`` (fitted on the training targets only);
        * ``y_normaliser`` can be used to map predictions back to prices;
        * ``tis_normalised_*`` hold one scaled technical indicator per window:
          the simple moving average of the scaled closing price.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from the CSV.
    ValueError
        If either split has too few rows to form at least one window.
    """
    if window is None:
        # Resolved here rather than in the signature so that later edits to
        # the notebook setting are honoured without redefining the function.
        window = history_points

    data = pd.read_csv(csv_path)

    # Remove the kline columns that are not used as model features.
    data = data.drop(columns=['timestamp', 'close_time', 'quote_av', 'trades',
                              'tb_base_av', 'tb_quote_av', 'ignore'])

    # Locate the price columns by name: CSVs that carry extra leading columns
    # (for example an integer 'counter') shift the positions, so hard-coded
    # indices would silently pick the wrong series.
    close_col = data.columns.get_loc('close')

    # Split chronologically before normalisation so that no information from
    # the test period leaks into the scaling parameters.
    n = int(len(data) * test_split)
    d_train = data[:n]
    d_test = data[n:]

    if len(d_train) <= window or len(d_test) <= window:
        raise ValueError(
            'Not enough rows to build windows of %d points: train has %d rows, test has %d rows.'
            % (window, len(d_train), len(d_test)))

    # Scale the features. The test data is transformed with the parameters
    # learned from the training data.
    data_normaliser = preprocessing.MinMaxScaler()
    train_normalised = data_normaliser.fit_transform(d_train)
    test_normalised = data_normaliser.transform(d_test)

    def _windows(rows):
        # One (window, features) slice per position that still has a following row.
        return np.array([rows[i:i + window].copy() for i in range(len(rows) - window)])

    def _close_sma(windows):
        # Simple moving average of the (scaled) closing price of each window.
        return np.array([[np.mean(w[:, close_col])] for w in windows])

    ohlcv_train = _windows(train_normalised)
    ohlcv_test = _windows(test_normalised)

    # ndov = next day open values (unscaled). Positional slicing is used so
    # the result does not depend on the DataFrame's index labels.
    ndov_train = d_train['open'].to_numpy()[window:].reshape(-1, 1).copy()
    ndov_test = d_test['open'].to_numpy()[window:].reshape(-1, 1).copy()

    # Dedicated scaler for the targets so predictions can be scaled back up later.
    y_normaliser = preprocessing.MinMaxScaler()
    y_train = y_normaliser.fit_transform(ndov_train)
    y_test = y_normaliser.transform(ndov_test)

    # tis - technical indicators
    tis_train = _close_sma(ohlcv_train)
    tis_test = _close_sma(ohlcv_test)

    indicator_normaliser = preprocessing.MinMaxScaler()
    tis_normalised_train = indicator_normaliser.fit_transform(tis_train)
    tis_normalised_test = indicator_normaliser.transform(tis_test)

    return ohlcv_train, \
        ohlcv_test, \
        ndov_test, \
        y_train, \
        y_test, \
        y_normaliser, \
        tis_normalised_train, \
        tis_normalised_test

### Save Data To CSV

In [0]:
def get_all_binance(symbol, kline_size, data_type, save = True):
    """Download Binance klines for ``symbol`` and keep a local CSV up to date.

    If a CSV for this symbol/interval already exists, only candles from its
    last timestamp onwards are requested and merged into it; otherwise the
    history is downloaded starting at 1 Jan 2017.

    Parameters
    ----------
    symbol : str
        Trading pair on Binance, e.g. "ETHUSDT".
    kline_size : str
        Candle interval; one of "1m", "5m", "1h", "1d".
    data_type : str
        "reg" stores the klines indexed by timestamp in
        '<symbol>-<kline_size>-data.csv'. "td" stores them indexed by a running
        'counter' with an extra time-differenced 'td' column in
        '<symbol>-<kline_size>-data-td.csv'.
    save : bool, optional
        Write the result to the CSV file when True.

    Notes
    -----
    API credentials are read from the environment variables BINANCE_API_KEY
    and BINANCE_API_SECRET and must never be written into the notebook. When
    they are not set the client is created without credentials.
    NOTE(review): kline endpoints are public market data, so this is expected
    to be sufficient here - confirm against the Binance API documentation.
    """
    #https://medium.com/swlh/retrieving-full-historical-data-for-every-cryptocurrency-on-binance-bitmex-using-the-python-apis-27b47fd8137f

    # Start of the download window when no local data exists yet.
    history_start = datetime.strptime('1 Jan 2017', '%d %b %Y')

    def time_differencing(td_data_df):
        """Add column 'td': next row's open minus this row's open (0 for the last row)."""
        opens = td_data_df["open"].astype(float)
        td_data_df["td"] = opens.shift(-1) - opens
        if len(td_data_df) > 0:
            # The last row has no successor to difference against.
            td_data_df.iloc[-1, td_data_df.columns.get_loc("td")] = 0
        return td_data_df

    #Only download new data instead of downloading the entire set every runtime
    def minutes_of_new_data(symbol, kline_size, data, client, source):
        """Return (oldest, newest): where to resume from and the latest candle's open time."""
        if source != "binance":
            raise ValueError('Unsupported data source: %s' % source)
        if len(data) > 0:
            old = par.parse(data["timestamp"].iloc[-1])
        else:
            old = history_start
        new = pd.to_datetime(client.get_klines(symbol=symbol, interval=kline_size)[-1][0], unit='ms')
        return old, new

    ### API
    # Credentials come from the environment so that they are never committed.
    binance_api_key = os.environ.get("BINANCE_API_KEY")
    binance_api_secret = os.environ.get("BINANCE_API_SECRET")
    binance_client = Client(api_key=binance_api_key, api_secret=binance_api_secret)

    #Variables
    binsizes = {"1m": 1, "5m": 5, "1h": 60, "1d": 1440}   # minutes per candle
    if data_type!="td":
        filename = '%s-%s-data.csv' % (symbol, kline_size)
    else:
        filename = '%s-%s-data-td.csv' % (symbol, kline_size)

    #If file exists, read it, otherwise create new object
    if os.path.isfile(filename):
        data_df = pd.read_csv(filename)
    else:
        data_df = pd.DataFrame()

    #Perform checks for only downloading new data
    oldest_point, newest_point = minutes_of_new_data(symbol, kline_size, data_df, binance_client, source = "binance")
    if data_type in ("reg", "td"):
        delta_min = (newest_point - oldest_point).total_seconds()/60
        # binsizes is expressed in minutes, so the candle count is always
        # derived from the elapsed minutes, whatever unit the message reports.
        available_data = math.ceil(delta_min/binsizes[kline_size])
        if oldest_point == history_start:
            print('Downloading all available %s data for %s. Be patient..!' % (kline_size, symbol))
        elif data_type == "reg":
            print('Downloading %d minutes of new data available for %s, i.e. %d instances of %s data.' % (delta_min, symbol, available_data, kline_size))
        else:
            delta_day = delta_min/1440
            print('Downloading %d days of new data available for %s, i.e. %d instances of %s data.' % (delta_day, symbol, available_data, kline_size))

    klines = binance_client.get_historical_klines(symbol, kline_size, oldest_point.strftime("%d %b %Y %H:%M:%S"), newest_point.strftime("%d %b %Y %H:%M:%S"))
    data = pd.DataFrame(klines, columns = ['timestamp', 'open', 'high', 'low', 'close', 'volume', 'close_time', 'quote_av', 'trades', 'tb_base_av', 'tb_quote_av', 'ignore' ])
    data['timestamp'] = pd.to_datetime(data['timestamp'], unit='ms')

    if len(data_df) > 0:
        if oldest_point != newest_point:
            # Stored timestamps come back from the CSV as strings; convert them
            # so old and new rows are comparable.
            data_df["timestamp"] = pd.to_datetime(data_df["timestamp"])
            data_df = pd.concat([data_df, data], sort=False)
            # The download restarts at the last stored candle, so that candle
            # arrives twice; keep the freshly downloaded version.
            data_df = data_df.drop_duplicates(subset="timestamp", keep="last")
    else:
        data_df = data

    if data_type == "td":
        #Create an index allowing for referencing rows by position.
        # Any 'counter' column read back from an existing file is discarded and
        # rebuilt so it stays contiguous after merging. Float values keep the
        # on-disk format of previously written files.
        data_df = data_df.drop(columns=["counter"], errors="ignore")
        data_df.index = pd.Index(np.arange(len(data_df), dtype=float), name="counter")
    else:
        data_df = data_df.set_index('timestamp')

    if save:
        if data_type == "td": #Saved format will be time differenced
            data_df = time_differencing(data_df)
        data_df.to_csv(filename)

    print('All caught up..!')

# Model

# Main

### Download latest data to csv

In [0]:
# Run configuration for the download step
symbol = "ETHUSDT"      # crypto pair on Binance
kline_size = "1d"       # candle interval: 1h or 1d
data_type = "td"        # reg or td
save = True             # save data to csv file

get_all_binance(symbol=symbol, kline_size=kline_size, data_type=data_type, save=save)

Downloading all available 1d data for ETHUSDT. Be patient..!
All caught up..!


### Train Model