# Introduction

This project is the capstone project of the [Udacity Data Scientist Nanodegree](https://www.udacity.com/course/data-scientist-nanodegree--nd025). It attempts to use ML techniques such as LSTMs to predict the prices of cryptocurrencies. The inputs to the algorithm will be trading data over a given date range and the prediction will be the adjusted close price.

The accompanying web app will allow the selection of a given cryptocurrency from a dropdown, a given input date range and a given algorithm.


# Imports and helper functions

In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
import time
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px
import plotly.figure_factory as ff
import scipy.stats as stats
import plotly.offline as offline_py

from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

from datetime import datetime

In [None]:
# Check TF 2.0
import tensorflow as tf
print(tf.__version__)

In [None]:
from keras.models import Sequential
from keras.layers import Activation, Dense
from keras.layers import LSTM
from keras.layers import Dropout, Conv1D

In [None]:
# Named colour palette: each key is a role (a trace, a table part, an axis
# label, ...) and each value is the colour to use for it.
# NOTE(review): nothing in the visible part of the notebook reads this dict;
# presumably it is meant for the Plotly helpers below — confirm before pruning.
color_scheme = {
    'index': '#B6B2CF',
    'etf': '#2D3ECF',
    'tracking_error': '#6F91DE',
    'df_header': 'silver',
    'df_value': 'white',
    'df_line': 'silver',
    # (position, colour) stops for a colour scale running from 0 to 1
    'heatmap_colorscale': [(0, '#6F91DE'), (0.5, 'grey'), (1, 'red')],
    'background_label': '#9dbdd5',
    'low_value': '#B6B2CF',
    'high_value': '#2D3ECF',
    'y_axis_2_text_color': 'grey',
    'shadow': 'rgba(0, 0, 0, 0.75)',
    'major_line': '#2D3ECF',
    'minor_line': '#B6B2CF',
    'main_line': 'black'}

def generate_config():
    """Return a fresh dict of Plotly display options.

    The link and the mode bar are switched off and the axis range entry
    boxes are switched on. A new dict is built on every call, so callers
    may mutate the result freely.
    """
    config = dict(
        showLink=False,
        displayModeBar=False,
        showAxisRangeEntryBoxes=True,
    )
    return config

def _generate_traces(name_df_color_data):
    """Build one Plotly line trace per (name, series, colour) triple.

    Params:
        name_df_color_data: iterable of ``(name, df, color)`` triples. ``df``
            supplies the y values and its ``.index`` supplies the x values.

    Returns:
        A list of ``go.Scatter`` traces in 'lines' mode, in input order.
    """
    return [
        go.Scatter(
            name=label,
            x=series.index,
            y=series,
            mode='lines',
            line={'color': shade})
        for label, series, shade in name_df_color_data
    ]

def resample(open_prices, high_prices, low_prices, close_prices, period='H'):
    """Convert OHLC prices to OHLC prices at the frequency given by ``period``.

    Each input is resampled independently with the aggregation that keeps
    OHLC semantics: first open, highest high, lowest low, last close.

    Parameters
    ----------
    open_prices : DataFrame
        Open prices for each ticker, indexed by datetime
    high_prices : DataFrame
        High prices for each ticker, indexed by datetime
    low_prices : DataFrame
        Low prices for each ticker, indexed by datetime
    close_prices : DataFrame
        Close prices for each ticker, indexed by datetime
    period : str
        The pandas resample rule, e.g. 'W', 'H', 'D'. Defaults to 'H' (hourly).

    Returns
    -------
    open_prices_resampled : DataFrame
        Resampled open prices for each ticker and date
    high_prices_resampled : DataFrame
        Resampled high prices for each ticker and date
    low_prices_resampled : DataFrame
        Resampled low prices for each ticker and date
    close_prices_resampled : DataFrame
        Resampled close prices for each ticker and date
    """
    # Use the caller's period; it used to be ignored in favour of a fixed 'H'.
    open_prices_resampled = open_prices.resample(period).first()
    high_prices_resampled = high_prices.resample(period).max()
    low_prices_resampled = low_prices.resample(period).min()
    close_prices_resampled = close_prices.resample(period).last()

    return (open_prices_resampled, high_prices_resampled,
            low_prices_resampled, close_prices_resampled)

# Bitcoin Price Data

To begin, let's take a look at historical Bitcoin (BTC) price data.

We use yfinance to source the data, which includes the open, high, low, close (OHLC) data in addition to the adjusted close price and the volume traded that day.

The open price is the price of the stock at the start of that trading day, the high is the highest price the stock attains over the course of the day, the low is likewise the lowest price over the day and the close is the price of the stock at the end of trading that day.

The adjusted close is to take into account things like dividends being paid out.

The volume is the number of shares traded over the day. Generally large volumes of buy orders would increase a stock's price and large volumes of sell orders would decrease its price.

In [None]:
df = pd.read_csv('Data/bitstampUSD_1-min_data_2012-01-01_to_2020-12-31.csv')

In [None]:
# Get a datetime from the timestamp.
# NOTE(review): datetime.fromtimestamp converts to the machine's local time zone
# and returns a naive datetime, so the resulting index depends on where the
# notebook runs — confirm this is intended.
# Assumes 'Timestamp' holds POSIX epoch seconds — TODO confirm against the CSV.
df['Datetime'] = df['Timestamp'].apply(datetime.fromtimestamp)

In [None]:
df.set_index('Datetime', inplace=True)

In [None]:
df.drop(columns=['Timestamp', 'Open', 'High', 'Low', 'Volume_(BTC)', 'Volume_(Currency)', 'Weighted_Price'], inplace=True)

In [None]:
df.head()

In [None]:
# Resample to just hourly: each hourly bin keeps the last value seen in that
# hour. Hours with no rows at all come out as NaN, which is why the NaN share
# is checked in the cells below.
hourly_df = df.resample('H').last()

In [None]:
hourly_df.head()

In [None]:
# How many NaN do we have?
print(f"The percentage of NaN values is {100*hourly_df['Close'].isna().sum()/hourly_df.shape[0]:.2f}%")

In [None]:
recent_df = hourly_df[hourly_df.index > '2018-01-01']

In [None]:
recent_df.head()

In [None]:
print(f"The percentage of recent NaN values is {100*recent_df['Close'].isna().sum()/recent_df.shape[0]:.2f}%")

In [None]:
# After 2018 not many hourly close prices are missing. We could impute each
# missing close by carrying the previous available close forward.
# Show the rows that are still missing a close:
recent_df[recent_df['Close'].isna()]

In [None]:
# Forward-fill: each missing hourly value takes the last observed value before it.
# DataFrame.ffill() is the replacement for fillna(method='ffill'), which pandas
# has deprecated. Leading NaNs (nothing earlier to copy from) stay NaN.
int_df = recent_df.ffill()

In [None]:
int_df[int_df['Close'].isna()]

In [None]:
recent_df[(recent_df.index > '2020-04-25 15:00') & (recent_df.index < '2020-04-25 23:00') ]

In [None]:
int_df[(int_df.index > '2020-04-25 15:00') & (int_df.index < '2020-04-25 23:00') ]

In [None]:
len(int_df)

In [None]:
int_df.tail()

In [None]:
int_df[int_df.index > '2020-09-01']

## Some config

Here are a bunch of LSTM helper functions

In [None]:
def split_data(data, training_size=0.8):
    """
    Cut ``data`` into a leading training part and a trailing test part.

    Order is preserved and nothing is shuffled here: earlier points are used
    to predict later ones, so the test part stands for the unseen future.
    Shuffling of individual windows happens later, in the windowing step.

    Params:
        data: any sliceable sequence that supports ``len``
        training_size: fraction of ``data`` that goes to the training part,
            e.g. 0.8 keeps the first 80% for training

    Returns:
        (training, test): two slices of ``data``; the cut index is
        ``int(training_size * len(data))``, i.e. truncated towards zero
    """
    cut = int(training_size * len(data))
    training, test = data[:cut], data[cut:]
    return training, test

In [None]:
def windowed_dataset(series, shuffle_buffer, window_len, batch_size, window_scaling=False):
    """
    Turn a 1-D series into shuffled, batched (inputs, label) training examples.

    For a series like [1, 2, 3, 4, 5, 6] and window_len=2, sliding windows of
    length window_len + 1 are cut with a step of one (the extra element is
    the label):

      [1, 2, 3]
      [2, 3, 4]
      [3, 4, 5]
      [4, 5, 6]

    Incomplete windows at the end are dropped. The windows are shuffled and
    each one is then split into input/target, e.g. [[1, 2], [3]], ready to
    feed into model.fit. Finally the examples are grouped into batches of
    batch_size.

    Params:
        series: the series upon which we perform the windowing
        shuffle_buffer: size of the buffer when shuffling
            (see https://www.tensorflow.org/api_docs/python/tf/data/Dataset?version=nightly#shuffle)
        window_len: how many previous elements to take into account when predicting the next
        batch_size: https://www.tensorflow.org/api_docs/python/tf/data/Dataset?version=nightly#batch
        window_scaling: when True every window (label included) is rescaled
            relative to its own first element: w / w[0] - 1

    Returns:
        A tf.data.Dataset of (inputs, label) batches with prefetch(1) applied.
    """
    # Window length including the trailing label element.
    span = window_len + 1

    def rescale_to_first(window):
        # Instead of scaling the whole series, express each window as a
        # relative change from its initial element.
        return (window / window[0]) - 1

    def split_features_label(window):
        # Everything but the last element is the input; the last is the target.
        return window[:-1], window[-1]

    # Add a trailing feature axis: shape (N,) -> (N, 1).
    column = tf.expand_dims(series, axis=-1)

    # One dataset element per time step.
    dataset = tf.data.Dataset.from_tensor_slices(column)

    # window() yields nested datasets; drop_remainder=True discards the short
    # windows at the tail, e.g. for [1..5] and size 3: [1,2,3], [2,3,4], [3,4,5].
    # https://stackoverflow.com/questions/55429307/how-to-use-windows-created-by-the-dataset-window-method-in-tensorflow-2-0
    dataset = dataset.window(span, shift=1, drop_remainder=True)

    # Flatten each nested window dataset into a single tensor of length `span`.
    dataset = dataset.flat_map(lambda sub: sub.batch(span))

    if window_scaling:
        dataset = dataset.map(rescale_to_first)

    # Randomise the order of the windows (not the order inside a window).
    dataset = dataset.shuffle(shuffle_buffer)

    dataset = dataset.map(split_features_label)

    return dataset.batch(batch_size).prefetch(1)

In [None]:
def preprocessing(df, scaler='standard', window_scaling=False, colname='Close', window_len=5, start_date=None, end_date=None, shuffle_buffer=1000,
                  batch_size=128, training_size=0.8, **kwargs):
    """
    Data preprocessing.
     - First keep only data between the start and end date (both inclusive)
     - Take the ``colname`` price column
     - Split into training and test sets (order preserved)
     - Optionally scale: the scaler is fitted on the training set only and
       then applied to both sets
     - Build the windowed/batched training dataset

    NOTE: this function shares its name with the ``sklearn.preprocessing``
    module imported at the top of the notebook, so defining it rebinds that
    name.

    Params:
        df - The price dataframe, indexed by datetime
        scaler - 'standard', 'robust' or 'minmax'; any other value (e.g. None)
            means no scaling
        window_scaling - forwarded to windowed_dataset: normalise each window
            by its own first element
        colname - The column we want to make predictions for (close prices)
        window_len - how many elements to use from the series when predicting the next
        start_date/end_date - The date range to model over (for example we may want to exclude
            the early days of bitcoin with the long tail)
        shuffle_buffer - buffer size for shuffling
        batch_size - size of batches
        training_size - fraction of the rows used for training. This used to
            be swallowed by **kwargs and ignored; it is now forwarded to
            split_data (the default equals split_data's default)
        **kwargs - extra config keys, accepted and ignored so that a whole
            config dict can be unpacked into this call

    Returns:
        sc - the fitted scaler (use it to do inverse transform later), or None
        windowed_training_data - the windowed dataset and target labels to train the NN on
        training_data - the (scaled) training series as a 1-D array
        test_data - the (scaled) test series as a 1-D array
        training_dates - the dates of the training targets (first window_len rows dropped)
        test_dates - the dates of the test targets (first window_len rows dropped)
    """

    # Date range of interest
    temp_df = df
    if start_date is not None:
        temp_df = temp_df[temp_df.index >= start_date]
    if end_date is not None:
        temp_df = temp_df[temp_df.index <= end_date]

    # The price series we model
    prices_df = temp_df[colname]

    # Split into training/test datasets
    training_df, test_df = split_data(prices_df, training_size)

    # Pick the scaler; it is fitted on the training set only because we are
    # not supposed to know anything about the test set.
    if scaler == 'standard':
        print('Standard scaler')
        sc = StandardScaler()
    elif scaler == 'robust':
        print('Robust scaler')
        sc = RobustScaler()
    elif scaler == 'minmax':
        print('MinMax scaler')
        sc = MinMaxScaler()
    else:
        sc = None

    if sc is not None:
        # Fit on training, transform only the test. The scalers expect a 2-D
        # column, hence the reshape; flatten() brings it back to 1-D.
        training_data = sc.fit_transform(training_df.values.reshape(-1, 1)).flatten()
        test_data = sc.transform(test_df.values.reshape(-1, 1)).flatten()
    else:
        print('No scaling')
        training_data = training_df.values
        test_data = test_df.values

    # Windowed/batched training data to feed the model
    windowed_training_data = windowed_dataset(training_data, shuffle_buffer,
                                              window_len, batch_size, window_scaling=window_scaling)

    # Dates aligned with the prediction targets: the first window_len points
    # only ever act as inputs, so they have no prediction to plot.
    training_dates = training_df.iloc[window_len:].index
    test_dates = test_df.iloc[window_len:].index

    return sc, windowed_training_data, training_data, test_data, training_dates, test_dates
    

In [None]:
def build_model(output_size, neurons, activ_func, dropout, loss, optimizer):
    """
    Build and compile the Keras model: a causal Conv1D layer followed by two
    LSTM layers, each followed by dropout, and a linear Dense output layer.

    Params:
        output_size: units of the final Dense layer, e.g. 1 to predict 1 point in the future
        neurons: how many units for each LSTM layer
        activ_func: activation function for the LSTM layers, e.g. tanh
        dropout: dropout rate applied after each LSTM layer
        loss: loss function to use, e.g. mse
        optimizer: e.g. adam

    Returns:
        The compiled model (its summary is printed as a side effect)
    """
    layers = tf.keras.layers

    stack = [
        # Input is a sequence of any length with a single feature per step.
        layers.Conv1D(filters=25, kernel_size=5,
                      strides=1, padding="causal",
                      activation="relu",
                      input_shape=[None, 1]),
        # First LSTM hands the full sequence on to the next recurrent layer.
        layers.LSTM(neurons, return_sequences=True, activation=activ_func),
        layers.Dropout(dropout),
        # Last LSTM only emits its final state for the Dense head.
        layers.LSTM(neurons, return_sequences=False, activation=activ_func),
        layers.Dropout(dropout),
        layers.Dense(units=output_size, activation='linear'),
    ]

    model = tf.keras.models.Sequential(stack)
    model.compile(loss=loss, optimizer=optimizer, metrics=['mae'])
    model.summary()
    return model

In [None]:
def model_forecast(model, series, window_len):
    """
    Run the trained model over every sliding window of ``series``.

    The series is cut into windows of exactly window_len elements (step of
    one, incomplete tail windows dropped) and the model predicts the value
    following each window. Nothing is shuffled: this is prediction, not
    training, and the output must stay aligned with the actual prices.

    Parameters:
        model: the ML model trained
        series: the series on which to make predictions
        window_len: size of our window for making preds, e.g. previous 5 elem to predict next perhaps

    Returns:
        Whatever ``model.predict`` returns for the batched windows
    """
    # Add a trailing feature axis: shape (N,) -> (N, 1).
    column = tf.expand_dims(series, axis=-1)

    # Windows are window_len long (no +1): only inputs are needed here, the
    # label is what the model predicts.
    windows = (
        tf.data.Dataset.from_tensor_slices(column)
        .window(window_len, shift=1, drop_remainder=True)
        .flat_map(lambda sub: sub.batch(window_len))
    )

    batched = windows.batch(32).prefetch(1)
    return model.predict(batched)

In [None]:
def denormalize_forecast(forecast, orig_data):
    """
    Undo window normalisation on a sequence of predictions.

    Prediction number n was made from the window that starts at
    ``orig_data[n]`` and is expressed relative to that first element, so it
    is mapped back with ``(p + 1) * orig_data[n]``.

    Params:
      forecast: iterable of predictions, each normalised by the first
                element of its window in the original data
      orig_data: the original dataset used to make the predictions,
                 indexable by position

    Returns:
      A list with one de-normalised value per prediction
    """
    return [(pred + 1) * orig_data[pos] for pos, pred in enumerate(forecast)]

In [None]:
def display_results(model, scaler, dataset, dates, window_len, output_size):
    """
    Forecast over ``dataset``, plot predicted against actual prices and print
    the mean absolute error.

    The predictions are mapped back to prices first: through
    ``scaler.inverse_transform`` when a scaler is given, otherwise through
    ``denormalize_forecast`` (this assumes the model was trained with window
    scaling; with no scaling of any kind that branch would distort the
    predictions).

    The MAE is computed on the price columns so that both series are in the
    same units as the plot. It used to be computed on ``y`` vs ``yhat``,
    which in the no-scaler branch compared raw prices with normalised
    predictions.

    Params:
        model: the trained model
        scaler: the fitted scaler used to invert the normalization, or None
        dataset: 1-D numpy array, e.g. the train or the test set
        dates: the dates for which we expect predictions (one per target)
        window_len: how many previous points used when predicting the next
        output_size: number of trailing predictions to drop; see the comment
            in the body (the alignment only works out for output_size=1)
    Returns:
        None
    """
    preds = model_forecast(model, dataset, window_len)

    # E.g. if window_len is 5 we have predictions for [5:], since
    # [0, 1, 2, 3, 4] -> [5] etc. With output_size=1 the final prediction is
    # dropped: it is made from the last window_len elements and there is no
    # actual value after them to compare it with.
    res_df = pd.DataFrame({'y': dataset.flatten()[window_len:], 'yhat': preds.flatten()[:-output_size]})

    # Want to inverse the normalization transform
    if scaler is not None:
        res_df['y_prices'] = scaler.inverse_transform(res_df['y'].values.reshape(-1, 1)).flatten()
        res_df['yhat_prices'] = scaler.inverse_transform(res_df['yhat'].values.reshape(-1, 1)).flatten()
    else:
        # Window scaling: the dataset itself is unscaled, only the
        # predictions are relative to the first element of their window.
        res_df['y_prices'] = res_df['y']
        res_df['yhat_prices'] = denormalize_forecast(res_df['yhat'], dataset)

    # Plot
    fig = go.Figure()
    fig.add_scatter(x=dates, y=res_df['y_prices'], mode='lines', name="Actual")
    fig.add_scatter(x=dates, y=res_df['yhat_prices'], mode='lines', name="Predicted")
    fig.update_layout(template='plotly_dark',
                      xaxis_title="Time",
                      yaxis_title="Price",)
    fig.show()

    # Print the MAE, in price units
    mae = mean_absolute_error(res_df['y_prices'], res_df['yhat_prices'])
    print(f'The MAE is {mae}')

In [None]:
def display_history(hist):
    """
    Plot the per-epoch training loss and MAE recorded by Keras.

    Params:
       hist: the history obj returned by Keras; its ``history`` dict must
             contain the 'loss' and 'mae' series

    Returns:
        None
    """
    metrics = hist.history

    # The loss is the figure's initial trace; the MAE is added on top of it.
    fig = go.Figure(data=go.Scatter(y=metrics['loss']))
    mae_trace = go.Scatter(y=metrics['mae'],
                           mode='lines+markers',
                           name='MAE')
    fig.add_trace(mae_trace)

    fig.update_layout(template='plotly_dark',
                      xaxis_title="Time",
                      yaxis_title="Loss",)

    fig.show()

## Attempt 1

In [None]:
## Config dict that defines this attempt.
## Individual entries are passed to build_model and model.fit; the whole dict
## is also unpacked into preprocessing(**config1).
config1 = {
    "neurons": 20,                 # number of hidden units in each LSTM layer
    "activation_function": 'tanh',   # activation function for the LSTM layers (the Dense layer is always linear)
    "loss": 'mse',                   # loss function for calculating the gradient, in this case Mean Squared Error. Could be mae
    "optimizer": 'adam',              # optimizer for applying gradient descent
    "dropout": 0.25,                 # dropout ratio used after each LSTM layer to avoid overfitting
    "batch_size": 128,               # number of windows per training batch
    "epochs": 40,                    # number of epochs passed to model.fit
    "window_len": 15,                 # int look-back window: how many past points form a single input sample
    "training_size": 0.8,            # proportion of data to be used for training
    "shuffle_buffer": 1000,          # when shuffling the windowed Dataset, how many elements the shuffle buffer holds
    "output_size": 1,                # units of the final Dense layer, i.e. points predicted per window
    "start_date": None,              # inclusive lower date bound; None means no bound
    "end_date": None,                # inclusive upper date bound; None means no bound
}

In [None]:
# Clean up the memory
tf.keras.backend.clear_session()
btc_model1 = build_model(config1['output_size'], config1['neurons'], config1['activation_function'],
                         config1['dropout'], config1['loss'], config1['optimizer'])

In [None]:
scaler1, model_training_data1, training_data1, test_data1,training_dates1, test_dates1 = preprocessing(int_df, scaler='minmax', **config1)

In [None]:
training_data1.shape

In [None]:
# The training data is a tf.data.Dataset that is already batched, so no
# batch_size is passed to fit; Keras says not to specify it for dataset
# inputs because they generate their own batches.
btc_history1 = btc_model1.fit(model_training_data1, epochs=config1['epochs'], verbose=1)

In [None]:
display_history(btc_history1)

In [None]:
btc_model1.save('btcmodel_prices_minmax_1.p')

In [None]:
display_results(btc_model1, scaler1, training_data1, training_dates1, config1['window_len'],
                config1['output_size'])

In [None]:
display_results(btc_model1, scaler1, test_data1, test_dates1, config1['window_len'],
                config1['output_size'])

# Attempt 2

Let's narrow the date window and just train from 2020-09-01 and see if we can get better on the small test set which will be the tail of 2020

In [None]:
## Config dict that defines this attempt: more LSTM units, more epochs and a
## start date that narrows the data window.
config2 = {
    "neurons": 512,                 # number of hidden units in each LSTM layer
    "activation_function": 'tanh',   # activation function for the LSTM layers (the Dense layer is always linear)
    "loss": 'mse',                   # loss function for calculating the gradient, in this case Mean Squared Error. Could be mae
    "optimizer": 'adam',              # optimizer for applying gradient descent
    "dropout": 0.25,                 # dropout ratio used after each LSTM layer to avoid overfitting
    "batch_size": 128,               # number of windows per training batch
    "epochs": 50,                    # number of epochs passed to model.fit
    "window_len": 15,                 # int look-back window: how many past points form a single input sample
    "training_size": 0.8,            # proportion of data to be used for training
    "shuffle_buffer": 1000,          # when shuffling the windowed Dataset, how many elements the shuffle buffer holds
    "output_size": 1,                # units of the final Dense layer, i.e. points predicted per window
    "start_date": '2020-09-01',      # inclusive lower date bound: only model data from this date on
    "end_date": None,                # inclusive upper date bound; None means no bound
}

In [None]:
# Clean up the memory
tf.keras.backend.clear_session()
btc_model2 = build_model(config2['output_size'], config2['neurons'], config2['activation_function'],
                         config2['dropout'], config2['loss'], config2['optimizer'])

In [None]:
# Attempt 2 must be preprocessed with its own config: config2 carries the
# '2020-09-01' start date that narrows the data window. This call used to
# unpack config1, so the narrowing never took effect.
scaler2, model_training_data2, training_data2, test_data2, training_dates2, test_dates2 = preprocessing(int_df, scaler='minmax', **config2)

In [None]:
# The training data is a tf.data.Dataset that is already batched, so no
# batch_size is passed to fit; Keras says not to specify it for dataset
# inputs because they generate their own batches.
btc_history2 = btc_model2.fit(model_training_data2, epochs=config2['epochs'], verbose=1)

In [None]:
display_results(btc_model2, scaler2, training_data2, training_dates2, config2['window_len'],
                config2['output_size'])

In [None]:
display_results(btc_model2, scaler2, test_data2, test_dates2, config2['window_len'],
                config2['output_size'])