In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found there.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import gc
import traceback
import datatable as dt
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import pandas as pd, numpy as np
from tensorflow.keras import layers
import tensorflow_probability as tfp
import tensorflow.keras.backend as K
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
# from ta import add_all_ta_features
# from ta.utils import dropna

In [3]:
# Apply seaborn's default theme first, then layer the notebook-wide matplotlib defaults on top.
sns.set()

# Bold 18pt axes titles, 16pt axis labels, 12pt legend and tick text, serif font, 16x10 figures.
# 'axes.formatter.limits' keeps tick labels in plain notation; scientific notation is only
# used for magnitudes below 10^-5 or above 10^7.
plt.rcParams.update({
    'figure.figsize': (16,10),
    'font.family': 'serif',
    'axes.titleweight': 'bold',
    'axes.titlesize': 18,
    'axes.labelsize': 16,
    'legend.fontsize': 12,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'axes.formatter.limits':'-5, 7',
})

# Loading data

In [4]:
# For Kaggle use only
# Each pair is read from its own CSV in the dataset folder and its first rows are
# printed immediately as a quick sanity check of the columns.
directory_path = "/kaggle/input/392-crypto-currency-pairs-at-minute-resolution/"
BTC = pd.read_csv(directory_path + 'btcusd.csv')
print(BTC.head())
ETH = pd.read_csv(directory_path+'ethusd.csv')
print(ETH.head())
LTC = pd.read_csv(directory_path + 'ltcusd.csv')
print(LTC.head())

In [5]:
# Convert to human timestamp: the raw 'time' values are interpreted as epoch milliseconds.
for frame in (BTC, ETH, LTC):
    frame['time'] = pd.to_datetime(frame['time'], unit='ms')

In [6]:
# Summary statistics for BTC; include='all' asks describe() to cover every column.
BTC.describe(include='all')

In [7]:
# Summary statistics for ETH; include='all' asks describe() to cover every column.
ETH.describe(include='all')

In [8]:
# Summary statistics for LTC; include='all' asks describe() to cover every column.
LTC.describe(include='all')

In [9]:
# Keep an independent snapshot of each frame before any further processing touches it.
BTC_copy, ETH_copy, LTC_copy = (frame.copy() for frame in (BTC, ETH, LTC))

In [10]:
# Set time as index for plotting.
# Each indexed frame is built from its untouched *_copy snapshot rather than by calling
# set_index(..., inplace=True) on the frame itself: the in-place form raises KeyError when
# the cell is run a second time, because 'time' is then no longer a column.
BTC = BTC_copy.set_index('time')
ETH = ETH_copy.set_index('time')
LTC = LTC_copy.set_index('time')
BTC

# EDA

In [11]:
# BTC 'close' series over the whole history, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(BTC.index, BTC.close)
ax.set_xlabel('Date')
ax.set_ylabel('Price in USD')
ax.set_title('Price of BTC over years')
plt.show()

In [12]:
# ETH 'close' series over the whole history, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(ETH.index, ETH.close)
ax.set_xlabel('Date')
ax.set_ylabel('Price in USD')
ax.set_title('Price of ETH over years')
plt.show()

In [13]:
# LTC 'close' series over the whole history, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(LTC.index, LTC.close)
ax.set_xlabel('Date')
ax.set_ylabel('Price in USD')
ax.set_title('Price of LTC over years')
plt.show()

In [14]:
# Print the per-column count of missing values for each pair, in BTC / ETH / LTC order.
for frame in (BTC, ETH, LTC):
    print(frame.isnull().sum())

There are no null values at all, so there is no need to drop any data.

In [15]:
# Use only data from 2020-01-01 onwards (roughly the last 2 years) for modelling.
# .loc makes the label-based row slice explicit, and .copy() gives each subset its own data
# so that adding feature columns later neither triggers SettingWithCopyWarning nor risks
# writing back into the full-history frames.
BTC_2yr = BTC.loc['2020-01-01':].copy()
ETH_2yr = ETH.loc['2020-01-01':].copy()
LTC_2yr = LTC.loc['2020-01-01':].copy()
BTC_2yr

# Feature engineering

In [16]:
def upper_shadow(df):
    """Return the upper wick of each candle: 'high' minus the larger of 'close' and 'open'."""
    return df['high'] - np.maximum(df['close'], df['open'])


def lower_shadow(df):
    """Return the lower wick of each candle: the smaller of 'close' and 'open' minus 'low'."""
    return np.minimum(df['close'], df['open']) - df['low']


def get_features(df, row = False, price_col = None):
    """Build candle-shape and rolling-window features on a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'open', 'close', 'high' and 'low' columns. It is NOT modified.
    row : bool, default False
        Unused; kept so existing callers that pass it keep working.
    price_col : str or None, default None
        Column the rolling statistics are computed on. ``None`` keeps the historical
        behaviour of using the column at position 1.
        NOTE(review): position 1 is presumably 'close' for these CSVs - confirm, and
        prefer passing the column name explicitly.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus 'spread', 'upper_shadow',
        'lower_shadow', 'close-open', 'SMA_7', 'SMA_14', 'SMA_21' and 'STD_DEV_7', with
        every row that contains a NaN removed (this includes the leading rows for which
        the 21-row window is not yet full).
    """
    # Work on a copy: assigning columns to the caller's frame mutated it in place and
    # raised SettingWithCopyWarning when that frame was a slice of a larger one.
    df_feat = df.copy()

    # Resolve the price series once instead of re-reading the positional column per feature.
    price = df_feat.iloc[:, 1] if price_col is None else df_feat[price_col]

    df_feat['spread'] = df_feat['high'] - df_feat['low']
    df_feat['upper_shadow'] = upper_shadow(df_feat)
    df_feat['lower_shadow'] = lower_shadow(df_feat)
    df_feat['close-open'] = df_feat['close'] - df_feat['open']

    # Simple moving averages; the windows count ROWS of the frame, not calendar days.
    for window in (7, 14, 21):
        df_feat[f'SMA_{window}'] = price.rolling(window=window).mean()

    # Rolling standard deviation over the last 7 rows.
    df_feat['STD_DEV_7'] = price.rolling(window=7).std()

    # Drop the NA rows created by the rolling indicators.
    return df_feat.dropna()

In [17]:
# Add the engineered candle and rolling-window columns to the BTC subset.
# NOTE(review): the result overwrites its own input, so re-running this cell recomputes the
# rolling features on the already-trimmed frame and the NaN drop removes more leading rows
# each time. Restart and run from the subset cell to get a clean result.
BTC_2yr = get_features(BTC_2yr)
BTC_2yr

In [18]:
# 'close' series of the modelling subset, labelled like the other price charts so the
# figure can be read on its own.
plt.plot(BTC_2yr.index, BTC_2yr['close'])
plt.xlabel('Date')
plt.ylabel('Price in USD')
plt.title('Price of BTC since 2020')
plt.show()

In [19]:
# Target: the close of the NEXT row, aligned with the current row's features.
# Using the same-row close as the target leaks it into the features: X keeps 'open' and
# 'close-open', and open + (close-open) == close, so a linear model reproduces the target
# exactly and its test error says nothing about predictive power.
# NOTE(review): "next row" is the next recorded candle - presumably one minute later;
# confirm the data has no gaps if a fixed horizon matters.
BTC_y = BTC_2yr['close'].shift(-1).iloc[:-1]
# The last row has no following close to predict, so it is dropped from the features too.
BTC_X = BTC_2yr.drop('close', axis=1).iloc[:-1]
BTC_X

# Modelling

## Baseline model: linear regression

In [20]:
# Chronological split: the first 70% of rows are used for training, the remaining 30% for testing.
index_70pct = int(len(BTC_X) * 0.7)
BTC_X_train, BTC_X_test = BTC_X.iloc[:index_70pct], BTC_X.iloc[index_70pct:]
BTC_y_train, BTC_y_test = BTC_y.iloc[:index_70pct], BTC_y.iloc[index_70pct:]
print(BTC_X_train)
print(BTC_y_test)

In [21]:
# Fit the linear-regression baseline on the training split.
linreg = LinearRegression()
linreg.fit(BTC_X_train, BTC_y_train)

In [22]:
# Predict the target for every row of the held-out test features.
BTC_y_pred = linreg.predict(BTC_X_test)
BTC_y_pred

In [23]:
# Number of held-out targets the predictions are compared against.
len(BTC_y_test)

In [24]:
# Mean squared error on the held-out split.
# scikit-learn's signature is mean_squared_error(y_true, y_pred), so the actual values go
# first. The plain MSE is symmetric, but keeping the documented order avoids wrong results
# if sample weights or an asymmetric metric are swapped in later.
mse = mean_squared_error(BTC_y_test, BTC_y_pred)
mse

In [25]:
# Training prices, the held-out actual prices, and the model's predictions for that period.
# The blue line is the model output, so it is labelled as predictions; the actual test
# prices get their own line so the two can be compared.
plt.plot(BTC_y_train.index, BTC_y_train, color = 'y', label ='Training prices')
plt.plot(BTC_y_test.index, BTC_y_test, color = 'g', label ='Test prices')
plt.plot(BTC_y_test.index, BTC_y_pred, color = 'b', label ='Predicted prices')
plt.xlabel('Date')
plt.ylabel('Price in USD')
plt.title('BTC linear regression: actual vs predicted prices')
plt.legend()
plt.show()