In [40]:
# Removing warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

# Data Management
import pandas as pd
import numpy as np
from pandas_datareader.data import DataReader
from ta import add_all_ta_features

# Statistics
from statsmodels.tsa.stattools import adfuller

# Unsupervised Machine Learning
from sklearn.decomposition import PCA

# Supervised Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

# Reporting
import matplotlib.pyplot as plt

# Formatting
import json
import requests
from datetime import datetime

### Initial Data Extraction

In [52]:
# Data Extraction
symbol = "BTCUSD"
interval = "1d"
limit = 1000
candles = requests.get(f"https://api.binance.us/api/v3/klines?symbol={symbol}&interval={interval}&limit={limit}")
candles_json = []
if candles.status_code == 200:
    candles_json = json.loads(candles.text)
else:
    print(f"Candlestick data not received: ${candles.status_code} received instead!")
    
for i in range(len(candles_json)):
    for j in range(len(candles_json[i])):
        if j == 0 or j == 6:
            temp_val = candles_json[i][j]
            temp_date = datetime.fromtimestamp(temp_val/1000)
            candles_json[i][j] = temp_date.strftime('%m/%d/%Y')

df_columns = ["Open Date", "Open", "High", "Low", "Close", "Volume", "Close Date", "Quote Asset Volume",
             "Number of Trades", "Taker Buy Base Asset Volume", "Taker Buy Quote Asset Volume", "Ignore"]
df = pd.DataFrame(data=candles_json, columns=df_columns)

# Set index to date
df = df.set_index("Open Date")
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Close Date,Quote Asset Volume,Number of Trades,Taker Buy Base Asset Volume,Taker Buy Quote Asset Volume,Ignore
Open Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
05/19/2020,9780.25,9832.0,9275.82,9512.39,700.740544,05/20/2020,6717332.556,12043,328.772321,3153480.2502,0
05/20/2020,9506.63,9561.15,8800.0,9055.57,874.97111,05/21/2020,7991602.167,14845,398.856311,3646632.353,0
05/21/2020,9054.75,9259.08,8934.0,9166.06,556.309475,05/22/2020,5068984.2303,9744,278.647637,2537846.2199,0
05/22/2020,9166.06,9312.7,9087.11,9183.4,264.651102,05/23/2020,2439152.1853,6010,132.287458,1219085.7827,0
05/23/2020,9183.4,9297.57,8688.0,8706.69,507.398787,05/24/2020,4570583.5813,10221,203.341634,1836257.2152,0


### Data Preprocessing

In [53]:
# Change data types to floats
df["Open"] = pd.to_numeric(df["Open"])
df["High"] = pd.to_numeric(df["High"])
df["Low"] = pd.to_numeric(df["Low"])
df["Close"] = pd.to_numeric(df["Close"])
df["Volume"] = pd.to_numeric(df["Volume"])

# Filters out data frame with only relevant data points
df = df[["Open", "High", "Low", "Close", "Volume"]]

# Add TA
df = add_all_ta_features(df, open="Open", high="High", low="Low", close="Close", volume="Volume", fillna=True)
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,volume_adi,volume_obv,volume_cmf,volume_fi,volume_em,...,momentum_ppo,momentum_ppo_signal,momentum_ppo_hist,momentum_pvo,momentum_pvo_signal,momentum_pvo_hist,momentum_kama,others_dr,others_dlr,others_cr
Open Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
05/19/2020,9780.25,9832.0,9275.82,9512.39,700.740544,-104.623494,700.740544,-0.149304,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9512.39,-69.747462,0.0,0.0
05/20/2020,9506.63,9561.15,8800.0,9055.57,874.97111,-392.019575,-174.230566,-0.248789,-399704.30247,-32476950000.0,...,-0.384463,-0.076893,-0.30757,1.947566,0.389513,1.558053,9271.699124,-4.802368,-4.921512,-4.802368
05/21/2020,9054.75,9259.08,8934.0,9166.06,556.309475,-154.080393,382.078909,-0.07227,-333822.740133,-4910594000.0,...,-0.589749,-0.179464,-0.410286,-0.112627,0.289085,-0.401712,9219.04829,1.220133,1.212749,-3.640831
05/22/2020,9166.06,9312.7,9087.11,9183.4,264.651102,-192.806102,646.730011,-0.080447,-285478.19867,8810887000.0,...,-0.730055,-0.289582,-0.440473,-5.310153,-0.830762,-4.47939,9200.210565,0.189176,0.188997,-3.458542
05/23/2020,9183.4,9297.57,8688.0,8706.69,507.398787,-669.090224,139.331224,-0.230397,-279250.18111,-24882630000.0,...,-1.240815,-0.479829,-0.760986,-6.54294,-1.973198,-4.569742,8950.090528,-5.190997,-5.330581,-8.470006


### Checking For Stationarity (Data Points that do not differ from the mean over time)

In [56]:
# Identify non-stationary columns
non_stationaries = []
for col in df.columns:
    dftest = adfuller(df[col].values)
    p_value = dftest[1]
    t_test = dftest[0] < dftest[4]["1%"]
    if p_value < 0.05 or not t_test:
        non_stationaries.append(col)
print(f"Non-stationaries Features Found: {len(non_stationaries)}")

Non-stationaries Features Found: 91


In [57]:
# Convert non-stationaries to stationaries
df_stationary = df.copy()
df_stationary[non_stationaries] = df_stationary[non_stationaries].pct_change()
df_stationary = df_stationary.iloc[1:]

In [58]:
# Find NaN Rows
na_list = df_stationary.columns[df_stationary.isna().any().tolist()]
df_stationary.drop(columns=na_list, inplace=True)

In [59]:
# Handle inf values
df_stationary.replace([np.inf, -np.inf], 0, inplace=True)
df_stationary.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,volume_adi,volume_obv,volume_cmf,volume_fi,volume_em,...,momentum_ppo,momentum_ppo_signal,momentum_ppo_hist,momentum_pvo,momentum_pvo_signal,momentum_pvo_hist,momentum_kama,others_dr,others_dlr,others_cr
Open Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
05/20/2020,-0.027977,-0.027548,-0.051297,-0.048024,0.248638,2.746955,-1.248638,0.666322,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.025303,-0.931146,0.0,0.0
05/21/2020,-0.047533,-0.031593,0.015227,0.012201,-0.364197,-0.606957,-3.19295,-0.709514,-0.164826,-0.848798,...,0.533958,1.333958,0.333958,-1.05783,-0.25783,-1.25783,-0.005679,-1.254069,-1.246418,-0.241868
05/22/2020,0.012293,0.005791,0.017138,0.001892,-0.524274,0.251334,0.692661,0.113157,-0.144821,-2.794261,...,0.237907,0.613595,0.073577,46.147984,-3.873765,10.150739,-0.002043,-0.844954,-0.844158,-0.050068
05/23/2020,0.001892,-0.001625,-0.04392,-0.05191,0.917237,2.470275,-0.78456,1.863949,-0.021816,-3.824077,...,0.699618,0.656969,0.727657,0.232157,1.375165,0.020171,-0.027186,-28.440016,-29.204514,1.449011
05/24/2020,-0.051393,-0.030818,-0.006118,0.022472,0.254361,-0.401567,4.567974,-0.509144,-0.206563,-0.59656,...,0.178828,0.409679,0.033269,-0.112576,0.388523,-0.328949,-0.002198,-1.432911,-1.416907,-0.242845


### Data Preprocessing - Scaling and Target Setting

In [61]:
# Set Target
df_stationary["TARGET"] = -1
df_stationary.loc[df_stationary["Close"].shift(-1) > df_stationary["Close"], "TARGET"] = 1
df_stationary.dropna(inplace=True)

In [62]:
# Split target from featureset
X = df_stationary.iloc[:, :-1]
y = df_stationary.iloc[:, -1]

In [63]:
# Feature Scaling
df_sc = df_stationary.copy()
X_fs = StandardScaler().fit_transform(X)

In [64]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X_fs, y, test_size=0.7, random_state=42)

### Unsupervised Learning - PCA Dimensionality Reduction

In [87]:
# PCA
n_components = 17
pca = PCA(n_components=n_components)
pca_result = pca.fit(X_train)
X_train_pca = pca_result.transform(X_train)
X_test_pca = pca_result.transform(X_test)

In [88]:
# Calculate the variance explained by Principle Components
print("Variance of each component: ", pca.explained_variance_ratio_)
print("\n Total variance explained: ", round(sum(list(pca.explained_variance_ratio_)) * 100, 2))

Variance of each component:  [0.12475779 0.10050543 0.09734391 0.06478563 0.04941176 0.04623053
 0.04540682 0.04401604 0.03935401 0.03645124 0.02832154 0.02540382
 0.02281844 0.02272733 0.02006395 0.01834682 0.01768307]

 Total variance explained:  80.36


In [89]:
# Creating columns
pca_cols = []
for i in range(n_components):
    pca_cols.append(f"PC_{i}")
pca_cols

['PC_0',
 'PC_1',
 'PC_2',
 'PC_3',
 'PC_4',
 'PC_5',
 'PC_6',
 'PC_7',
 'PC_8',
 'PC_9',
 'PC_10',
 'PC_11',
 'PC_12',
 'PC_13',
 'PC_14',
 'PC_15',
 'PC_16']

In [91]:
# Create and View Dataframe
df_pca = pd.DataFrame(data=X_train_pca, columns=pca_cols)
df_pca.head()

Unnamed: 0,PC_0,PC_1,PC_2,PC_3,PC_4,PC_5,PC_6,PC_7,PC_8,PC_9,PC_10,PC_11,PC_12,PC_13,PC_14,PC_15,PC_16
0,-2.226906,-0.874187,0.188759,-2.131936,1.377374,-0.764464,0.198488,0.257763,1.048275,-0.18394,0.601219,-0.587656,-0.477415,0.294179,-0.717208,-0.275216,0.129066
1,2.623138,-1.832555,1.515441,1.561581,0.210771,-0.850661,0.401006,-0.931756,0.014766,1.664756,3.98636,1.402284,-1.271961,0.605059,-0.168555,0.02945,-0.58747
2,-2.415648,0.115571,-0.849107,-2.075966,0.739141,-0.878941,0.568591,0.005164,0.821512,-0.056572,1.322068,-0.262263,0.028755,-0.490601,0.306895,-0.940849,-0.311643
3,3.392408,-0.772827,0.341247,-0.207115,0.088353,-0.453602,0.050967,-0.506326,0.293432,0.014074,0.918074,0.581437,-0.201066,0.021115,0.481773,-1.465042,-0.181078
4,-4.116028,1.802631,-0.907461,-2.830952,1.715316,-2.235737,0.998334,-0.038515,3.822732,0.158246,1.93359,0.345749,0.060627,0.356596,-0.002455,-0.713279,1.192893
