## PCA for dimensionality reduction

In [1]:
# Preliminary code needed for importing from parent directory
import os,sys,inspect
# A notebook cell has no reliable source file: recent ipykernel versions compile
# each cell from a temporary file, so inspect.getfile(inspect.currentframe())
# resolves to a temp directory rather than this notebook's folder. Use the
# kernel's working directory instead (assumes the kernel was started in the
# notebook's directory, which is the Jupyter default).
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Keep the parent directory at the front of sys.path, but do not stack a new
# duplicate entry every time this cell is re-run.
if sys.path[:1] != [parentdir]:
    sys.path.insert(0, parentdir)

# Import Karan's data API
from data import series

import numpy as np

# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html?highlight=pca#sklearn.decomposition.PCA
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler
from sklearn.preprocessing import StandardScaler

## Generate Features & check stationarity

In [4]:
# Load the BTC-USDT series on 1h candles through the project's data API and
# keep the closing prices for feature generation.
symbol, timeframe = 'BTCUSDT', '1h'

btc = series.DataSeries(symbol, timeframe)
data = btc.getData()

price_close = data['close']

# Lagged close-price matrix: row 0 is the pivot timeframe, row k is the close
# k timeframes earlier (k = 1..5).
prev5 = np.concatenate(
    [
        # lag 0 -> [5:], lag 1 -> [4:-1], ..., lag 5 -> [0:-5]
        price_close[np.newaxis, 5 - lag:(-lag or None)]
        for lag in range(6)
    ],
    axis=0,
)

# Truth values (y): True where the pivot close (row 0) is strictly above the
# lowest of the five preceding closes (rows 1..5), False otherwise.
y = prev5[0] > prev5[1:].min(axis=0)

# Report the class balance; each count is printed as a 1-tuple shape.
print('timeframe:', timeframe)
print('number of times where trend is up: ', y[y].shape)
print('number of times where trend is down: ', y[~y].shape)

# Indicators to register on the series, in registration order.
# Each entry is (indicator name, positional arguments forwarded to addIndicator).
indicator_specs = [
    ('RSI', (data['close'], 14)),  # 14-timeframe RSI
    ('EMA', (data['close'], 30)),  # 30-timeframe EMA
    # ('EMA', (data['close'], 50)),  # 50-timeframe EMA (disabled)
    ('SMA', (data['close'], 30)),
    # ADX - Average Directional Movement Index
    ('ADX', (data['high'], data['low'], data['close'], 14)),  # 14-timeframe ADX
    # ATR - Average True Range
    ('ATR', (data['high'], data['low'], data['close'], 14)),  # 14-timeframe ATR
    # MFI: https://www.investopedia.com/terms/m/mfi.asp
    ('MFI', (data['high'], data['low'], data['close'], data['volume'], 10)),  # 10-timeframe MFI
    # MACD: https://www.investopedia.com/terms/m/macd.asp
    # ('MACD', (data['close'], 12, 26)),  # fast = 12, slow = 26 (disabled)
]
for indicator_name, indicator_args in indicator_specs:
    btc.addIndicator(indicator_name, *indicator_args)

# Print the name of every indicator now held by the series.
indicators = btc.getIndicators()
for indicator in indicators.keys():
    print(indicator)

# NOTE(review): time_cut is not referenced in the cells shown here --
# presumably consumed further down; confirm.
time_cut = 50

timeframe: 1h
number of times where trend is up:  (24655,)
number of times where trend is down:  (6013,)
RSI
EMA
SMA
ADX
ATR
MFI


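### Why standardization is needed before PCA
PCA picks the directions of largest variance, so features measured on larger scales would dominate the principal components unless every feature is first rescaled to zero mean and unit variance.
* https://www.reneshbedre.com/blog/principal-component-analysis.html#standardization
* https://towardsai.net/p/data-science/how-when-and-why-should-you-normalize-standardize-rescale-your-data-3f083def38ff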

In [5]:
# Two-step pipeline: standardize the features, then project them onto
# 5 principal components.
pipeline = Pipeline(
    steps=[
        ('scaling', StandardScaler()),
        ('pca', PCA(n_components=5)),
    ]
)

# Not fitted yet.
# NOTE(review): `data` is indexed by string keys above, so a 2-D feature matrix
# presumably has to be assembled before this call can work -- confirm.
# pipeline.fit_transform(data)