# EXPERIMENT ONE

#### Task :
1. Generate the features 
    - Time : Time of Day, Day of Week, Day of Month, Month
    - Indicators : Hawkes Process, ATR, EMA, RSI, Super Oscillator, Proximity to Liquidity Level
    - [FUTURE] Permutation Entropy for cluster labels
    - [FUTURE] HTF Features

2. Train the model

3. For every cluster label: 
    - Generate a signal array, applying the holding period as well
    - Generate the target array
        - Generate the returns. For every signal, assign the 0 or 1, if the 


In [None]:
# Import Libraries
from collections import deque
from pathlib import Path

import numpy as np
import pandas as pd
import pandas_ta as ta
from quantminer import Miner

aapl_path = Path.cwd().parent / 'data/aapl_h1.csv'
raw_data = pd.read_csv(aapl_path)

In [None]:
# Define the Hawkes process function
def hawkes_process(data: pd.Series, kappa: float=0.1):
    assert(kappa > 0.0)
    alpha = np.exp(-kappa)
    arr = data.to_numpy()
    output = np.zeros(len(data))
    output[:] = np.nan
    for i in range(1, len(data)):
        if np.isnan(output[i - 1]):
            output[i] = arr[i]
        else:
            output[i] = output[i - 1] * alpha + arr[i]
    return pd.Series(output, index=data.index) * kappa

def pivot_distances(data, window, pivots):
    df = data.copy()
    df = df.reset_index()

    # Initialize deques to store the 5 most recent pivot highs and lows
    recent_phigh = deque([[np.nan, np.nan]] * pivots, maxlen=pivots)
    recent_plow = deque([[np.nan, np.nan]] * pivots, maxlen=pivots)

    # Lists to store distances from recent pivot highs/lows for each row
    distances_from_pivot_highs = [[np.nan] * pivots] * (window * 2)
    distances_from_pivot_lows = [[np.nan] * pivots] * (window * 2)

    # Iterate over rows to identify pivots and calculate distances
    for row in df.itertuples():
        index = row.Index
        close = row.close
        
        if index < (window * 2):
            continue

        # Find new pivot highs/lows
        current_max = df['high'][index - window]
        current_min = df['low'][index - window]
        
        # Look window candles before and after
        max_range = df['high'][index - (window * 2) : index + 1]
        min_range = df['low'][index - (window * 2) : index + 1]

        # Check for / Store recent pivot high
        if current_max == max(max_range):
            # Replace the oldest pivot high in recent_phigh
            recent_phigh.append((index, current_max))
        
        # Check for pivot low
        if current_min == min(min_range):
            # Replace the oldest pivot low in recent_plow
            recent_plow.append((index, current_min))

        # # Clean up pivots that are surpassed by the current close
        # for i in range(len(recent_phigh)):
        #     if close > recent_phigh[i][1]:  # Check if the current close is higher than the pivot high
        #         recent_phigh[i] = (recent_phigh[i][0], np.nan)
        
        # for i in range(len(recent_plow)):
        #     if close < recent_plow[i][1]:  # Check if the current close is lower than the pivot low
        #         recent_plow[i] = (recent_plow[i][0], np.nan)

        # Calculate the distance between each points in recent_phigh/recent_plow and the current close
        ph_distances = [ph[1] - close if not np.isnan(ph[1]) else np.nan for ph in recent_phigh]
        pl_distances = [close - pl[1] if not np.isnan(pl[1]) else np.nan for pl in recent_plow]

        # Append the array of distances from the recent pivot highs/lows into their respective array
        distances_from_pivot_highs.append(ph_distances)
        distances_from_pivot_lows.append(pl_distances)

    distances_from_pivot_highs = np.array(distances_from_pivot_highs)
    distances_from_pivot_lows = np.array(distances_from_pivot_lows)

    for _ in range(pivots):
        p = _ + 1

        df[f'pivot_high_{p}'] = distances_from_pivot_highs[:, _]
        df[f'pivot_low_{p}'] = distances_from_pivot_lows[:, _]

        df[f'pivot_high_{p}'] = df[f'pivot_high_{p}'].fillna(0)
        df[f'pivot_low_{p}'] = df[f'pivot_low_{p}'].fillna(0)

    return df


def normalize_data(data):
    return np.sign(data) * np.log1p(np.abs(data))
    # return np.arcsinh(data)

def super_oscillator(data, length=10, multiplier=2.0, smooth=72):
    # Calculate Average True Range (ATR)
    atr = ta.atr(data['high'], data['low'], data['close'], length)
    
    # SuperTrend Calculation
    hl2 = (data['high'] + data['low']) / 2
    upper_band = hl2 + (multiplier * atr)
    lower_band = hl2 - (multiplier * atr)
    final_upper_band = upper_band.copy()
    final_lower_band = lower_band.copy()
    
    for i in range(1, len(data)):
        if data['close'].iloc[i-1] > final_upper_band.iloc[i-1]:
            final_upper_band.iloc[i] = min(upper_band.iloc[i], final_upper_band.iloc[i-1])
        if data['close'].iloc[i-1] < final_lower_band.iloc[i-1]:
            final_lower_band.iloc[i] = max(lower_band.iloc[i], final_lower_band.iloc[i-1])
    
    trend = (data['close'] > final_upper_band.shift(1)).astype(int) - (data['close'] < final_lower_band.shift(1)).astype(int)
    trend[trend == -1] = 0
    spt = trend * final_lower_band + (1 - trend) * final_upper_band
    
    # Oscillator Calculation
    osc = (data['close'] - spt) / (final_upper_band - final_lower_band)
    osc = osc.clip(-1, 1)
    
    # Adaptive Moving Average (AMA)
    alpha = 0.2  # Example fixed alpha value
    ama = osc.ewm(alpha=alpha).mean()
    
    # Histogram (EMA of the difference between osc and ama)
    hist = (osc - ama).ewm(span=smooth).mean()
    
    return osc, ama, hist

## PRE-STEP : FEATURE ENGINEERING

In [None]:
data = raw_data.copy()
data = data.set_index('datetime')
data.index = pd.to_datetime(data.index)

In [None]:
# TIME FEATURES
features = data.copy()
features['returns'] = features['close'].pct_change().fillna(1e-6)

# Extracting time components
data_hour = features.index.hour
data_day_of_week = features.index.dayofweek
data_day_of_month = features.index.day
data_month = features.index.month

# Converting to cyclic features using sine and cosine transformations
features['hour_sin'] = np.sin(2 * np.pi * data_hour / 24)
features['hour_cos'] = np.cos(2 * np.pi * data_hour / 24)
features['day_of_week_sin'] = np.sin(2 * np.pi * data_day_of_week / 7)
features['day_of_week_cos'] = np.cos(2 * np.pi * data_day_of_week / 7)
features['day_of_month_sin'] = np.sin(2 * np.pi * data_day_of_month / 31)
features['day_of_month_cos'] = np.cos(2 * np.pi * data_day_of_month / 31)
features['month_sin'] = np.sin(2 * np.pi * data_month / 12)
features['month_cos'] = np.cos(2 * np.pi * data_month / 12)

In [None]:
# INDICATORS, NORMALIZED
ATR = ta.atr(features['high'], features['low'], features['close'], 14)
EMA_FAST = ta.ema(features['close'], 7)
EMA_SLOW = ta.ema(features['close'], 35)
RSI = ta.rsi(features['close'], 14)

In [None]:
# INDICATOR : HAWKES PROCESS
returns_norm = features['returns']
atr_norm = (features['high'] - features['low'] ) / ATR
volume_change_norm = normalize_data(features['volume'].pct_change().fillna(1e-6))

# Apply the Hawkes process to the 'close' column
features['hawkes_returns'] = hawkes_process(returns_norm).fillna(0)
features['hawkes_volatility'] = hawkes_process(atr_norm).fillna(0)
features['hawkes_volume'] = hawkes_process(volume_change_norm).fillna(0)

In [None]:
# INDICATOR : SUPER OSCILLATOR
oscillator_results = super_oscillator(features)
features['suposc_oscillator'] = oscillator_results[0]
features['suposc_ama'] = oscillator_results[1]
features['suposc_histogram'] = oscillator_results[2]

In [None]:
# INDICATOR : DISTANCES FROM PIVOT POINTS
features_added = pivot_distances(features, 5, 5)
features = features_added.set_index('datetime', drop=True, append=False)
features.index = pd.to_datetime(features.index)

## PRE-STEP : MODEL TRAINING

In [None]:
# Feature Engineering
features['returns'] = features['close'].pct_change().fillna(0)

# Prepare the training data
train_daterange = pd.date_range('2001-01-01', '2021-12-31', freq='1h ')
train_df = features[features.index.isin(train_daterange)]
test_df = features[(features.index.year > train_daterange[-1].year)]
train_data = np.array(train_df['returns'])

In [None]:
# Parameters
n_pivots=3
n_clusters = 24
n_lookback=15
hold_period=3

miner = Miner(
    n_pivots=n_pivots,
    n_clusters=n_clusters,
    n_lookback=n_lookback,
    hold_period=hold_period,
    model_type='standard',
    reducer='Wavelet',
)

# Fit the model
miner.fit(train_data, np.array(train_df['close']))

In [None]:
features['cluster_labels'] = miner.transform(features['close']).astype(int)
train_df = features[features.index.isin(train_daterange)]
test_df = features[(features.index.year > train_daterange[-1].year)]

## STEP ONE : CLUSTER TESTING

In [147]:
labels = np.array(train_df['cluster_labels'])
rets = np.array(train_df['returns'])

for cluster in range(n_clusters):
    
    ret = []

    # Get the signal indices
    signals = labels == cluster
    signals = signals.astype(int)
    indices = np.where(signals == 1)[0]

    for index in indices:
        # Get the cummulative returns for the next [hold_period] bars
        ret = rets[index + 1 : index + hold_period + 2]
        print(np.sum(ret))
    


    # # Generate the signals

    # _signals : np.ndarray = miner.apply_holding_period(train_df['cluster_labels'], selected_labels=[cluster])
    # _signals = _signals != -1
    # _signals = _signals.astype(int)

    # for direction in [1, -1]:

    #     # Compute the returns
    #     _ret = train_df['returns'] * _signals * direction




0.019275205482102042
-0.028696715508337522
-0.006054585077562513
0.001814476177368185
-0.01668086963619797
0.004372693558125307
-0.023829005315409435
0.0067940868425836465
0.01734515612842591
-0.017830753065460447
0.015033431844134193
-0.005105554244321908
-0.006648822258325637
-0.04053907764414877
-0.008921848355881723
0.006329116884669639
0.00828149486628238
0.014136853144938843
0.014678095835265359
-0.002895915678524319
-0.0007224530691666287
-0.003400617702323583
0.0037903485760242273
-0.011398662530568315
-0.009620910345087497
0.009613334351973535
-0.010307100772141142
-0.006739350543317757
0.04305704452968406
-0.0074406476615055395
-0.012861153483566645
-0.009450253636300143
-0.006285692457316028
-0.006702509189974282
0.022227997155228274
-0.006356223996495269
-0.0031658316793404806
-0.0008864781593710402
-0.005504840274688894
-0.015199595955209588
0.004944535178308507
0.0020582812736023026
0.0008387265700560143
0.002133805670374911
-0.003906696454586411
0.008212076902671228
0.00

In [144]:
np.random.seed(14)

xx = np.random.rand(10)

xx, np.where(xx > 0.5)[0]

(array([0.51394334, 0.77316505, 0.87042769, 0.00804695, 0.30973593,
        0.95760374, 0.51311671, 0.31828442, 0.53919994, 0.22125494]),
 array([0, 1, 2, 5, 6, 8]))