In [1]:
import numpy as np
import pandas as pd
from typing import Dict, Union, Tuple, List, Any, Optional
import plotly.graph_objects as go


In [2]:
import pickle
# NOTE(security): pickle.load can execute arbitrary code while deserializing, so only
# load pickle files that come from a trusted source.
# NOTE(review): the path looks like a Google Drive mount inside Colab; adjust
# TICKER_DATA_PATH when running anywhere else.
TICKER_DATA_PATH = '/content/drive/MyDrive/Teaching/Math 628 - Fall 2025/ticker_data_dict.pkl'
with open(TICKER_DATA_PATH, 'rb') as file:
    loaded_dict = pickle.load(file)


In [3]:
# Select the entry stored under the 'AAPL' key; later cells read its 'Close' column
# and merge the labels into it.
appl = loaded_dict['AAPL']

# Labeling (using RiskLabAI library)

## Triple Barrier Labeling

The **Triple Barrier Method** is used to label financial time-series data for supervised learning. It extends **fixed-time horizon labeling** by incorporating **profit-taking and stop-loss levels**, making it more robust.

### Definition

For each event $t_0$, we define three barriers:

1. **Upper Barrier (Profit-Taking)**: $P_t \geq P_{t_0} (1 + u)$
2. **Lower Barrier (Stop-Loss)**: $P_t \leq P_{t_0} (1 - l)$
3. **Vertical Barrier (Time Horizon)**: $t = t_0 + T$

where:
- $P_t$ is the asset price at time $t$.
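- $u$ and $l$ are the **profit-taking** and **stop-loss** thresholds. In the code below they are set from a volatility estimate and compared against log returns, so the barriers sit at $P_{t_0} e^{u}$ and $P_{t_0} e^{-l}$, which is approximately $P_{t_0}(1 \pm u)$ for small thresholds.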
- $T$ is the **maximum holding period**.

### Labeling

For each event $t_0$, we find the first time $t^*$ at which any one of the following conditions holds:

$$ P_{t^*} \geq P_{t_0} (1 + u) \quad \text{(Upper Barrier Hit)} $$
$$ P_{t^*} \leq P_{t_0} (1 - l) \quad \text{(Lower Barrier Hit)} $$
$$ t^* = t_0 + T \quad \text{(Vertical Barrier Hit)} $$

The label is assigned as:
- **Long Signal ($+1$):** If the **upper** barrier is hit first.
- **Short Signal ($-1$):** If the **lower** barrier is hit first.
- **No Signal ($0$):** If the **vertical** barrier is hit first.

### Return Calculation

The return at $t^*$ is:

$$ r = \frac{P_{t^*} - P_{t_0}}{P_{t_0}} $$



In [4]:
def cusum_filter_events_dynamic_threshold(
        prices: pd.Series,
        threshold: pd.Series
) -> pd.DatetimeIndex:
    """
    Flag the timestamps at which a symmetric CUSUM filter breaches a time-varying threshold.

    Two running sums of the first differences of ``prices`` are kept: an upward sum floored at
    zero and a downward sum capped at zero. A timestamp becomes an event when the downward sum
    falls below ``-threshold`` or, failing that, when the upward sum rises above ``threshold``.
    Only the sum that triggered the event is reset to zero.

    :param prices: A pandas Series of prices.
    :param threshold: A pandas Series of thresholds; only timestamps shared with the differenced
        prices are processed.
    :return: A pandas DatetimeIndex containing timestamps of detected events.

    References:
    - De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons. (Methodology: 39)
    """
    event_times = []
    up_sum, down_sum = 0, 0

    increments = prices.diff().dropna()
    increments, limits = increments.align(threshold.copy(), join="inner", copy=False)

    increment_by_time = increments.to_dict()
    limit_values = limits.to_dict().values()
    for (stamp, step), limit in zip(increment_by_time.items(), limit_values):
        up_sum = max(0, up_sum + step)
        down_sum = min(0, down_sum + step)

        breached_down = down_sum < -limit
        breached_up = up_sum > limit
        if not (breached_down or breached_up):
            continue

        # A downward breach takes precedence; the other running sum is left untouched.
        if breached_down:
            down_sum = 0
        else:
            up_sum = 0
        event_times.append(stamp)

    return pd.DatetimeIndex(event_times)

In [5]:
def daily_volatility_with_log_returns(
        close: pd.Series,
        span: int = 100
) -> pd.Series:
    """
    Estimate volatility as the exponentially weighted standard deviation of lagged log returns.

    For every timestamp the position of "timestamp minus one day" is located in the index with
    ``searchsorted`` and then stepped back by one; timestamps without such an earlier
    observation are dropped. The log return between the current close and that earlier close is
    then smoothed with an EWMA standard deviation.

    :param close: A pandas Series of close prices indexed by timestamp.
    :param span: The span parameter for the Exponentially Weighted Moving Average (EWMA).
    :return: A pandas Series named "std" containing the volatility estimates.

    References:
    - De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons. (Methodology: Page 44)
    """
    one_day = pd.Timedelta(days=1)
    prior_positions = close.index.searchsorted(close.index - one_day)
    prior_positions = prior_positions[prior_positions > 0]

    # The timestamps that keep a match are the trailing ones of the index.
    n_matched = prior_positions.shape[0]
    current_times = close.index[close.shape[0] - n_matched:]
    prior_times = close.index[prior_positions - 1]

    current_prices = close.loc[current_times]
    prior_prices = close.loc[prior_times.values].values
    log_returns = np.log(current_prices / prior_prices)

    return log_returns.ewm(span=span).std().rename("std")

In [6]:
def vertical_barrier(
    close: pd.Series,
    time_events: pd.DatetimeIndex,
    number_days: int
) -> pd.Series:
    """
    Build the vertical (time-limit) barrier for each event.

    Each event is shifted forward by ``number_days`` calendar days and mapped to the first
    timestamp of ``close`` at or after that date. Shifted dates that fall beyond the last
    timestamp of ``close`` are discarded, and the result keeps the leading events only.

    :param close: A series of prices indexed by timestamp.
    :param time_events: A vector of event timestamps.
    :param number_days: A number of days for the vertical barrier.
    :return: A pandas series, indexed by event timestamp, with the timestamps of the vertical barriers.
    """
    horizon = pd.Timedelta(days=number_days)
    positions = close.index.searchsorted(time_events + horizon)

    within_data = positions < close.shape[0]
    positions = positions[within_data]

    kept_events = time_events[:positions.shape[0]]
    return pd.Series(close.index[positions], index=kept_events)


In [7]:
def triple_barrier(
    close: pd.Series,
    events: pd.DataFrame,
    profit_taking_stop_loss: List[float],
    molecule: list
) -> pd.DataFrame:
    """
    Find, for each event, the first time the stop-loss and profit-taking barriers are touched.

    For every event in ``molecule`` the price path from the event start to its 'End Time'
    (or to the last timestamp of ``close`` when 'End Time' is missing) is converted to log
    returns relative to the starting price and multiplied by the event's 'Side'. The first
    timestamp at which that path is below the stop-loss level, and the first at which it is
    above the profit-taking level, are recorded.

    :param close: Series of prices indexed by timestamp.
    :param events: DataFrame indexed by event start time with the columns 'End Time',
        'Base Width' and 'Side'.
    :param profit_taking_stop_loss: Two multipliers of 'Base Width': index 0 sets the
        profit-taking barrier and index 1 the stop-loss barrier. A non-positive multiplier
        disables the corresponding barrier.
    :param molecule: Event start times to process; entries missing from ``events.index`` are ignored.
    :return: DataFrame indexed by the processed events with the columns 'End Time',
        'stop_loss' and 'profit_taking' (NaT where a barrier was never touched). All three
        columns are present even when no event is processed.
    """
    # Keep only the timestamps that actually exist in events.
    molecule = [m for m in molecule if m in events.index]
    events_filtered = events.loc[molecule]
    output = events_filtered[['End Time']].copy(deep=True)

    profit_multiple, stop_multiple = profit_taking_stop_loss[0], profit_taking_stop_loss[1]

    # A disabled barrier is an all-NaN level: comparisons against NaN are always False,
    # so that barrier is never reported as touched.
    if profit_multiple > 0:
        profit_taking = profit_multiple * events_filtered['Base Width']
    else:
        profit_taking = pd.Series(np.nan, index=events_filtered.index, dtype=float)

    if stop_multiple > 0:
        stop_loss = -stop_multiple * events_filtered['Base Width']
    else:
        stop_loss = pd.Series(np.nan, index=events_filtered.index, dtype=float)

    # Collect the touch times and assign each column once, so the columns always exist
    # and their dtype does not depend on the first value written.
    stop_touches, profit_touches = [], []
    end_times = events_filtered['End Time'].fillna(close.index[-1])
    for location, timestamp in end_times.items():
        path = close.loc[location:timestamp]
        path_returns = np.log(path / close.loc[location]) * events_filtered.at[location, 'Side']
        stop_touches.append(path_returns[path_returns < stop_loss[location]].index.min())
        profit_touches.append(path_returns[path_returns > profit_taking[location]].index.min())

    output['stop_loss'] = stop_touches
    output['profit_taking'] = profit_touches

    return output


In [8]:
def meta_events(
    close: pd.Series,
    time_events: pd.DatetimeIndex,
    ptsl: List[float],
    target: pd.Series,
    return_min: float,
    num_threads: int,
    timestamp: Union[pd.Series, bool, None] = False,
    side: Optional[pd.Series] = None
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Build the events table and resolve the first barrier touched by each event.

    :param close: Series of prices indexed by timestamp.
    :param time_events: Timestamps of the candidate events.
    :param ptsl: Barrier width multipliers. Without ``side`` only ``ptsl[0]`` is used, for both
        barriers; with ``side`` the first two entries are used as (profit-taking, stop-loss).
    :param target: Series of barrier base widths; it is restricted to ``time_events`` and to
        values strictly above ``return_min``.
    :param return_min: Minimum target for an event to be kept.
    :param num_threads: Number of chunks ``time_events`` is split into. The chunks are processed
        one after another with ``map``; no threads are started here.
    :param timestamp: Series of vertical-barrier timestamps indexed by event time. ``False``
        (the default) or ``None`` means no vertical barrier.
    :param side: Optional Series with the side of each position; when omitted every event gets
        side 1 and the 'Side' column is dropped from the result.
    :return: A tuple ``(events, touches)``. ``events`` holds the columns 'End Time' (earliest
        barrier touch), 'Base Width', 'target', 'timestamp' and, when ``side`` is given, 'Side'.
        ``touches`` is the concatenated output of ``triple_barrier``.
    """
    # Filter target by time_events and return_min
    target = target.loc[time_events]
    target = target[target > return_min]

    # None is accepted next to the historical False sentinel, matching how `side` is handled.
    if timestamp is False or timestamp is None:
        timestamp = pd.Series(pd.NaT, index=time_events)
    else:
        timestamp = timestamp.loc[time_events]

    if side is None:
        # Without a side the barriers are symmetric: the first multiplier is used twice.
        side_position, profit_loss = pd.Series(1., index=target.index), [ptsl[0], ptsl[0]]
    else:
        side_position, profit_loss = side.loc[target.index], ptsl[:2]

    # 'target' and 'timestamp' duplicate 'Base Width' and the initial 'End Time' so that they
    # remain available after 'End Time' is overwritten below.
    events = pd.concat(
        {
            'End Time': timestamp,
            'Base Width': target,
            'Side': side_position,
            'target': target,
            'timestamp': timestamp,
        },
        axis=1,
    ).dropna(subset=['Base Width'])

    df0 = list(map(
        triple_barrier,
        [close] * num_threads,
        [events] * num_threads,
        [profit_loss] * num_threads,
        np.array_split(time_events, num_threads)
    ))
    df0 = pd.concat(df0, axis=0)

    # The earliest of the vertical, stop-loss and profit-taking times ends the event.
    events['End Time'] = df0.dropna(how='all').min(axis=1)

    if side is None:
        events = events.drop('Side', axis=1)

    # Return events including the 'target' and 'timestamp' columns
    return events, df0


In [9]:
def meta_labeling(
    events: pd.DataFrame,
    close: pd.Series
) -> pd.DataFrame:
    """
    Compute the realized return and the label of every event that has an end time.

    The return is measured from the event start to its 'End Time'. The label is the sign of
    that return, set to 0 when 'End Time' equals the 'timestamp' column. When a 'Side' column
    is present the return is multiplied by the side, labels of non-positive returns are set
    to 0, and the side is copied to the output.

    :param events: DataFrame indexed by event start with the columns 'End Time' and 'timestamp'
        and, optionally, 'Side'.
    :param close: Series of close prices with date indices.
    :return: DataFrame with the columns 'End Time', 'Return of Label', 'Label' and, when
        available, 'Side'.

    Reference:
    De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons.
    Methodology: 51
    """
    resolved = events.dropna(subset=['End Time'])
    start_times = resolved.index
    end_times = resolved['End Time'].values

    # Prices are needed at every start and end time; gaps are back-filled from the next price.
    needed_dates = start_times.union(end_times).drop_duplicates()
    prices_at = close.reindex(needed_dates, method='bfill')
    exit_prices = prices_at.loc[end_times].values
    entry_prices = prices_at.loc[start_times]

    out = pd.DataFrame(index=start_times)
    out['End Time'] = events['End Time']
    out['Return of Label'] = exit_prices / entry_prices - 1

    has_side = 'Side' in resolved
    if has_side:
        out['Return of Label'] *= resolved['Side']

    ended_at_timestamp = events['End Time'] == events['timestamp']
    out['Label'] = np.sign(out['Return of Label']) * (1 - ended_at_timestamp)

    if has_side:
        out.loc[out['Return of Label'] <= 0, 'Label'] = 0
        out['Side'] = resolved['Side']
    return out


In [35]:
# Event detection on AAPL closes: a 30-span EWMA volatility scales the CUSUM threshold
# (1.5 x volatility, applied to log prices), and every detected event gets a vertical
# barrier 20 calendar days ahead.
prices = appl['Close']
volatility = daily_volatility_with_log_returns(close=prices, span=30)
filter_threshold = 1.5
moelcules = cusum_filter_events_dynamic_threshold(
    prices=np.log(prices),
    threshold=filter_threshold * volatility,
)
vertical_barriers = vertical_barrier(close=prices, time_events=moelcules, number_days=20)

In [38]:
# Resolve the first barrier touched by each event (symmetric barriers of one volatility
# unit, no minimum target, a single chunk), then turn the outcome into returns and labels.
# `tt` holds the per-barrier touch times returned next to the events table.
triple_barrier_events, tt = meta_events(
    close=prices,
    time_events=vertical_barriers.index,
    ptsl=[1, 1],
    target=volatility,
    return_min=0,
    num_threads=1,
    timestamp=vertical_barriers,
)
labels = meta_labeling(events=triple_barrier_events, close=prices)

In [None]:
# The labeling code compares log returns against +/- target, so in price space the
# horizontal barriers sit at P0 * exp(+target) and P0 * exp(-target).
pt, sl = triple_barrier_events['target'], triple_barrier_events['target']
event_prices = prices[triple_barrier_events.index]
upperbound, lowerbound = event_prices * np.exp(pt), event_prices * np.exp(-sl)


def add_triple_barrier_box(fig: go.Figure, i: int) -> None:
    """
    Draw the barrier box of the i-th event on ``fig`` and mark its first barrier touch.

    The box spans from the event start to its vertical barrier and from the lower to the
    upper price barrier; the marker sits at the event's 'End Time' and the price observed there.

    :param fig: Figure to draw on; it is modified in place.
    :param i: Position of the event in ``triple_barrier_events``.
    """
    begining = triple_barrier_events.index[i]
    ending = vertical_barriers.loc[triple_barrier_events.index].iloc[i]
    firsttouch = triple_barrier_events['End Time'].iloc[i]
    pricetouch = prices.loc[firsttouch]
    u, l = upperbound.iloc[i], lowerbound.iloc[i]

    # Left edge, upper barrier, lower barrier, right edge (vertical barrier).
    box_edges = [
        (begining, u, begining, l),
        (begining, u, ending, u),
        (begining, l, ending, l),
        (ending, u, ending, l),
    ]
    for x0, y0, x1, y1 in box_edges:
        fig.add_shape(type="line", x0=x0, y0=y0, x1=x1, y1=y1, line=dict(color="red", width=2))
    fig.add_trace(go.Scatter(x=[firsttouch], y=[pricetouch], marker=dict(size=[10], color=['red']), name="First Touch"))


# Price path: only the first 50 observations are shown.
fig = go.Figure(data=[go.Scatter(x=prices.index[0:50], y=prices.values[0:50])])
for i in (1, 2):
    add_triple_barrier_box(fig, i)

fig.show()

In [43]:
# Attach the labels to the feature table; only dates present in both indexes are kept.
apple_new = appl.merge(labels, left_index=True, right_index=True)

In [None]:
# Show the merged feature/label table as the cell output.
apple_new

Unnamed: 0,Close,High,Low,Open,Volume,Ticker,Sector,SMA_20,SMA_50,EMA_20,...,MACD,MACD_Signal,Return,Lag_Return_1,Lag_Return_2,Lag_Return_3,Tomorrow_Return,End Time,Return of Label,Label
2005-05-12,1.028224,1.072208,1.024307,1.067087,970242000,AAPL,Information Technology,1.090300,1.190317,1.111300,...,-0.033615,-0.034476,-0.041561,-0.022240,-0.014877,-0.007251,0.018752,2005-05-16,0.041605,1.0
2005-05-18,1.079740,1.131559,1.054133,1.067991,636722800,AAPL,Information Technology,1.087212,1.174435,1.096687,...,-0.029792,-0.033021,0.013575,-0.005345,0.022433,0.018752,0.047712,2005-05-19,0.047712,1.0
2005-05-23,1.197837,1.202055,1.140295,1.140295,1042574400,AAPL,Information Technology,1.095045,1.171669,1.111994,...,-0.008713,-0.024586,0.058855,0.000000,0.047712,0.013575,-0.001509,2005-06-03,-0.038229,-1.0
2005-06-03,1.152045,1.162288,1.137886,1.149635,956869200,AAPL,Information Technology,1.138593,1.161046,1.160114,...,0.018225,0.009950,-0.044955,-0.006452,0.013582,-0.019724,-0.008368,2005-06-07,-0.044456,-1.0
2005-06-10,1.078837,1.126738,1.070100,1.126738,678932800,AAPL,Information Technology,1.145326,1.145133,1.141105,...,-0.003937,0.005172,-0.048871,0.019773,0.010399,-0.036393,0.002513,2005-06-15,0.036861,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-16,251.039993,251.380005,247.649994,247.990005,51694800,AAPL,Information Technology,239.157499,232.705213,240.924402,...,5.427631,4.436547,0.011728,0.000686,0.005964,-0.005166,0.009720,2024-12-18,-0.011910,-1.0
2024-12-18,248.050003,254.279999,247.740005,252.160004,56774100,AAPL,Information Technology,241.418999,233.796448,242.684918,...,5.498339,4.858558,-0.021422,0.009720,0.011728,0.000686,0.007015,2024-12-20,0.025963,1.0
2024-12-20,254.490005,255.000000,245.690002,248.039993,147495300,AAPL,Information Technology,243.756998,234.720528,244.421441,...,5.600482,5.090335,0.018816,0.007015,-0.021422,0.009720,0.003065,2024-12-24,0.014578,1.0
2024-12-30,252.199997,253.500000,250.750000,252.229996,35557500,AAPL,Information Technology,249.267999,237.218951,248.926981,...,5.675828,5.688294,-0.013263,-0.013242,0.003176,0.011478,-0.007058,2025-01-02,-0.033109,-1.0


## **Bayes' Rule for Feature Importance in Label Prediction**

### **1. Bayesian Formulation**

Given a dataset with labels $l$ and features $\theta$, we aim to measure the importance of a feature in predicting the label. We start with Bayes’ rule:

$$
P(l | \theta) = \frac{P(\theta | l) P(l)}{P(\theta)}
$$

Since $ P(\theta) $ does not depend on the label and serves as a normalizing constant, we focus on:

$$
P(l | \theta) \propto P(\theta | l) P(l)
$$

This formulation tells us that to estimate the probability of a label given a feature, we should study the **conditional distribution** $ P(\theta | l) $.

---

### **2. Feature Importance Interpretation**

A feature is **important** for label prediction if the distribution $ P(\theta | l) $ significantly varies across different labels. More specifically:

- If $ P(\theta | l) $ is well-separated for different values of $ l $, then $ \theta $ is **highly informative** for label prediction.
- If $ P(\theta | l)$ has significant **overlap** across different labels, then $ \theta $ does not provide strong predictive power.


**Boxplots** give a quick empirical view of $P(\theta | l)$:
   - They show the spread of $\theta$ for each label.
   - A large **separation between the interquartile ranges** of different labels indicates strong feature importance.



In [None]:
# prompt: for apple_new plot box plot .. in this way that for each label plot distribution of indicator features and Return

import plotly.graph_objects as go
import pandas as pd

def plot_indicator_distributions(df: pd.DataFrame, indicator_columns: list) -> None:
    """
    Show one box-plot figure per indicator column, with one box per non-zero label.

    :param df: DataFrame with a 'Label' column and the indicator columns to plot.
    :param indicator_columns: Names of the columns to plot; each one gets its own figure.
    """
    for col in indicator_columns:
        # Rows labeled 0 are left out; every other distinct label gets a box.
        boxes = [
            go.Box(y=df[df['Label'] == label][col], name=label)
            for label in df['Label'].unique()
            if label != 0
        ]
        fig = go.Figure(data=boxes)
        fig.update_layout(
            title=f'Distribution of Indicators  {col}',
            yaxis_title='Indicator Value',
            boxmode='group',  # group together boxes of the same columns
        )
        fig.show()


# Example usage: the columns of apple_new whose per-label distributions are compared.
# NOTE(review): each name must be a column of apple_new, otherwise the column lookup in
# plot_indicator_distributions raises KeyError — confirm that 'RSI' exists.
indicator_cols = ['MACD_Signal', 'Return', 'SMA_20', 'SMA_50' , 'RSI'] # example indicator columns; replace with your actual columns
plot_indicator_distributions(apple_new, indicator_cols)