# TARGET DETERMINATION FOR PIP MINER MODEL

This experiment is an extension of the `parameters` experiment. Given the range of parameters with a stable Martin Ratio:
- what cluster identity should be selected? How can we combine them into a strategy?
- what could be the exit strategy for the strategy?

In [6]:
# Import Necessary Libraries, Define the parameters
import logging
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_ta as ta  # noqa
import plotly.graph_objects as go  # noqa
import quantstats as qt
import seaborn as sns
import yfinance as yf
from quantminer import Miner

# Raise the threshold of the 'optuna' logger so only WARNING and above pass.
logger = logging.getLogger('optuna')
logger.setLevel(level=logging.WARNING)

# The `data` directory that lives under the parent of the working directory.
data_dir = Path.cwd().parent.joinpath('data')


### STEP 0 : DATA PREPARATION AND MODEL TRAINING
- Asset : EURUSD, 1-hour
- Parameter
  - n_pivots : 3; 4
  - n_clusters : 16; 15 
  - n_lookback : 8; 14
  - hold_period : 2, 3

In [1]:
from alpha_vantage.foreignexchange import ForeignExchange
import matplotlib.pyplot as plt
import pandas as pd


import os

# Read the Alpha Vantage API key from the environment instead of committing it
# to the notebook. A key that has been stored in source control should be
# treated as leaked and rotated.
api_key = os.environ.get('ALPHAVANTAGE_API_KEY')
if not api_key:
    raise RuntimeError(
        "Set the ALPHAVANTAGE_API_KEY environment variable before running this cell."
    )

# Set up the client. The index/column operations below need a DataFrame, so
# request pandas output rather than the client's default JSON dictionaries.
fx = ForeignExchange(key=api_key, output_format='pandas')

# Get EUR/USD data (interval can be 1min, 5min, 15min, 30min, or 60min)
# Alpha Vantage does not directly support 4-hour intervals, so the 1-hour bars are resampled below
data, _ = fx.get_currency_exchange_intraday('EUR', 'USD', interval='60min', outputsize='full')

# Resample the 1-hour data to 4-hour data
data.index = pd.to_datetime(data.index)
four_hour_data = data['4. close'].resample('4H').last()  # Use 'last' to get the closing price of each 4-hour period

# Save to CSV
four_hour_data.to_csv('EURUSD_4H_data.csv')

# Plotting the data
four_hour_data.plot(title='EURUSD 4-Hour Closing Prices')
plt.xlabel('Time')
plt.ylabel('Price')
plt.show()

ModuleNotFoundError: No module named 'alpha_vantage'

In [27]:
# YAHOO FINANCE
# ticker = "EURUSD=X"

# # Download the data
# raw_data = yf.download(ticker, start='2001-01-01', end='2021-12-31')
# raw_data.columns = raw_data.columns.str.lower()
# raw_data = raw_data[['open', 'high', 'low', 'close', 'volume']]


[*********************100%%**********************]  1 of 1 completed

(4690, 5)





In [None]:
# # Read Price Data
# data_path = data_dir / 'eur_h1.parquet'
# raw_data = pd.read_parquet(data_path)

In [28]:
# Clean the data: work on a copy of the raw frame with incomplete rows removed
data = raw_data.copy().dropna()

# Feature Engineering: change in close per bar, and the same change taken
# from the following bar (shifted back by one row)
_close_change = data['close'].diff().fillna(0)
data['returns'] = _close_change
data['returns+1'] = _close_change.shift(-1)

# Prepare the training data: keep the rows whose timestamp is one of the daily
# stamps in the training range, then extract the close prices as an array.
# NOTE(review): `isin` is an exact match against daily stamps, so intraday
# bars that are not at midnight would be excluded - confirm this is intended.
train_daterange = pd.date_range(start='2001-01-01', end='2021-12-31', freq='D')
train_df = data.loc[data.index.isin(train_daterange)]
train_data = np.array(train_df['close'])

In [29]:
# Parameters
n_pivots, n_clusters, n_lookback, hold_period = 3, 24, 15, 3

# Gather the constructor arguments in one place before building the miner
_miner_settings = {
    'n_pivots': n_pivots,
    'n_clusters': n_clusters,
    'n_lookback': n_lookback,
    'hold_period': hold_period,
    'model_type': 'sequential',
}
miner = Miner(**_miner_settings)

# Fit the model on the training close prices
miner.fit(train_data)

12.906359351657398

In [30]:
# Store the miner's transform of the close prices, cast to int, as a feature
_predicted = miner.transform(data['close'])
data['cluster_labels'] = _predicted.astype(int)

# Rebuild the training frame so that it carries the new column
train_df = data.loc[data.index.isin(train_daterange)]

In [31]:
# Fixed Parameters
fig_base = go.Figure()

# One cumulative-return trace per cluster label
for cluster_id in range(n_clusters):
    held = miner.apply_holding_period(data['cluster_labels'], selected_labels=[cluster_id])
    # 1 wherever the holding-period output is not -1, otherwise 0
    in_position = np.where(held != -1, 1, 0)
    equity = np.cumsum(data['returns'] * in_position)

    trace = go.Scatter(x=equity.index, y=equity, mode='lines', name=f' CLusters {cluster_id}')
    fig_base.add_trace(trace)

_layout = {
    'title': 'Cluster Returns Over Time',
    'xaxis_title': 'Time',
    'yaxis_title': 'Cumulative Returns',
    'legend_title': 'Clusters',
    'hovermode': 'closest',
}
fig_base.update_layout(**_layout)

fig_base.show()

### EXPERIMENT ONE : STRATEGY SELECTION
For this experiment, we would select the clusters that beat a benchmark (Buy-and-Hold)
- Profit Factor : 1
- Sharpe ratio : 
- Ulcer Performance Index : From base data
- Average Drawdown : From base data

#### PROCEDURE
1. Compute and store the returns array and martin ratio for each label/cluster, that meet the requirement (beat the benchmark; the Buy-Hold returns). Map each return to the label and direction.
2. Select the best label by Martin ratio.
3. Compute the drawdown correlation between the returns from the best label and the returns from the other labels. Select and store the returns whose correlation is below a threshold value (default = .4)
4. Combine the returns according to one of two strategies:
  - STRATEGY 1 : by precedence, in order of descending Martin ratio
  - STRATEGY 2 : concurrent returns are allowed

5. Test strategies on test_data

In [19]:
# Functions
def compute_drawdown_series(returns):
    """Return the fractional shortfall of ``returns + 1`` from its running maximum.

    The input is shifted up by one, the running maximum of the shifted values
    is tracked, and every element is reported as its distance below that
    maximum divided by the maximum.

    NOTE(review): the running maximum is taken over ``returns + 1`` itself,
    not over a cumulative or compounded curve - confirm callers pass the
    series they intend.
    """
    level = returns + 1
    running_peak = np.maximum.accumulate(level)
    return (running_peak - level) / running_peak

def drawdown_correlation(returns1, returns2):
    """Plot and return the drawdown correlation of two return streams.

    Two figures are shown: a heatmap of the 2x2 drawdown correlation matrix,
    and a line chart with the cumulative sum of each stream alongside two
    combined streams in which one stream's non-zero returns take precedence
    and the other stream fills the remaining positions.

    Args:
        returns1: Array-like of returns for the first strategy.
        returns2: Array-like of returns for the second strategy. It must have
            the same length as ``returns1`` because the two drawdown series
            are stacked column-wise.

    Returns:
        pandas.DataFrame: The correlation matrix of the two drawdown series,
        labelled 'Strategy 1' and 'Strategy 2'.
    """
    s1 = pd.Series(returns1)
    s2 = pd.Series(returns2)

    # Compute drawdown series for both return streams
    drawdowns1 = compute_drawdown_series(s1)
    drawdowns2 = compute_drawdown_series(s2)

    # Combine drawdowns into a DataFrame
    stacked = np.column_stack([drawdowns1, drawdowns2])
    df_drawdowns = pd.DataFrame(stacked, columns=['Strategy 1', 'Strategy 2'])

    # Calculate correlation matrix
    correlation_matrix = df_drawdowns.corr()

    # Plotting the correlation matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, fmt=".2f")
    plt.title("Drawdown Correlation Matrix")
    plt.show()

    # Combined strategies: blank out the zero returns of the preferred stream
    # so that `combine_first` fills those positions from the other stream.
    # `mask` is used instead of assigning NaN in place so that integer input
    # is upcast cleanly.
    combined_returns_s1 = s1.mask(s1 == 0).combine_first(s2)  # returns1 takes precedence over returns2
    combined_returns_s2 = s2.mask(s2 == 0).combine_first(s1)  # returns2 takes precedence over returns1
    cumulative_returns_1 = combined_returns_s1.cumsum()
    cumulative_returns_2 = combined_returns_s2.cumsum()

    # Label every curve so the legend identifies all four lines
    plt.plot(np.cumsum(returns1), label='Strategy 1')
    plt.plot(np.cumsum(returns2), label='Strategy 2')
    plt.plot(cumulative_returns_1, label='Combined 1-Pred')
    plt.plot(cumulative_returns_2, label='Combined 2-Pred')
    plt.legend()
    plt.show()

    return correlation_matrix

def drawdown_correlation_matrix(*args):
    """Correlate the drawdowns of labelled return streams and show a heatmap.

    Args:
        *args: One or more mappings of label -> returns, where each returns
            value is anything ``pd.Series`` accepts. If a label occurs more
            than once, the last occurrence is the one that is kept.

    Returns:
        pandas.DataFrame: Pairwise correlation of the drawdown series, indexed
        and labelled by the supplied labels.
    """
    # One drawdown series per label, gathered across every mapping passed in
    labelled_drawdowns = {
        label: compute_drawdown_series(pd.Series(returns))
        for return_dict in args
        for label, returns in return_dict.items()
    }

    # Columns are the labels; correlate them pairwise
    correlation_matrix = pd.DataFrame(labelled_drawdowns).corr()

    # Render the matrix as an annotated heatmap on a fixed [-1, 1] colour scale
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, fmt=".2f")
    plt.title("Drawdown Correlation Matrix")
    plt.show()

    return correlation_matrix

def x_matrix(*args, function=compute_drawdown_series):
    """Correlate a transform of labelled return streams and show a heatmap.

    Each returns value is wrapped in a ``pd.Series`` and passed through
    ``function``; the transformed series are then correlated pairwise.

    Args:
        *args: One or more mappings of label -> returns. If a label occurs
            more than once, the last occurrence is the one that is kept.
        function: Callable applied to every returns series before
            correlating. Defaults to ``compute_drawdown_series``.

    Returns:
        pandas.DataFrame: Pairwise correlation of the transformed series.
        The heatmap title reads "Drawdown Correlation Matrix" whichever
        ``function`` is supplied.
    """
    # Transformed series keyed by label, accumulated mapping by mapping
    transformed = {}
    for return_dict in args:
        transformed.update(
            (label, function(pd.Series(returns)))
            for label, returns in return_dict.items()
        )

    # Build the frame and compute the pairwise correlations
    frame = pd.DataFrame(transformed)
    correlation_matrix = frame.corr()

    # Plot the matrix as an annotated heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, fmt=".2f")
    plt.title("Drawdown Correlation Matrix")
    plt.show()

    return correlation_matrix

In [20]:
# Baseline Metrics: bar-to-bar change in close over the training frame
baseline_returns = train_df['close'].diff().fillna(0)

# Each threshold is floored (1 for profit factor and UPI, 0 for Sharpe)
_baseline_pf = qt.stats.profit_factor(baseline_returns)
_baseline_sharpe = qt.stats.sharpe(baseline_returns)
_baseline_upi = qt.stats.ulcer_performance_index(baseline_returns)

baseline_profit_factor = max(_baseline_pf, 1)
baseline_sharpe_ratio = max(_baseline_sharpe, 0)
baseline_upi = max(_baseline_upi, 1)
baseline_max_dd = qt.stats.max_drawdown(baseline_returns)  # not added yet

# Report the thresholds, one per line
for _metric_name, _metric_value in (
    ('baseline_profit_factor', baseline_profit_factor),
    ('baseline_sharpe_ratio', baseline_sharpe_ratio),
    ('baseline_upi', baseline_upi),
    ('baseline_max_dd', baseline_max_dd),
):
    print(f"{_metric_name} : {_metric_value}")

baseline_profit_factor : 1
baseline_sharpe_ratio : 0
baseline_upi : 1
baseline_max_dd : -0.3915354898763782


In [21]:
# Filter clusters per performance
_result_fields = ('name', 'label', 'direction', 'profit_factor', 'sharpe_ratio', 'upi', 'max_dd')
cluster_results = {field: [] for field in _result_fields}

# Return streams of the models that pass the filter, keyed by model name
returns_long = {}
returns_short = {}

# For each cluster label
for cluster_label in range(n_clusters):
    held = miner.apply_holding_period(train_df['cluster_labels'], selected_labels=[cluster_label])
    # 1 wherever the holding-period output is not -1, otherwise 0
    in_position = (held != -1).astype(int)

    # Test the returns as a Long and Short model
    for side, direction in (('long', 1), ('short', -1)):
        model_name = f"label_{cluster_label}_{side}"

        # Compute the returns
        side_returns = train_df['returns'] * in_position * direction

        # Compute the kpis
        # NOTE(review): sharpe is a rolling mean and max_dd is a mean drawdown,
        # whereas the baseline uses `sharpe` and `max_drawdown` - confirm the
        # comparison is like-for-like.
        kpis = {
            'profit_factor': qt.stats.profit_factor(side_returns),
            'sharpe_ratio': qt.stats.rolling_sharpe(side_returns).mean(),
            'upi': qt.stats.ulcer_performance_index(side_returns),
            'max_dd': qt.stats.to_drawdown_series(side_returns).mean(),
        }

        # Keep only models that beat every baseline threshold
        beats_baseline = (
            kpis['profit_factor'] > baseline_profit_factor
            and kpis['sharpe_ratio'] > baseline_sharpe_ratio
            and kpis['upi'] > baseline_upi
            and kpis['max_dd'] > baseline_max_dd
        )
        if not beats_baseline:
            continue

        # Append results
        row = {'name': model_name, 'label': cluster_label, 'direction': direction, **kpis}
        for field, value in row.items():
            cluster_results[field].append(value)

        bucket = returns_long if direction > 0 else returns_short
        bucket[model_name] = side_returns

# Convert cluster_results into a dataframe
cluster_performance = pd.DataFrame(cluster_results)
returns_long.keys(), returns_short.keys()
        

(dict_keys(['label_1_long', 'label_2_long', 'label_7_long', 'label_16_long', 'label_19_long', 'label_23_long']),
 dict_keys(['label_4_short', 'label_9_short', 'label_17_short', 'label_18_short']))

In [22]:
# Select the best label by Martin ratio (the `upi` column). Sort in descending
# order so the first row is the highest-scoring model; the default ascending
# sort would return the lowest-scoring one instead.
best_label = cluster_performance.sort_values('upi', ascending=False).iloc[0]['name']

In [23]:
# train_data = data[(data.index.year >= train_daterange_start) & (data.index.year <= train_daterange_end)]
# test_data = data[(data.index.year > train_daterange_end)]

In [24]:
# fig = go.Figure()

# for cluster_index in range(n_clusters):
#     cluster_backtest = train_data.loc[train_data['cluster_labels'] == cluster_index, 'returns+1']
#     cumsum_backtest = np.cumsum(cluster_backtest)
#     fig.add_trace(go.Scatter(x=cumsum_backtest.index, y=cumsum_backtest, mode='lines', name=f'Cluster {cluster_index}'))

# fig.update_layout(title='Cluster Returns Over Time',
#                   xaxis_title='Time',
#                   yaxis_title='Cumulative Returns',
#                   legend_title='Clusters',
#                   hovermode='closest',
#                   height=600)

# fig.show()

In [25]:
# for _ in range(-1, n_clusters):
#     backtest_insample = train_data.loc[train_data['cluster_labels'] == _, 'returns+1']
#     backtest_outsample = test_data.loc[test_data['cluster_labels'] == _, 'returns+1']

#     print(F"\n\n----- CLUSTER {_} -----")
#     print(f"IN-SAMPLE :\n\tLONG :{qt.stats.sharpe(backtest_insample)}\n\tSHORT :{qt.stats.sharpe(backtest_insample * -1)}")
#     print(f"OUT-OF-SAMPLE :\n\tLONG :{qt.stats.sharpe(backtest_outsample)}\n\tSHORT :{qt.stats.sharpe(backtest_outsample * -1)}")


In [26]:
# 78520702f91b371079341dd3fb343d534ab0ec9c
# concluded  the `pipminer/parameters` experiment; made progress on the `pipminer/targets/strategyselection` experiment

# 
