In [None]:
#Notebook description:

# This notebook is used to evaluate the market of assets and find potential assets to invest in
# with the best risk/reward characteristics. This notebook is intended to analyze a broad group of market assets
# and does NOT focus on any particular asset. Computing data for a large number of assets is computationally expensive, and thus
# the analysis is relegated to other notebooks.

In [None]:
# TODO

# Move all the code that doesn't need to be defined as custom functions in this notebook into the Quantapp library
#

In [None]:
#Load libraries
import logging
logger = logging.getLogger('yfinance')
logger.disabled = True
logger.propagate = False
# Load libraries
from Quantapp.Plotter import Plotter
from Quantapp.Computation import Computation
from Quantapp.EconomicData import EconomicData

import numpy as np
import json
import os
import pandas as pd
import yfinance as yf
from statsmodels.tsa.stattools import coint
from IPython.display import display
from concurrent.futures import ThreadPoolExecutor
from plotly.subplots import make_subplots
from datetime import datetime
import statsmodels.api as sm
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp
import plotly.graph_objects as go
import pandas as pd
import holidays
import plotly.express as px
import concurrent.futures
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this also hides pandas / yfinance deprecation warnings — consider a narrower filter.
import warnings
warnings.filterwarnings("ignore")

# Look-back windows passed as `time_frame` to create_and_concat_spreads in the computation cell.
# Presumably counted in rows of daily data — TODO confirm against Quantapp.Computation.create_spreads.
time_frame_week = 7
time_frame_short = 21
time_frame_mid   = 50
time_frame_long = 200
# Bar size and history length; used as the defaults of generate_series and in the yf.download calls.
interval = '1d'
period     = '10y'

# NOTE(review): this value is already a DAILY rate, but compute_rolling_sortino_ratios divides its
# `risk_free_rate` argument by 252 itself, so passing this value there would scale it twice.
# The data-loading cell also rebinds this name to the '^IRX' download (a DataFrame).
risk_free_rate = 0.02 / 252  # Annualized risk-free rate divided by trading days
benchmark = 'SPY'  # column of etf_prices that is used as the benchmark series

# Shared instances of the project-local Quantapp helpers used by the cells below.
qc = Computation()
qp = Plotter()
qe = EconomicData()

                

In [None]:
#Load custom functions

def create_sortino_negative_indicators(sortino_diff_50, sortino_diff_200):
    """
    Flags, per ticker, whether the most recent Sortino difference is positive.

    Only the last row of each input is inspected. The output column names label the
    inputs as "Benchmark - asset" differences, so True means the benchmark's ratio is
    currently above the asset's (plot_combined_table renders True as 'Underperforming').

    Parameters:
        sortino_diff_50 (pd.DataFrame): 50-day Sortino differences, tickers as columns.
        sortino_diff_200 (pd.DataFrame): 200-day Sortino differences, tickers as columns.

    Returns:
        pd.DataFrame: One row per ticker with the columns 'Ticker',
                      'Relative performance: 50 Day Sortino (Benchmark - asset)' and
                      'Relative performance: 200 Day Sortino (Benchmark - asset)';
                      the two performance columns are True when the latest difference is > 0.
    """
    output_columns = [
        'Ticker',
        'Relative performance: 50 Day Sortino (Benchmark - asset)',
        'Relative performance: 200 Day Sortino (Benchmark - asset)',
    ]

    # Compare the final row of each window against zero; tickers become the index
    flags = pd.DataFrame({
        '50_Day': sortino_diff_50.iloc[-1] > 0,
        '200_Day': sortino_diff_200.iloc[-1] > 0,
    })

    # Move the ticker index into a regular column and apply the display names
    result = flags.reset_index()
    result.columns = output_columns
    return result

def calculate_z_scores(rolling_sortino_ratio):
    """
    Z-score of the last row of a rolling Sortino frame against each column's full history.

    Parameters:
        rolling_sortino_ratio (pd.DataFrame): Rolling Sortino ratios, one column per ticker.

    Returns:
        pd.Series: (latest value - column mean) / column standard deviation, indexed by ticker.
    """
    most_recent = rolling_sortino_ratio.iloc[-1]
    column_mean = rolling_sortino_ratio.mean()
    column_std = rolling_sortino_ratio.std()
    return most_recent.sub(column_mean).div(column_std)

def categorize_z_score(z):
    """
    Maps a z-score onto an integer bucket between -3 and 3.

    Buckets (lower bound exclusive, upper bound inclusive):
      3  => z > 3
      2  => 2 < z <= 3
      1  => 1 < z <= 2
      0  => -1 < z <= 1
     -1  => -2 < z <= -1
     -2  => -3 < z <= -2
     -3  => z <= -3

    A NaN input fails every comparison and therefore also lands in the -3 bucket.

    Parameters:
        z (float): The z-score to categorize.

    Returns:
        int: The bucket for `z`.
    """
    # (exclusive lower bound, bucket) pairs, checked from the highest bucket down
    bucket_floors = ((3, 3), (2, 2), (1, 1), (-1, 0), (-2, -1), (-3, -2))
    for lower_bound, bucket in bucket_floors:
        if z > lower_bound:
            return bucket
    return -3

def create_sortino_std_deviation_table(rolling_sortino_ratio):
    """
    Builds a per-ticker table of integer z-score buckets for the latest Sortino ratio.

    The z-scores come from calculate_z_scores (latest row versus each column's own mean
    and standard deviation) and are bucketed with categorize_z_score.

    Parameters:
        rolling_sortino_ratio (pd.DataFrame): Rolling Sortino ratios, one column per ticker.

    Returns:
        pd.DataFrame: Columns 'Ticker' and 'Std Dev Direction' (integer bucket from -3 to 3).
    """
    bucket_per_ticker = calculate_z_scores(rolling_sortino_ratio).apply(categorize_z_score)
    return pd.DataFrame({
        'Ticker': rolling_sortino_ratio.columns.tolist(),
        'Std Dev Direction': bucket_per_ticker.tolist(),
    })

def create_price_std_deviation_table(price_data, window_sizes=(21, 50, 200)):
    """
    Creates a DataFrame indicating how many standard deviations the latest price is
    above or below its rolling mean over specified window sizes, using integer buckets.

    The z-score of the latest price is bucketed with categorize_z_score:
      3  => z > 3
      2  => 2 < z <= 3
      1  => 1 < z <= 2
      0  => -1 < z <= 1
     -1  => -2 < z <= -1
     -2  => -3 < z <= -2
     -3  => z <= -3

    Parameters:
        price_data (pd.DataFrame): Prices with one column per ticker.
        window_sizes (iterable of int): Rolling window lengths to evaluate.

    Returns:
        pd.DataFrame: A 'Ticker' column plus one 'Std Dev Direction for {window}_Day Price'
                      column per window. Cells hold the integer bucket, or the string
                      'Insufficient Data' when the ticker has fewer than `window` non-NaN
                      prices or the latest rolling standard deviation is zero / NaN.
    """
    deviation_data = {'Ticker': price_data.columns.tolist()}

    # Drop missing prices once per ticker instead of once per (window, ticker) pair
    cleaned_prices = [price_data[ticker].dropna() for ticker in price_data.columns]

    for window in window_sizes:
        categories = []
        for ticker_prices in cleaned_prices:
            category = 'Insufficient Data'
            if len(ticker_prices) >= window:
                rolling_window = ticker_prices.rolling(window=window)
                latest_mean = rolling_window.mean().iloc[-1]
                latest_std = rolling_window.std().iloc[-1]

                # A flat or undefined spread cannot be turned into a z-score
                if not (pd.isna(latest_std) or latest_std == 0):
                    z_score = (ticker_prices.iloc[-1] - latest_mean) / latest_std
                    # Use the shared bucket helper so both tables use identical thresholds
                    category = categorize_z_score(z_score)
            categories.append(category)
        deviation_data[f'Std Dev Direction for {window}_Day Price'] = categories

    return pd.DataFrame(deviation_data)

def plot_combined_table(df, title='Combined Sortino Indicators'):
    """
    Plots a combined table with indicators for Sortino differences and standard deviation categories,
    highlighting the direction of deviations (positive or negative).

    Column handling:
      * 'Ticker'                          -> light grey cells.
      * names containing
        'Relative performance'            -> truthy cells are shown as 'Underperforming' on light green,
                                             falsy cells as 'Overperforming' on white.
      * every other column                -> treated as a deviation category and colored per the legend.
                                             Both the string labels ('>+3 SD', '+2-3 SD', ...) and the
                                             integer buckets (-3..3) produced by the deviation-table
                                             helpers in this notebook are recognised; anything else
                                             (e.g. 'Insufficient Data', NaN) stays white.

    Parameters:
        df (pd.DataFrame): Combined DataFrame with 'Ticker', Sortino differences, and deviation categories.
        title (str): Title of the table.

    Returns:
        None. The figure is displayed with fig.show().
    """
    # Single lookup for deviation cells. Looking the value up directly (instead of first
    # dispatching on a leading '+' / '-') is what lets '>+3 SD' and '<-3 SD' receive their
    # colors, and lets integer buckets be colored without calling str methods on them.
    deviation_colors = {
        '>+3 SD': 'lightgreen',
        '+2-3 SD': 'yellow',
        '+1-2 SD': 'orange',
        '+<1 SD': 'white',
        '<-3 SD': 'lightcoral',
        '-2-3 SD': 'coral',
        '-1-2 SD': 'lightblue',
        '-<1 SD': 'white',
        3: 'lightgreen',
        2: 'yellow',
        1: 'orange',
        0: 'white',
        -1: 'lightblue',
        -2: 'coral',
        -3: 'lightcoral',
    }

    def deviation_color(value):
        # Unknown labels, NaN and unhashable cell values all fall back to white
        try:
            return deviation_colors.get(value, 'white')
        except TypeError:
            return 'white'

    # Plotly expects one list of colors per column
    fill_colors_by_column = []
    for col in df.columns:
        if col == 'Ticker':
            column_colors = ['lightgrey'] * len(df)
        elif 'Relative performance' in col:
            # True means (benchmark - asset) > 0, i.e. the asset is underperforming
            column_colors = ['lightgreen' if flag else 'white' for flag in df[col]]
        else:
            column_colors = [deviation_color(value) for value in df[col]]
        fill_colors_by_column.append(column_colors)

    # Replace boolean values with descriptive text for Sortino differences
    display_df = df.copy()
    for col in df.columns:
        if 'Relative performance' in col:
            display_df[col] = display_df[col].apply(lambda x: 'Underperforming' if x else 'Overperforming')

    # Create the Plotly table
    fig = go.Figure(data=[go.Table(
        header=dict(
            values=['<b>' + col.replace('_', ' ') + '</b>' for col in display_df.columns],
            fill_color='paleturquoise',
            align='center',
            font=dict(color='black', size=12)
        ),
        cells=dict(
            values=[display_df[col] for col in display_df.columns],
            fill_color=fill_colors_by_column,
            align='center',
            font=dict(color='black', size=11)
        )
    )])

    # Update layout for aesthetics
    fig.update_layout(
        title=title,
        template='plotly_white',
        height=800,
        margin=dict(l=50, r=50, t=80, b=200)  # Increased bottom margin for legend
    )

    # Add a comprehensive legend using annotations
    legend_text = (
        "<b>Legend:</b><br>"
        "<b>Deviation Directions:</b><br>"
        "Light Green: >+3 SD (Significantly Above Mean)<br>"
        "Yellow: +2-3 SD (Above Mean)<br>"
        "Orange: +1-2 SD (Slightly Above Mean)<br>"
        "Light Coral: <-3 SD (Significantly Below Mean)<br>"
        "Coral: -2-3 SD (Below Mean)<br>"
        "Light Blue: -1-2 SD (Slightly Below Mean)<br>"
        "White: Within 1 SD<br><br>"
        "<b>Performance Indicators:</b><br>"
        "Light Green: Underperforming<br>"
        "White: Overperforming"
    )

    fig.add_annotation(
        text=legend_text,
        showarrow=False,
        xref="paper", yref="paper",
        x=0.5, y=-0.3,
        xanchor='center',
        yanchor='top',
        font=dict(color='black', size=12)
    )

    # Show the table
    fig.show()

def compute_rolling_sortino_ratios(df, n, risk_free_rate=0.0):
    """
    Computes the rolling "n" day Sortino ratios for a DataFrame of stock prices.

    For every window of `n` consecutive daily excess returns the ratio is
    mean(excess returns) / downside deviation, where the downside deviation is the
    root-mean-square of the NEGATIVE excess returns in the window.

    NOTE: the mean of squares is taken over the negative observations only, not over
    all `n` observations, so values differ from the variant that divides by the full
    window length.

    Parameters:
        df (pd.DataFrame): DataFrame containing stock prices with ticker symbols as columns.
        n (int): The window size for the rolling calculation.
        risk_free_rate (float): ANNUAL risk-free rate; it is divided by 252 here to get a
                                daily rate (default is 0.0).

    Returns:
        pd.DataFrame: Rolling "n" day Sortino ratios, same shape as `df`. A cell is NaN until
                      `n` valid returns are available and whenever the window contains no
                      negative excess return.
    """
    # Daily excess returns over the (de-annualised) risk-free rate
    excess_returns = df.pct_change() - risk_free_rate / 252

    def downside_deviation(window_values):
        # window_values is a plain ndarray because of raw=True below
        losses = window_values[window_values < 0]
        if losses.size == 0:
            # No losing day in the window: the deviation is undefined
            return np.nan
        return np.sqrt(np.mean(losses ** 2))

    rolling_window = excess_returns.rolling(window=n)

    # raw=True hands each window over as an ndarray instead of building a Series per
    # window, which is markedly faster on wide, long price frames.
    rolling_downside_dev = rolling_window.apply(downside_deviation, raw=True)
    rolling_mean_excess_returns = rolling_window.mean()

    return rolling_mean_excess_returns / rolling_downside_dev

def compute_rolling_sortino_ratios_benchmark_minus_asset(df,benchmark_ticker, n, risk_free_rate=0.0):
    """
    Computes, for every ticker, the benchmark's rolling "n" day Sortino ratio minus the
    ticker's own rolling "n" day Sortino ratio.

    The ratios themselves come from compute_rolling_sortino_ratios, so both functions
    always share one definition of the Sortino ratio.

    Parameters:
        df (pd.DataFrame): DataFrame containing stock prices with ticker symbols as columns.
        benchmark_ticker (str): Column of `df` to use as the benchmark.
        n (int): The window size for the rolling calculation.
        risk_free_rate (float): Annual risk-free rate forwarded to compute_rolling_sortino_ratios
                                (default is 0.0).

    Returns:
        pd.DataFrame: Same shape and column names as `df`; each cell is
                      benchmark Sortino - asset Sortino. Positive values mean the benchmark's
                      ratio is higher. The benchmark's own column is zero wherever its ratio
                      is defined.

    Raises:
        KeyError: If `benchmark_ticker` is not a column of `df`.
    """
    # Fail before the expensive rolling computation rather than after it
    if benchmark_ticker not in df.columns:
        raise KeyError(f"Benchmark ticker {benchmark_ticker!r} is not a column of df")

    rolling_sortino_ratio = compute_rolling_sortino_ratios(df, n, risk_free_rate=risk_free_rate)
    benchmark_sortino = rolling_sortino_ratio[benchmark_ticker]

    # (asset - benchmark) aligned on the row index, then negated to get benchmark - asset
    asset_minus_benchmark = rolling_sortino_ratio.sub(benchmark_sortino, axis=0)
    return -asset_minus_benchmark

def simplify_datetime_index(series):
    """
    Simplifies the DateTime index of a Series to contain only the date (YYYY-MM-DD),
    maintaining it as a DateTimeIndex without timezone information.

    The input is never modified: a copy carrying the new index is returned.

    Timezone-aware indexes are converted to UTC before the timezone is dropped, so the
    resulting date is the UTC calendar date of each timestamp.

    Parameters:
        series (pd.Series): The input Series with a DateTimeIndex.

    Returns:
        pd.Series: A copy of the Series whose index is tz-naive and normalized to midnight.

    Raises:
        TypeError: If the index is not a pd.DatetimeIndex.
    """
    if not isinstance(series.index, pd.DatetimeIndex):
        raise TypeError("The Series index must be a DateTimeIndex.")

    simplified_index = series.index

    # Remove timezone information if present
    if simplified_index.tz is not None:
        simplified_index = simplified_index.tz_convert('UTC').tz_localize(None)

    # Always work on a copy so the caller's Series keeps its original index
    # (previously a tz-naive input had its index overwritten in place).
    simplified = series.copy()
    simplified.index = simplified_index.normalize()

    return simplified

def plot_time_series(all_series, time_frame='1y', title='Time Series Data of Ticker Symbols'):
    """
    Builds a Plotly line chart of the most recent part of a ticker DataFrame.

    Parameters:
        all_series (pd.DataFrame): Time series with ticker symbols as columns and a date-like index.
        time_frame (str): How much history to keep, counted back from the last index entry.
                          One of '1y', '3y', '5y', '10y' (default is '1y').
        title (str): Title for the plot (default is 'Time Series Data of Ticker Symbols').

    Returns:
        The Plotly figure, or None (after printing an error) when `time_frame` is not supported.
    """
    # Supported labels and the number of years each one keeps
    supported_frames = (('1y', 1), ('3y', 3), ('5y', 5), ('10y', 10))
    for label, years_back in supported_frames:
        if time_frame == label:
            window_start = all_series.index[-1] - pd.DateOffset(years=years_back)
            all_series = all_series.loc[all_series.index >= window_start]
            break
    else:
        # No label matched
        print("Error: Invalid time frame")
        return

    fig = px.line(all_series, title=title)

    # Dashed red reference line at zero
    fig.add_hline(y=0, line_dash='dash', line_color='red')

    x_axis_options = dict(tickangle=-45, showgrid=True, zeroline=True)
    y_axis_options = dict(showgrid=True, zeroline=True)
    fig.update_layout(
        xaxis_title='Date',
        yaxis_title='Price',
        template='plotly_dark',
        xaxis=x_axis_options,
        yaxis=y_axis_options,
    )

    return fig
    
def generate_series(tickers, columns=['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume'], period=period, interval=interval):
    """
    Download price data with yfinance and return the requested columns for the given tickers.

    Dots in ticker symbols are replaced with dashes before the download (e.g. 'BRK.B' -> 'BRK-B').
    The dotted form is restored in the result, except in the multi-ticker / multi-column case.

    Parameters:
    - tickers: List of ticker symbols or a single ticker symbol.
    - columns: List of columns to retrieve or a single column to retrieve
      (default is ['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume']).
    - period: Data period to retrieve. The default is the value of the module-level `period`
      at the time this function is defined ('10y' in the configuration cell).
    - interval: Data interval to retrieve. The default is the value of the module-level
      `interval` at the time this function is defined ('1d' in the configuration cell).

    Returns:
    - one ticker, one column: the selected column, renamed to the ticker symbol.
    - several tickers, one column: pd.DataFrame with one column per ticker.
    - one ticker, several columns: pd.DataFrame with the requested columns.
    - several tickers, several columns: pd.DataFrame with MultiIndex columns rebuilt as
      (second level, first level) of the downloaded frame.
    - an empty pd.DataFrame when the download raises or a requested column is missing
      (an error message is printed in both cases).

    NOTE(review): the branches below depend on the column layout returned by yf.download,
    which is not visible here — verify them against the installed yfinance version. In
    particular, if the download does not contain an 'Adj Close' column, calling this function
    with the default `columns` prints an error and returns an empty DataFrame.
    """
    # Ensure tickers and columns are lists
    if isinstance(tickers, str):
        tickers = [tickers]
    if isinstance(columns, str):
        columns = [columns]

    # Use dashes instead of dots in the symbols sent to yfinance
    tickers = [ticker.replace('.', '-') for ticker in tickers]
    try:
        df = yf.download(tickers, period=period, interval=interval, progress=False)
    except Exception as e:
        # Best effort: report the failure and hand back an empty frame instead of raising
        print(f"An error occurred while fetching data: {e}")
        return pd.DataFrame()
    
    # Check if the specified columns exist in the DataFrame
    # (level 0 of the columns is compared against the requested field names)
    missing_columns = [col for col in columns if col not in df.columns.get_level_values(0)]
    if missing_columns:
        print(f"Error: The following columns are not available: {missing_columns}")
        print(f"Possible columns are: {df.columns.get_level_values(0).unique().tolist()}")
        return pd.DataFrame()
    
    df = df[columns]
    
    # Handle the case where there is only one ticker and one column
    # NOTE(review): .rename(<str>) assumes df[columns[0]] is a Series here — confirm for MultiIndex downloads
    if len(tickers) == 1 and len(columns) == 1:
        return df[columns[0]].rename(tickers[0].replace('-', '.'))
    
    # Handle the case where there is only one ticker
    if len(tickers) == 1:
        # NOTE(review): col.replace assumes plain string column labels (flat columns) — confirm
        df.columns = [col.replace('-', '.') for col in df.columns]
    else:
        # If only one column is selected, return a DataFrame with tickers as column names
        if len(columns) == 1:
            df = df[columns[0]]
            # Restore the dotted ticker symbols
            df.columns = [col.replace('-', '.') for col in df.columns]
        else:
            # Swap the two column levels if multiple tickers are requested
            # (the dashed ticker symbols are NOT converted back to dots in this branch)
            if isinstance(df.columns, pd.MultiIndex):
                df.columns = pd.MultiIndex.from_tuples([(col[1], col[0]) for col in df.columns.values])
            else:
                # Flat labels are split on '.' into a two-level index
                df.columns = pd.MultiIndex.from_tuples([(col.split('.')[0], col.split('.')[1]) for col in df.columns])
    
    return df

def filter_assets_by_positive_spread_std(asset_spreads):
    """
    Tests whether the latest spread is at least one standard deviation above the mean
    of the non-negative spreads.

    Parameters:
        asset_spreads (pd.Series or pd.DataFrame): Spread history; the last row is the latest value.

    Returns:
        Result of `latest >= mean + std`, where mean and std are taken over the entries >= 0
        (a boolean for a Series input, a boolean Series per column for a DataFrame input).
    """
    non_negative = asset_spreads[asset_spreads >= 0]
    cutoff = non_negative.mean() + non_negative.std()
    return asset_spreads.iloc[-1] >= cutoff

def filter_assets_below_negative_std(asset_spreads):
    """
    Marks the spreads that lie more than 0.75 standard deviations below the mean of the
    negative spreads.

    Parameters:
        asset_spreads (pd.Series): Spread history.

    Returns:
        pd.Series: Boolean mask aligned with `asset_spreads`
                   (True where value < mean_neg - 0.75 * std_neg), or an empty boolean
                   Series when there is no negative spread at all.

    Raises:
        TypeError: If `asset_spreads` is not a pandas Series.
    """
    if not isinstance(asset_spreads, pd.Series):
        raise TypeError("asset_spreads must be a pandas Series")

    below_zero = asset_spreads.loc[asset_spreads < 0]
    if below_zero.empty:
        return pd.Series(dtype=bool)

    # Statistics are taken over the negative observations only
    cutoff = below_zero.mean() - 0.75 * below_zero.std()
    return asset_spreads < cutoff

def get_sector_info(ticker):
    """
    Looks up the sector and industry of a ticker through yfinance.

    Best effort: any exception raised during the lookup is swallowed and 'N/A' is
    reported for both fields.

    Parameters:
        ticker (str): Ticker symbol.

    Returns:
        dict: {'Ticker': ticker, 'Sector': ..., 'Sub-Industry': ...}; fields that are missing
              from the yfinance info default to 'N/A'.
    """
    not_available = {'Ticker': ticker, 'Sector': 'N/A', 'Sub-Industry': 'N/A'}
    try:
        stock = yf.Ticker(ticker)
        sector = stock.info.get('sector', 'N/A')
        sub_industry = stock.info.get('industry', 'N/A')
    except Exception:
        return not_available
    return {'Ticker': ticker, 'Sector': sector, 'Sub-Industry': sub_industry}

def fetch_ticker_info(ticker):
    """
    Fetches sector, sub-industry and market capitalisation for one ticker.

    get_market_caps maps this function over all tickers and unpacks the results with
    zip(*results), so a 3-tuple must be returned for every ticker. (The function
    previously only printed the data and returned None, which made that unpacking fail.)

    Parameters:
        ticker (str): Ticker symbol in the form expected by yfinance.

    Returns:
        tuple: (sector, sub_industry, market_cap). Sector and sub-industry are 'N/A' when
               unavailable (see get_sector_info); market_cap is None when yfinance does not
               provide 'marketCap' or the lookup raises.
    """
    info = get_sector_info(ticker)
    try:
        market_cap = yf.Ticker(ticker).info.get('marketCap')
    except Exception:
        # Best effort, consistent with get_sector_info: one failing ticker must not abort the batch
        market_cap = None
    return info['Sector'], info['Sub-Industry'], market_cap

def get_market_caps(table):
    """
    Adds 'Sector', 'Sub-Industry' and 'Market Cap' columns to a table of symbols.

    The lookups run concurrently through a ThreadPoolExecutor. Each call to
    fetch_ticker_info is expected to yield a (sector, sub_industry, market_cap) triple,
    because the results are unpacked into exactly three sequences.

    Parameters:
        table (pd.DataFrame): Must contain a 'Symbol' column. It is modified in place.

    Returns:
        pd.DataFrame: The same `table` object with the three columns added.
    """
    # The two share-class symbols are written with a dash when passed to the lookup
    dashed_symbols = {'BRK.B': 'BRK-B', 'BF.B': 'BF-B'}
    tickers = [dashed_symbols.get(symbol, symbol) for symbol in table['Symbol'].tolist()]

    with ThreadPoolExecutor() as pool:
        results = list(pool.map(fetch_ticker_info, tickers))

    # Turn the list of triples into three parallel sequences
    sectors, sub_industries, market_caps = zip(*results)

    table['Sector'] = sectors
    table['Sub-Industry'] = sub_industries
    table['Market Cap'] = market_caps

    return table

def get_market_cap_threshold_companies(info, thresholds=(50, 80)):
    """
    Calculates market cap rankings and identifies companies contributing to specified cumulative market cap thresholds.

    Companies are sorted by market cap in descending order. For each threshold, the result
    holds every company up to and including the first one at which the cumulative share of
    total market cap reaches the threshold.

    Parameters:
        info (pd.DataFrame): DataFrame containing at least 'Symbol' and 'Market Cap' columns.
        thresholds (iterable of float): Cumulative market-cap percentages to report
                                        (default is (50, 80)).

    Returns:
        dict: A dictionary where keys are threshold labels (e.g., 'Top 50%') and values are lists of company dictionaries
              containing 'Symbol', 'Market Cap', 'Market Cap %', 'Cumulative Market Cap %', and 'Rank'.
              If no company reaches a threshold (empty input, zero or missing total, or rounding
              just below the threshold), the list holds all companies instead of raising.
    """
    report_columns = ['Symbol', 'Market Cap', 'Market Cap %', 'Cumulative Market Cap %', 'Rank']

    # Largest companies first; sort_values returns a new frame, so `info` is left untouched
    market_caps = (
        info[['Symbol', 'Market Cap']]
        .sort_values(by='Market Cap', ascending=False)
        .reset_index(drop=True)
    )

    total_market_cap = market_caps['Market Cap'].sum()

    # Individual and running share of the total, in percent
    market_caps['Market Cap %'] = (market_caps['Market Cap'] / total_market_cap) * 100
    market_caps['Cumulative Market Cap %'] = market_caps['Market Cap %'].cumsum()

    # Rank 1 is the largest company
    market_caps['Rank'] = market_caps.index + 1

    threshold_dict = {}
    for threshold in thresholds:
        # Row labels at which the cumulative share meets or exceeds the threshold
        reached = market_caps.index[market_caps['Cumulative Market Cap %'] >= threshold]

        if len(reached) > 0:
            # .loc slicing is label based and inclusive of the end label
            companies = market_caps.loc[:reached[0], report_columns]
        else:
            # Threshold never reached: report everything rather than raising IndexError
            companies = market_caps[report_columns]

        threshold_dict[f'Top {threshold}%'] = companies.to_dict('records')

    return threshold_dict

def remove_weekends_and_holidays(df, country='US'):
    """
    Removes weekend and holiday rows from a DataFrame with a DateTime index.

    Holidays come from the `holidays` package for the given country. The calendar is built
    explicitly for every year covered by the index: a holiday object created without
    `years` starts out empty and is only filled lazily on individual date lookups, so
    handing it to `isin` directly would not remove any holiday.

    NOTE(review): these are the country's public holidays, which need not coincide with
    exchange trading holidays — confirm this is the intended calendar.

    Parameters:
        df (pd.DataFrame): DataFrame with DateTime index.
        country (str): Country code for holidays. Default is 'US'.

    Returns:
        pd.DataFrame: DataFrame without weekend and holiday data.

    Raises:
        TypeError: If the index is not a pd.DatetimeIndex.
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        raise TypeError("DataFrame index must be a DateTimeIndex")

    # Remove weekends (Monday=0 ... Sunday=6)
    df_weekdays = df[df.index.dayofweek < 5]
    if df_weekdays.empty:
        # Nothing left to check, and there is no year range to build a calendar for
        return df_weekdays

    # Compare on tz-naive calendar dates; tz_localize(None) keeps the local wall-clock date
    calendar_days = df_weekdays.index
    if calendar_days.tz is not None:
        calendar_days = calendar_days.tz_localize(None)
    calendar_days = calendar_days.normalize()

    # Pre-compute the holidays for every year present in the data
    first_year = int(calendar_days.year.min())
    last_year = int(calendar_days.year.max())
    country_holidays = holidays.CountryHoliday(country, years=range(first_year, last_year + 1))
    holiday_days = pd.DatetimeIndex(pd.to_datetime(list(country_holidays.keys())))

    # Remove holidays
    df_clean = df_weekdays[~calendar_days.isin(holiday_days)]

    return df_clean

# Spread mode that the computation cell forwards to qc.create_spreads via create_and_concat_spreads.
# NOTE(review): the meaning of 'standard' is defined in Quantapp.Computation, not in this notebook.
mode='standard'

def create_and_concat_spreads(dataframes, benchmark_series, time_frame, mode):
    """
    Runs qc.create_spreads on every DataFrame and joins the results column-wise.

    Parameters:
        dataframes (iterable of pd.DataFrame): Frames handed to qc.create_spreads one by one.
        benchmark_series: Benchmark data forwarded unchanged to qc.create_spreads.
        time_frame: Forwarded as the `time_frame` keyword of qc.create_spreads.
        mode: Forwarded as the `mode` keyword of qc.create_spreads.

    Returns:
        pd.DataFrame: All spread frames concatenated along the columns; when a column name
                      occurs more than once only its first occurrence is kept.
    """
    spreads_per_group = []
    for frame in dataframes:
        spreads_per_group.append(
            qc.create_spreads(frame, benchmark_series, time_frame=time_frame, mode=mode)
        )

    merged = pd.concat(spreads_per_group, axis=1)

    # The same ticker can appear in several groups; keep the first copy of each column
    first_occurrence = ~merged.columns.duplicated()
    return merged.loc[:, first_occurrence]


def create_pairwise_spreads(etf_dataframes, window=20):
    """
    Creates pairwise spreads of rolling returns between assets within each category.

    For every unordered pair (ticker1, ticker2) of a category, in column order, the spread is
    pct_change(window) of ticker1 minus pct_change(window) of ticker2, stored in a column
    named "ticker1-ticker2".

    Parameters:
        etf_dataframes (dict): Dictionary of DataFrames where each key is a category
                              and values are DataFrames with ticker columns
        window (int): Number of periods used for the percentage change (default=20)

    Returns:
        dict: Dictionary where keys match the input categories and values are DataFrames
              containing the spreads between rolling returns of unique asset pairs.
              Categories with fewer than two tickers that have any data are omitted, as are
              pairs that have no row in which both tickers are present.
    """
    from itertools import combinations

    pairwise_spreads = {}

    for category, df in etf_dataframes.items():
        # Tickers that have at least one non-NaN value
        valid_tickers = [ticker for ticker in df.columns if not df[ticker].isna().all()]

        # A spread needs at least two usable assets
        if len(valid_tickers) < 2:
            continue

        # Compute each ticker's rolling return once instead of once per pair
        rolling_returns = {ticker: df[ticker].pct_change(window) for ticker in valid_tickers}

        spreads = {}
        for ticker1, ticker2 in combinations(valid_tickers, 2):
            # Skip pairs that never have data on the same row
            if df[[ticker1, ticker2]].dropna().empty:
                continue
            spreads[f"{ticker1}-{ticker2}"] = rolling_returns[ticker1] - rolling_returns[ticker2]

        # Build the frame in one go; inserting columns one at a time fragments it
        category_spreads = pd.DataFrame(spreads, index=df.index)

        # Store only if we have valid spreads
        if not category_spreads.empty:
            pairwise_spreads[category] = category_spreads

    return pairwise_spreads


In [None]:
# Load: benchmark prices, the '^IRX' series and the ticker lists for all asset groups
#-----------------------------------------------------------------------------------------
# sp500 is the raw yf.download frame for SPY; it feeds qc.calculate_returns in the computation cell.
sp500 = yf.download('SPY', period=period, interval=interval,progress=False)
# NOTE(review): this rebinds `risk_free_rate`, which the configuration cell defined as a scalar
# (0.02 / 252), to the downloaded '^IRX' DataFrame. Code that runs after this cell sees the
# DataFrame — consider a separate name.
risk_free_rate = yf.download('^IRX', period=period, interval=interval, progress=False)
# Ticker lists from Quantapp's EconomicData; indexed below by group name (e.g. 'INDICES').
market_assets = qe.get_market_assets()

#Load: retrieve prices
# One generate_series call (i.e. one yfinance download) per asset group, 'Close' column only.
# For a group with several tickers the result has one column per ticker.
indices_df             = generate_series(market_assets['INDICES'], columns=['Close'])
sectors_df            = generate_series(market_assets['SECTORS'], columns=['Close'])
industries_df         = generate_series(market_assets['INDUSTRIES'], columns=['Close'])
bonds_df = generate_series(market_assets['BONDS'], columns=['Close'])
precious_metals_df = generate_series(market_assets['PRECIOUS_METALS'], columns=['Close'])
# NOTE(review): crypto and the major/minor/exotic currency-pair frames are downloaded here but
# are commented out of both etf_prices and etf_dataframes below.
crypto_df = generate_series(market_assets['CRYPTO'], columns=['Close'])
#crypto_df = crypto_df.loc[sp500.index]
energy_df = generate_series(market_assets['ENERGY'], columns=['Close'])
foreign_markets_df = generate_series(market_assets['FOREIGN_MARKETS'], columns=['Close'])
primary_sector_etfs_df = generate_series(market_assets['PRIMARY_SECTORS'], columns=['Close'])
major_currency_pairs_df = generate_series(market_assets['MAJOR_CURRENCY_PAIRS'], columns=['Close'])
minor_currency_pairs_df = generate_series(market_assets['MINOR_CURRENCY_PAIRS'], columns=['Close'])
exotic_currency_pairs_df = generate_series(market_assets['EXOTIC_CURRENCY_PAIRS'], columns=['Close'])
cross_currency_pairs_df = generate_series(market_assets['CROSS_CURRENCY_PAIRS'], columns=['Close']) 
#major_currency_pairs_df = major_currency_pairs_df.loc[sp500.index]
#minor_currency_pairs_df = minor_currency_pairs_df.loc[sp500.index]
#exotic_currency_pairs_df = exotic_currency_pairs_df.loc[sp500.index]
#cross_currency_pairs_df = cross_currency_pairs_df.loc[sp500.index]
capitalizations_df = generate_series(market_assets['CAPITALIZATIONS'], columns=['Close'])
innovation_df = generate_series(market_assets['INNOVATION'], columns=['Close'])
long_leveraged_df = generate_series(market_assets['LONG_LEVERAGE'], columns=['Close'])
short_leveraged_df = generate_series(market_assets['SHORT_LEVERAGE'], columns=['Close'])
single_factor_df = generate_series(market_assets['SINGLE_FACTOR'], columns=['Close'])
multi_factor_df = generate_series(market_assets['MULTI_FACTOR'], columns=['Close'])
minimum_volatility_df = generate_series(market_assets['MINIMUM_VOLATILITY'], columns=['Close'])


# Combine all group frames column-wise into one price table. A ticker that belongs to
# several groups appears several times after the concat; only its first column is kept.
# NOTE(review): cross_currency_pairs_df IS included here but is commented out of
# etf_dataframes below, so those tickers get Sortino ratios but no spreads/correlations —
# confirm this asymmetry is intended.
etf_prices = pd.concat([
    indices_df,
    sectors_df,
    industries_df,
    bonds_df,
    precious_metals_df,
#    crypto_df,
    energy_df,
    foreign_markets_df,
    primary_sector_etfs_df,
#    major_currency_pairs_df,
#    minor_currency_pairs_df,
#    exotic_currency_pairs_df,
    cross_currency_pairs_df,
    capitalizations_df,
    innovation_df,
    long_leveraged_df,
    short_leveraged_df,
    single_factor_df,
    multi_factor_df,
    minimum_volatility_df
], axis=1).loc[:, lambda df: ~df.columns.duplicated()]

# Benchmark price column; raises KeyError if the `benchmark` ticker is in none of the groups above.
benchmark_series           = etf_prices[benchmark]

# Group name -> price DataFrame. The values of this dict are the inputs of the spread
# computations for all four time frames (week, short, mid, long) in the next cell.
etf_dataframes = {
    "indices": indices_df, 
    "sectors": sectors_df, 
    "industries": industries_df, 
    "bonds": bonds_df, 
    "precious_metals": precious_metals_df, 
   # "crypto": crypto_df, 
    "energy": energy_df,
    "foreign_markets": foreign_markets_df, 
    "primary_sector_etfs": primary_sector_etfs_df, 
 #   "major_currency_pairs": major_currency_pairs_df, 
 #   "minor_currency_pairs": minor_currency_pairs_df, 
 #   "exotic_currency_pairs": exotic_currency_pairs_df, 
 #   "cross_currency_pairs": cross_currency_pairs_df, 
    "capitalizations": capitalizations_df, 
    "innovation": innovation_df,
    "long_leveraged": long_leveraged_df,
    "short_leveraged": short_leveraged_df,
    "single_factor": single_factor_df,
    "multi_factor": multi_factor_df,
    "minimum_volatility": minimum_volatility_df
}

print('Computing the correlation matrix...')
# One correlation matrix per group (DataFrame.corr).
# NOTE(review): the inputs are the 'Close' price frames, so these are correlations of price
# levels, not of returns — confirm that is intended.
etf_dataframes_correlation_matrices = {key: value.corr() for key, value in etf_dataframes.items()}

In [None]:
#Computations...

#1. return spreads between benchmark and all assets
#2. the sortino ratios for all assets
#3. the spreads between the sortino ratios of all assets and the benchmark

# Benchmark (SPY) returns at monthly, weekly and daily frequency.
# The progress message used to announce the spread computation, which only starts further
# down and prints that same message itself.
print("Computing benchmark returns (monthly, weekly, daily)...")
sp500_monthly_returns = qc.calculate_returns(sp500,frequency='monthly')
sp500_weekly_returns = qc.calculate_returns(sp500,frequency='weekly')
sp500_daily_returns = qc.calculate_returns(sp500,frequency='daily')


#etf_dataframes to list
#-----------------------------------------------------------

#the spreads between the benchmark and all assets
#Calculate: spreads


print("Computing spreads between benchmark and all assets...")
# Create and concatenate spreads for the weekly time frame
benchmark_minus_etf_week = create_and_concat_spreads(
    list(etf_dataframes.values()), benchmark_series, time_frame=time_frame_week, mode=mode
)

print(f"Computing spreads between benchmark and all assets for the short time frame ({time_frame_short} days)...")
# Create and concatenate spreads for the short time frame
benchmark_minus_etf_short = create_and_concat_spreads(
    list(etf_dataframes.values()), benchmark_series, time_frame=time_frame_short, mode=mode
)

print(f"Computing spreads between benchmark and all assets for the mid time frame ({time_frame_mid} days)...")
benchmark_minus_etf_mid = create_and_concat_spreads(
    list(etf_dataframes.values()), benchmark_series, time_frame=time_frame_mid, mode=mode
)

print(f"Computing spreads between benchmark and all assets for the long time frame ({time_frame_long} days)...")
benchmark_minus_etf_long = create_and_concat_spreads(
    list(etf_dataframes.values()), benchmark_series, time_frame=time_frame_long, mode=mode
)
print(" ")
print("-----------------------------------------------------------------------")


#-----------------------------------------------------------

#the sortino ratios for all assets
print(f"computing the rolling sortino ratios for all assets for the short time frame ({time_frame_short} days)...")
rolling_sortino_ratios_etf_21 = compute_rolling_sortino_ratios(etf_prices, n=21)

print(f"computing the rolling sortino ratios for all assets for the mid time frame ({time_frame_mid} days)...")
rolling_sortino_ratios_etf_50 = compute_rolling_sortino_ratios(etf_prices, n=50)

print(f"computing the rolling sortino ratios for all assets for the long time frame ({time_frame_long} days)...")
rolling_sortino_ratios_etf_200 = compute_rolling_sortino_ratios(etf_prices, n=200)

print(" ")
print("-----------------------------------------------------------------------")

#the spreads between the sortino ratios of all assets and the benchmark
print(f"computing the rolling sortino ratios for all assets minus the benchmark for the short time frame ({time_frame_short} days)...")
rolling_sortino_ratios_benchmark_minus_etf_21  = compute_rolling_sortino_ratios_benchmark_minus_asset(etf_prices,'SPY', n=21)

print(f"computing the rolling sortino ratios for all assets minus the benchmark for the mid time frame ({time_frame_mid} days)...")
rolling_sortino_ratios_benchmark_minus_etf_50  = compute_rolling_sortino_ratios_benchmark_minus_asset(etf_prices,'SPY', n=50)

print(f"computing the rolling sortino ratios for all assets minus the benchmark for the long time frame ({time_frame_long} days)...")
rolling_sortino_ratios_benchmark_minus_etf_200  = compute_rolling_sortino_ratios_benchmark_minus_asset(etf_prices,'SPY', n=200)

print(" ")
print("-----------------------------------------------------------------------")

print("Computing pairwise spreads between assets within each category...")
print(f"Computing pairwise spreads for the short time frame ({time_frame_short} days)...")
pairwise_spreads_21 = create_pairwise_spreads(etf_dataframes, window=time_frame_short)

print(f"Computing pairwise spreads for the mid time frame ({time_frame_mid} days)...")
pairwise_spreads_50 = create_pairwise_spreads(etf_dataframes, window=time_frame_mid)

print(f"Computing pairwise spreads for the long time frame ({time_frame_long} days)...")
pairwise_spreads_200 = create_pairwise_spreads(etf_dataframes, window=time_frame_long)


In [None]:
# The earlier single-time-frame version of plot_pairwise_spreads (one figure per
# window, with a single category dropdown) used to be parked in this cell inside a
# triple-quoted string. A bare string literal is still evaluated and echoed as the
# cell's output, and the code duplicated the function defined in the next cell, so it
# has been removed. Recover it from version control if it is ever needed again.

In [None]:
#plot pairwise spreads

#-----------------------------------------------------------
# Plot the pairwise spreads for each category and time frame
# Create interactive plots for pairwise spreads by category and time frame

def plot_pairwise_spreads(pairwise_spreads_dict_by_timeframe, title="Pairwise Spreads", time_frames=None):
    """
    Creates an interactive plot showing pairwise spreads for each category and time frame.

    The top panel shows every spread of the selected (time frame, category) combination
    as a line over time; the bottom panel shows one bar per spread with the z-score of
    its latest value relative to the spread's own full history.

    NOTE: the two dropdowns are independent plotly update menus. Choosing a time frame
    always shows the FIRST category, and choosing a category always shows the FIRST
    time frame; the menus cannot be combined to reach an arbitrary pair.

    Parameters:
        pairwise_spreads_dict_by_timeframe (dict): Dictionary with time frames as keys and
            dictionaries of category spreads (category -> DataFrame, one column per spread)
            as values. The categories of the first time frame are looked up in every
            other time frame as well.
        title (str): Leading part of the figure title; the selected category and time
            frame are appended to it.
        time_frames (dict | None): Optional mapping from time frame key to its length in
            days (e.g. {'short': 21}). Keys without an integer entry - and all keys when
            this is None - are displayed by their key instead.

    Returns:
        plotly.graph_objects.Figure: the assembled figure (not shown).

    Raises:
        ValueError: if no time frame is supplied or the first time frame has no categories.
    """
    if not pairwise_spreads_dict_by_timeframe:
        raise ValueError("pairwise_spreads_dict_by_timeframe must contain at least one time frame")

    # Get list of time frames and categories
    time_frame_keys = list(pairwise_spreads_dict_by_timeframe.keys())
    first_time_frame = time_frame_keys[0]
    categories = list(pairwise_spreads_dict_by_timeframe[first_time_frame].keys())
    if not categories:
        raise ValueError("the first time frame must contain at least one category")
    first_category = categories[0]

    # The default time_frames=None must not break the label/title helpers below.
    display_days = time_frames or {}

    def describe_time_frame(key):
        """Human readable label for a time frame key ("21 Days", or the key itself)."""
        days = display_days.get(key)
        return f"{days} Days" if isinstance(days, int) else str(key)

    def figure_title(category, key):
        """Figure title for one (category, time frame) selection."""
        return f"{title} for {category} ({describe_time_frame(key)})"

    def latest_z_score(series):
        """Z-score of the last value of `series`; NaN when it cannot be computed."""
        if len(series) == 0:
            return float("nan")
        spread_std = series.std()
        # `not (std > 0)` also catches a NaN standard deviation (fewer than two points).
        if not (spread_std > 0):
            return float("nan")
        return (series.iloc[-1] - series.mean()) / spread_std

    # Create figure with subplots.
    # The x axes are deliberately NOT shared: the top panel has a date axis while the
    # bottom panel has a categorical axis (one bar per spread). Per the plotly axis
    # reference, matching axes must have the same type, and shared_xaxes=True also
    # hides the tick labels of the upper axis, i.e. the dates.
    fig = make_subplots(rows=2, cols=1,
                        shared_xaxes=False,
                        vertical_spacing=0.15,
                        subplot_titles=("Pairwise Spreads", "Z-Scores"),
                        row_heights=[0.7, 0.3])

    # (time frame, category) -> indices of the traces belonging to that combination
    trace_indices = {}
    trace_idx = 0

    # Add all traces for all time frames and categories (only the first combination is visible)
    for time_frame_key in time_frame_keys:
        for category in categories:
            df = pairwise_spreads_dict_by_timeframe[time_frame_key][category]
            combo = (time_frame_key, category)
            trace_indices[combo] = []
            initially_visible = combo == (first_time_frame, first_category)

            for spread in df.columns:
                # Spread line (top panel)
                fig.add_trace(
                    go.Scatter(
                        x=df.index,
                        y=df[spread],
                        mode="lines",
                        name=spread,
                        line=dict(width=1.5),
                        opacity=0.8,
                        visible=initially_visible,
                        hovertemplate='%{y:.4f}<extra>%{fullData.name} (%{x})</extra>'
                    ),
                    row=1, col=1
                )
                trace_indices[combo].append(trace_idx)
                trace_idx += 1

                # Z-score bar (bottom panel)
                z_score = latest_z_score(df[spread])
                fig.add_trace(
                    go.Bar(
                        x=[spread],
                        y=[z_score],
                        name=f"Z-Score: {spread}",
                        text=f"{z_score:.2f}",
                        textposition='auto',
                        visible=initially_visible,
                        showlegend=False
                    ),
                    row=2, col=1
                )
                trace_indices[combo].append(trace_idx)
                trace_idx += 1

    def visibility_for(combo):
        """Per-trace visibility list that shows only the traces of `combo`."""
        mask = [False] * trace_idx
        for i in trace_indices[combo]:
            mask[i] = True
        return mask

    # Time frame buttons: switch the time frame, always showing the first category
    timeframe_buttons = [
        dict(
            label=describe_time_frame(time_frame_key),
            method="update",
            args=[
                {"visible": visibility_for((time_frame_key, first_category))},
                {"title": figure_title(first_category, time_frame_key)}
            ]
        )
        for time_frame_key in time_frame_keys
    ]

    # Category buttons: switch the category, always showing the first time frame
    category_buttons = [
        dict(
            label=category,
            method="update",
            args=[
                {"visible": visibility_for((first_time_frame, category))},
                {"title": figure_title(category, first_time_frame)}
            ]
        )
        for category in categories
    ]

    # Update layout with dropdown menus
    fig.update_layout(
        title=figure_title(first_category, first_time_frame),
        template="plotly_dark",
        updatemenus=[
            # Time frame dropdown
            dict(
                active=0,
                buttons=timeframe_buttons,
                direction="down",
                pad={"r": 10, "t": 10},
                showactive=True,
                x=0.05,
                xanchor="left",
                y=1.15,
                yanchor="top",
                bgcolor="rgba(50, 50, 50, 0.7)",
                font=dict(color="white"),
                name="Time Frame"
            ),
            # Category dropdown
            dict(
                active=0,
                buttons=category_buttons,
                direction="down",
                pad={"r": 10, "t": 10},
                showactive=True,
                x=0.35,
                xanchor="left",
                y=1.15,
                yanchor="top",
                bgcolor="rgba(50, 50, 50, 0.7)",
                font=dict(color="white"),
                name="Category"
            )
        ],
        # Add annotations for the dropdowns
        annotations=[
            dict(
                text="Time Frame:",
                x=0.01,
                y=1.15,
                xref="paper",
                yref="paper",
                showarrow=False,
                font=dict(size=14)
            ),
            dict(
                text="Category:",
                x=0.3,
                y=1.15,
                xref="paper",
                yref="paper",
                showarrow=False,
                font=dict(size=14)
            )
        ],
        height=800,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="left",
            x=0.6
        )
    )

    # Reference lines: zero level in both panels, +/-2 standard deviations for the z-scores
    fig.add_hline(y=0, line_dash="dash", line_color="white", opacity=0.5, row=1, col=1)
    fig.add_hline(y=0, line_dash="dash", line_color="white", opacity=0.5, row=2, col=1)
    fig.add_hline(y=2, line_dash="dot", line_color="red", opacity=0.5, row=2, col=1)
    fig.add_hline(y=-2, line_dash="dot", line_color="red", opacity=0.5, row=2, col=1)

    # Update axes labels
    fig.update_xaxes(title_text="Date", row=1, col=1)
    fig.update_yaxes(title_text="Spread Value", row=1, col=1)
    fig.update_yaxes(title_text="Z-Score", row=2, col=1)

    return fig

# Pairwise spreads grouped by time-frame key; the insertion order (short, mid, long)
# is the order of the time-frame dropdown, and the first key is shown initially.
pairwise_spreads_by_timeframe = dict(
    short=pairwise_spreads_21,
    mid=pairwise_spreads_50,
    long=pairwise_spreads_200,
)

# Number of days behind each time-frame key, used for labels and titles
time_frames_mapping = dict(
    short=time_frame_short,  # 21
    mid=time_frame_mid,      # 50
    long=time_frame_long,    # 200
)

# Build and show one figure with time frame and category dropdowns
print("Creating interactive pairwise spreads plot with time frame and category selection...")
fig_combined = plot_pairwise_spreads(
    pairwise_spreads_by_timeframe,
    title="Pairwise Return Spreads",
    time_frames=time_frames_mapping,
)
fig_combined.show()

In [None]:
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from statsmodels.tsa.stattools import adfuller

# Create monthly resampled ETF dataframes: for every category keep the last
# observation of each calendar month.
# NOTE(review): recent pandas releases deprecate the 'M' alias in favour of 'ME' -
# confirm against the installed pandas version before changing it.
etf_dataframes_monthly = {key: value.resample('M').last() for key, value in etf_dataframes.items()}

# Create correlation matrices for each ETF dataframe using monthly data.
# The correlation is taken on the month-over-month log differences. dropna() with its
# default settings removes every month in which at least one column is NaN, so all
# pairs of a category are correlated over the same set of months.
# NOTE: this rebinds etf_dataframes_correlation_matrices, which an earlier cell
# assigned from value.corr() on the unresampled frames.
etf_dataframes_correlation_matrices = {key: np.log(value).diff().dropna().corr() for key, value in etf_dataframes_monthly.items()}

# Function to get paired correlations in ascending order
def get_sorted_correlations(corr_matrix):
    """
    List the pairwise correlations of a correlation matrix in ascending order.

    Only the strictly lower triangle is read, so every pair appears once and the
    diagonal is skipped. Pairs are labelled "<row label>-<column label>".

    Parameters:
        corr_matrix (pd.DataFrame): correlation matrix with asset labels on both axes.

    Returns:
        tuple: (pair_names, correlations) - two equally long tuples, ordered by
        correlation value. Both are empty when the matrix has no off-diagonal pair
        (for example a category with a single asset), so callers can always unpack
        the result into two names.
    """
    n_rows, n_cols = corr_matrix.shape
    # Row-major positions of the strictly lower triangle (k=-1 excludes the diagonal)
    row_pos, col_pos = np.tril_indices(n_rows, k=-1, m=n_cols)
    values = corr_matrix.values

    corr_pairs = [
        (f"{corr_matrix.index[i]}-{corr_matrix.columns[j]}", values[i, j])
        for i, j in zip(row_pos, col_pos)
    ]

    # Sort by correlation value (stable, so ties keep their matrix order)
    corr_pairs.sort(key=lambda pair: pair[1])

    if not corr_pairs:
        # zip(*[]) would yield nothing and break two-name unpacking at the call site
        return (), ()

    pair_names, pair_corrs = zip(*corr_pairs)
    return pair_names, pair_corrs

# Function to calculate cointegration p-values for specific pairs
def _split_pair_name(pair_name, columns):
    """
    Resolve a "TICKER1-TICKER2" label into two labels that both exist in `columns`.

    Tickers may themselves contain '-', so every hyphen is tried as the separator and
    the first split whose two halves are both present in `columns` wins. Returns None
    when no split matches.
    """
    parts = str(pair_name).split('-')
    for cut in range(1, len(parts)):
        left = '-'.join(parts[:cut])
        right = '-'.join(parts[cut:])
        if left in columns and right in columns:
            return left, right
    return None


def get_cointegration_pvals(df, correlation_pairs):
    """
    Compute a cointegration p-value for each named pair of columns in `df`.

    For every pair the first series is regressed on the second (with intercept) over
    their common non-NaN dates, and an ADF test is run on the regression residuals.

    NOTE(review): the p-value is the plain ADF p-value of the residuals. A
    residual-based (Engle-Granger) cointegration test normally uses its own critical
    values, e.g. statsmodels.tsa.stattools.coint - confirm whether that is wanted here.

    Parameters:
        df (pd.DataFrame): one column per ticker.
        correlation_pairs (list[str]): pair labels of the form "TICKER1-TICKER2".

    Returns:
        tuple: (correlation_pairs, p_values) where p_values[i] belongs to
        correlation_pairs[i]. A neutral p-value of 1.0 is used when the pair cannot be
        resolved to two columns, when fewer than 10 common observations exist, or when
        the test itself fails.
    """
    p_values = []

    for pair_name in correlation_pairs:
        tickers = _split_pair_name(pair_name, df.columns)
        if tickers is None:
            # Keep p_values aligned with correlation_pairs even for unknown labels
            p_values.append(1.0)
            continue
        ticker1, ticker2 = tickers

        # Skip pairs with insufficient data
        series1 = df[ticker1].dropna()
        series2 = df[ticker2].dropna()

        common_idx = series1.index.intersection(series2.index)
        if len(common_idx) < 10:  # Reduced minimum data points needed for monthly data
            p_values.append(1.0)  # Use 1.0 as default p-value when test can't be run
            continue

        s1 = series1.loc[common_idx]
        s2 = series2.loc[common_idx]

        try:
            # Regression of s1 on [1, s2] to get residuals
            X = np.array(s2).reshape(-1, 1)
            X = np.hstack((np.ones(X.shape[0]).reshape(-1, 1), X))
            y = np.array(s1).reshape(-1, 1)

            beta = np.linalg.lstsq(X, y, rcond=None)[0]
            residuals = y - X.dot(beta)

            # ADF test on residuals
            adf_result = adfuller(residuals.flatten(), autolag='AIC')
            p_values.append(adf_result[1])
        except Exception:
            # Best effort: a failed test must not abort the whole category, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            p_values.append(1.0)  # Use 1.0 as default p-value when test fails

    return correlation_pairs, p_values

# Build the correlation / cointegration dashboard.
# The figure is seeded with the first category (dict order) and the dropdown built
# further down swaps in the data of the other categories.
first_key = list(etf_dataframes_correlation_matrices.keys())[0]
first_df = etf_dataframes_monthly[first_key]
first_matrix = etf_dataframes_correlation_matrices[first_key]

# Get sorted correlations
pair_names, pair_corrs = get_sorted_correlations(first_matrix)
pair_names = list(pair_names)  # Convert to list for reuse

# Get cointegration p-values for the same pairs in same order
coint_pair_names, coint_p_values = get_cointegration_pvals(first_df, pair_names)

# Convert p-values to -log10(p) for better visualization.
# Any p for which `p > 0` is False is drawn at the fixed value 15 instead of taking
# the logarithm; that covers p == 0 and, by Python comparison rules, a NaN p as well.
log_p_values = [-np.log10(p) if p > 0 else 15 for p in coint_p_values] if coint_p_values else []

# Create subplots: heatmap on left (spanning both rows), correlation and cointegration
# bar charts stacked on the right
fig = make_subplots(
    rows=2,
    cols=2,
    column_widths=[0.5, 0.5],
    row_heights=[0.5, 0.5],
    specs=[[{"rowspan": 2}, {}], 
           [None, {}]],
    vertical_spacing=0.1,
    horizontal_spacing=0.05,
    subplot_titles=('Monthly Correlation Matrix', 'Monthly Sorted Pairwise Correlations', 'Monthly Cointegration Test (-log10 p-value)')
)

# Add heatmap trace on left side (spanning both rows) - this is trace 0
heatmap = go.Heatmap(
    z=first_matrix.values,
    x=first_matrix.columns,
    y=first_matrix.index,
    colorscale='RdBu_r',
    zmid=0,
    colorbar=dict(title='Correlation', y=0.5, len=0.85)
)
fig.add_trace(heatmap, row=1, col=1)

# Add correlation bar chart on top right - removing its colorbar since it uses the same scale as heatmap
# (this is trace 1)
bar = go.Bar(
    x=pair_names,
    y=pair_corrs,
    marker=dict(
        color=pair_corrs,
        colorscale='RdBu_r',
        showscale=False  # Hide duplicate colorbar
    )
)
fig.add_trace(bar, row=1, col=2)

# Add cointegration bar chart on bottom right (trace 2).
# NOTE(review): the trace is only added when the first category produced at least one
# pair; the dropdown below always supplies three entries per attribute - verify the
# behaviour for a first category without pairs.
if coint_pair_names and log_p_values:
    coint_bar = go.Bar(
        x=coint_pair_names,
        y=log_p_values,
        marker=dict(
            color=log_p_values,
            colorscale='Viridis',
            colorbar=dict(title='-log10(p)', x=1.15, y=0.25, len=0.4)
        )
    )
    fig.add_trace(coint_bar, row=2, col=2)

# Add a horizontal line at .05 for cointegration: bars above -log10(0.05) have p < 0.05
fig.add_hline(y=-np.log10(0.05), line_dash='dash', line_color='red', row=2, col=2)

# Create dropdown menu buttons: one per category. The statistics are recomputed for
# every category here, including the first one that was already computed above.
buttons = []
for key in etf_dataframes_correlation_matrices.keys():
    matrix = etf_dataframes_correlation_matrices[key]
    df = etf_dataframes_monthly[key]
    pair_names, pair_corrs = get_sorted_correlations(matrix)
    pair_names_list = list(pair_names)  # Convert to list for reuse
    coint_pair_names, coint_p_values = get_cointegration_pvals(df, pair_names_list)
    
    log_p_values = [-np.log10(p) if p > 0 else 15 for p in coint_p_values] if coint_p_values else []
    
    # Each list below has one entry per trace, in the order the traces were added:
    # [heatmap, correlation bars, cointegration bars].
    buttons.append(
        dict(
            method='update',
            label=key,
            args=[{
                'z': [matrix.values, None, None],
                'x': [matrix.columns, pair_names_list, coint_pair_names],
                'y': [matrix.index, pair_corrs, log_p_values],
                'marker.color': [None, pair_corrs, log_p_values]
            }]
        )
    )

# Update layout: title, category dropdown and overall height
fig.update_layout(
    title='ETF Analysis: Monthly Correlation and Cointegration',
    updatemenus=[{
        'buttons': buttons,
        'direction': 'down',
        'showactive': True,
        'x': 0.1,
        'y': 1.15,
        'xanchor': 'left',
        'yanchor': 'top'
    }],
    height=900,
)

# Format x-axes: the pair labels are long, so rotate them and shrink the font
fig.update_xaxes(tickangle=90, tickfont=dict(size=8), row=1, col=2)
fig.update_xaxes(tickangle=90, tickfont=dict(size=8), row=2, col=2)

# Add axis titles
fig.update_yaxes(title='Correlation', row=1, col=2)
fig.update_yaxes(title='-log10(p-value)', row=2, col=2)

# Show the figure
fig.show()




In [None]:
def plot_prices_and_returns(df_dict, n=200):
    """
    Show prices, n-window returns and an n-window Sharpe-style ratio per asset group.

    The figure has three stacked panels sharing the x axis:
      1. the prices of every asset in the group,
      2. the n-period percentage change of those prices,
      3. sqrt(n) * rolling mean / rolling std of the n-period returns, both rolling
         statistics taken over n observations (no risk-free rate).

    A single dropdown selects the group; the first group of `df_dict` is shown initially.

    Args:
        df_dict (dict): group name -> pandas DataFrame with a DateTime index (prices)
                        and one column per asset ticker.
        n (int): window length, used both for pct_change(periods=n) and for the
                 rolling statistics.

    Returns:
        None. The figure is displayed with fig.show().
    """
    import numpy as np
    import pandas as pd
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    panel_titles = ["Prices", f"{n}-Window Returns", f"{n}-Window Sharpe Ratio"]
    fig = make_subplots(
        rows=3, cols=1,
        shared_xaxes=True,
        vertical_spacing=0.06,
        subplot_titles=panel_titles
    )

    # One (group name, first trace index, one-past-last trace index) per group
    trace_spans = []
    trace_count = 0

    for position, (group_name, prices) in enumerate(df_dict.items()):
        show_initially = position == 0

        # n-period returns and the ratio derived from their rolling mean / std
        window_returns = prices.pct_change(periods=n)
        mean_of_returns = window_returns.rolling(window=n).mean()
        std_of_returns = window_returns.rolling(window=n).std()
        sharpe = (mean_of_returns * np.sqrt(n)) / std_of_returns

        span_start = trace_count
        for ticker in prices.columns:
            # Three traces per asset, one per panel, always in this order
            panels = (
                (prices, f"{ticker} ({group_name}) - Price"),
                (window_returns, f"{ticker} ({group_name}) - {n}-Win Return"),
                (sharpe, f"{ticker} ({group_name}) - Sharpe"),
            )
            for row_number, (frame, trace_name) in enumerate(panels, start=1):
                fig.add_trace(
                    go.Scatter(
                        x=frame.index,
                        y=frame[ticker],
                        mode='lines',
                        name=trace_name,
                        visible=show_initially
                    ),
                    row=row_number, col=1
                )
                trace_count += 1

        trace_spans.append((group_name, span_start, trace_count))

    # One dropdown entry per group that shows exactly that group's traces
    buttons = [
        {
            "label": label,
            "method": "update",
            "args": [{"visible": [first <= k < stop for k in range(trace_count)]}],
        }
        for label, first, stop in trace_spans
    ]

    # Dropdown menu & layout options
    fig.update_layout(
        updatemenus=[
            {
                "buttons": buttons,
                "direction": "down",
                "showactive": True,
            }
        ],
        title="Prices, Returns & Sharpe by Asset Group",
        template="plotly_dark",
        height=2400
    )

    # Axis labels
    fig.update_xaxes(title_text="Date", row=3, col=1)
    for row_number, axis_label in enumerate(("Price", "Returns", "Sharpe"), start=1):
        fig.update_yaxes(title_text=axis_label, row=row_number, col=1)

    # Dashed zero line across the returns panel (y2) and the Sharpe panel (y3)
    for y_axis in ("y2", "y3"):
        fig.add_shape(
            type="line",
            xref="paper", x0=0, x1=1,
            yref=y_axis, y0=0, y1=0,
            line=dict(color="white", dash="dash")
        )

    # Show the figure
    fig.show()

plot_prices_and_returns(etf_dataframes)


In [None]:
def plot_diff_from_average(df_dict, n=200):
    """
    Plot, for each asset group, every asset's deviation from its group average.

    For each DataFrame in ``df_dict`` the function derives:
      - n-period returns, ``df.pct_change(periods=n)``;
      - a rolling Sharpe-style ratio: the rolling mean of those returns over ``n`` rows,
        multiplied by ``sqrt(n)`` and divided by their rolling standard deviation
        (no risk-free rate is subtracted).

    Subplot 1 shows ``asset return - group mean return`` and subplot 2 shows
    ``asset Sharpe - group mean Sharpe``, where the group mean is taken across the
    columns of the same DataFrame. A dashed white line marks y=0 in both subplots.
    A dropdown switches between groups; only the first group's traces are visible
    initially. An empty ``df_dict`` produces an empty figure instead of an error.

    Args:
        df_dict (dict): Maps group name -> DataFrame with a DateTime index and one
            column per asset (presumably prices, since ``pct_change`` is applied).
        n (int): Look-back used both for ``pct_change(periods=n)`` and for the
            rolling mean / standard deviation windows.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    import numpy as np
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    fig = make_subplots(
        rows=2, cols=1,
        shared_xaxes=True,
        vertical_spacing=0.06,
        subplot_titles=[
            f"{n}-Window Returns Difference (Asset - Average)",
            f"{n}-Window Sharpe Ratio Difference (Asset - Average)"
        ]
    )

    group_names = list(df_dict.keys())
    total_traces = 0
    # One (first_trace_index, last_trace_index) pair per group, in df_dict order.
    group_traces_visibility = []

    for i, (group_name, df) in enumerate(df_dict.items()):
        df_returns = df.pct_change(periods=n)
        rolling_mean = df_returns.rolling(window=n).mean()
        volatility = df_returns.rolling(window=n).std()
        sharpe_ratio = (rolling_mean * np.sqrt(n)) / volatility

        # Cross-sectional (per-date) averages over the assets of this group.
        avg_returns = df_returns.mean(axis=1)
        avg_sharpe = sharpe_ratio.mean(axis=1)

        # Only the first group is shown until the dropdown is used.
        show_initially = (i == 0)

        start_idx = total_traces
        for col in df.columns:
            # Trace order per asset is fixed: returns diff (row 1), then Sharpe diff (row 2).
            fig.add_trace(
                go.Scatter(
                    x=df.index,
                    y=df_returns[col] - avg_returns,
                    mode='lines',
                    name=f"{col} ({group_name}) - {n}-Win Return Diff",
                    visible=show_initially
                ),
                row=1, col=1
            )
            fig.add_trace(
                go.Scatter(
                    x=df.index,
                    y=sharpe_ratio[col] - avg_sharpe,
                    mode='lines',
                    name=f"{col} ({group_name}) - Sharpe Diff",
                    visible=show_initially
                ),
                row=2, col=1
            )
            total_traces += 2

        # A group without columns yields an empty range (end < start) in the button loop.
        group_traces_visibility.append((start_idx, total_traces - 1))

    # One dropdown entry per group: show that group's traces, hide all others.
    buttons = []
    for group_name, (start_idx, end_idx) in zip(group_names, group_traces_visibility):
        visible_config = [False] * total_traces
        for j in range(start_idx, end_idx + 1):
            visible_config[j] = True
        buttons.append({
            "label": group_name,
            "method": "update",
            "args": [{"visible": visible_config}],
        })

    # Dashed zero line across the full width of each subplot
    # ("y" is the y-axis of row 1, "y2" the y-axis of row 2).
    for axis_ref in ("y", "y2"):
        fig.add_shape(
            type="line",
            xref="paper", x0=0, x1=1,
            yref=axis_ref, y0=0, y1=0,
            line=dict(color="white", dash="dash")
        )

    fig.update_layout(
        updatemenus=[{
            "buttons": buttons,
            "direction": "down",
            "showactive": True,
        }],
        title="Differences from Average by Asset Group",
        template="plotly_dark",
        height=1600
    )

    fig.update_xaxes(title_text="Date", row=2, col=1)
    fig.update_yaxes(title_text="Returns Diff", row=1, col=1)
    fig.update_yaxes(title_text="Sharpe Diff", row=2, col=1)

    fig.show()
# Show the differences-from-group-average figure for every ETF group, using the default n=200 window.
# NOTE(review): etf_dataframes comes from an earlier cell — confirm it is a dict of group name -> DataFrame.
plot_diff_from_average(etf_dataframes)

In [None]:
# NOTE(review): this whole cell is disabled. Everything below is a single triple-quoted
# string literal, so plot_combined_table is never defined and none of the calls at the
# bottom are executed.
# The disabled code expects create_sortino_negative_indicators,
# create_sortino_std_deviation_table, create_price_std_deviation_table, etf_prices and
# rolling_sortino_ratios_etf_50 / _200 to exist already; none of them is defined in this cell.
# If this is a notebook cell, the bare string is presumably echoed as the cell output —
# consider deleting the cell or restoring it as real code.
'''
#plot absolute and relative sortino ratios on a table
import plotly.graph_objects as go

def plot_combined_table(df, title='Combined Sortino Indicators'):
    """
    Plots a combined table with indicators for Sortino differences and integer-based standard deviation categories,
    highlighting the direction of deviations (positive or negative).
    
    Parameters:
        df (pd.DataFrame): Combined DataFrame with 'Ticker', Sortino differences, and integer SD categories.
        title (str): Title of the table.
    """
    # Define color mappings for integer-based deviation categories
    deviation_colors = {
        3: 'lightgreen',   # z > 3
        2: 'yellow',       # 2 < z <= 3
        1: 'orange',       # 1 < z <= 2
        0: 'white',        # -1 < z <= 1
        -1: 'lightblue',   # -2 < z <= -1
        -2: 'coral',       # -3 < z <= -2
        -3: 'lightcoral'   # z <= -3
    }
    
    # Initialize fill colors based on deviation categories and Sortino differences
    fill_colors = []
    for _, row in df.iterrows():
        row_colors = []
        for col in df.columns:
            # Color for Ticker column
            if col == 'Ticker':
                row_colors.append('lightgrey')
            
            # Color for "Relative performance" boolean columns
            elif 'Relative performance' in col:
                if row[col]:
                    row_colors.append('lightgreen')  # Underperforming
                else:
                    row_colors.append('white')       # Overperforming
            
            # Color for integer-based SD deviation columns
            else:
                # Fetch the integer deviation value
                deviation = row[col]
                # Get the color from the dictionary, default to white if not found
                color = deviation_colors.get(deviation, 'white')
                row_colors.append(color)
        fill_colors.append(row_colors)
    
    # Transpose fill_colors to match Plotly's column-wise format
    fill_colors_transposed = list(map(list, zip(*fill_colors)))
    
    # Replace boolean values with descriptive text for Sortino differences
    display_df = df.copy()
    for col in display_df.columns:
        if 'Relative performance' in col:
            display_df[col] = display_df[col].apply(lambda x: 'Underperforming' if x else 'Overperforming')
    
    # Create the Plotly table
    fig = go.Figure(data=[go.Table(
        header=dict(
            values=['<b>' + col.replace('_', ' ') + '</b>' for col in display_df.columns],
            fill_color='paleturquoise',
            align='center',
            font=dict(color='black', size=12)
        ),
        cells=dict(
            values=[display_df[col] for col in display_df.columns],
            fill_color=fill_colors_transposed,
            align='center',
            font=dict(color='black', size=11)
        )
    )])
    
    # Update layout for aesthetics
    fig.update_layout(
        title=title,
        template='plotly_white',
        height=800,
        margin=dict(l=50, r=50, t=80, b=200)  # Increased bottom margin for legend
    )
    
    # Add an updated legend using annotations
    legend_text = (
        "<b>Legend (Integer SD Buckets):</b><br>"
        "3 => z > 3<br>"
        "2 => 2 < z <= 3<br>"
        "1 => 1 < z <= 2<br>"
        "0 => -1 < z <= 1<br>"
        "-1 => -2 < z <= -1<br>"
        "-2 => -3 < z <= -2<br>"
        "-3 => z <= -3<br><br>"
        "<b>Performance Indicators:</b><br>"
        "Light Green: Underperforming<br>"
        "White: Overperforming"
    )
    
    fig.add_annotation(
        text=legend_text,
        showarrow=False,
        xref="paper", yref="paper",
        x=0.5, y=-0.3,
        xanchor='center',
        yanchor='top',
        font=dict(color='black', size=12)
    )
    
    fig.show()

#Compute absolute and relative sortino ratios on a table for etfs
#-----------------------------------------------------------


latest_sortino_differences_indicators = create_sortino_negative_indicators(
    rolling_sortino_ratios_etf_50,
    rolling_sortino_ratios_etf_200
)


sortino_std_deviation_table = create_sortino_std_deviation_table(
    rolling_sortino_ratios_etf_50,
    rolling_sortino_ratios_etf_200
)


price_std_deviation_table = create_price_std_deviation_table(etf_prices, window_sizes=[50, 200])


combined_df = latest_sortino_differences_indicators.merge(
    sortino_std_deviation_table,
    on='Ticker',
    how='left'
).merge(
    price_std_deviation_table,
    on='Ticker',
    how='left'
)

combined_df = combined_df.sort_values(
    by=['Std Dev Direction for 200_Day Sortino Ratio']
).reset_index(drop=True)

plot_combined_table(
    combined_df,
    title='Combined Sortino Indicators'
)

#-----------------------------------------------------------

'''

In [None]:
# NOTE(review): this whole cell is disabled. The body is a single triple-quoted string
# literal, so nothing in it is executed.
# It holds an earlier variant of the z-score tables: two separate go.Table figures
# (50-day and 200-day), each coloured with a symmetric +/-0.5 threshold.
# The live cell that follows builds the same z_score_50 / z_score_200 frames and shows them
# in a single combined table, so this cell looks superseded — confirm before deleting.
'''
z_score_50 = pd.DataFrame()
z_score_200 = pd.DataFrame()

z_score_sortino_ratio_50       = calculate_z_scores(rolling_sortino_ratios_etf_50)
z_score_benchmark_minus_etf_50 = calculate_z_scores(rolling_sortino_ratios_benchmark_minus_etf_50)

z_score_50['50 day Sortino Ratio (z score)'] = z_score_sortino_ratio_50
z_score_50['50 day Benchmark Minus ETF Sortino Ratio (z score)'] = z_score_benchmark_minus_etf_50
z_score_50.sort_values(by='50 day Sortino Ratio (z score)', ascending=True, inplace=True)


z_score_200['200 day Sortino Ratio (z score)'] = calculate_z_scores(rolling_sortino_ratios_etf_200)
z_score_200['200 day Benchmark Minus ETF Sortino Ratio (z score)'] = calculate_z_scores(rolling_sortino_ratios_benchmark_minus_etf_200)
z_score_200.sort_values(by='200 day Sortino Ratio (z score)', ascending=True, inplace=True)

#plot 50 day z scores, truncate decimals to 2
z_score_50 = z_score_50.round(2)
z_score_200 = z_score_200.round(2)


# Create figure for 50-day z-scores with color coding using 0.5 threshold
fig1 = go.Figure(data=[go.Table(
    header=dict(
        values=['Ticker', '50 day Sortino Ratio (z score)', '50 day Benchmark Minus ETF Sortino Ratio (z score)'],
        fill_color='paleturquoise',
        align='center',
        font=dict(size=12)
    ),
    cells=dict(
        values=[z_score_50.index, z_score_50['50 day Sortino Ratio (z score)'], 
                z_score_50['50 day Benchmark Minus ETF Sortino Ratio (z score)']],
        fill_color=[
            'lightgrey',
            z_score_50['50 day Sortino Ratio (z score)'].apply(
                lambda x: 'lightgreen' if x > 0.5 else ('lightcoral' if x < -0.5 else 'white')),
            z_score_50['50 day Benchmark Minus ETF Sortino Ratio (z score)'].apply(
                lambda x: 'lightgreen' if x > 0.5 else ('lightcoral' if x < -0.5 else 'white'))
        ],
        align='center')
)])

fig1.update_layout(
    title='50 Day Z-Scores for Sortino Ratios',
    height=600, 
    margin=dict(l=10, r=10, t=50, b=10)
)
fig1.show()

# Create figure for 200-day z-scores with color coding using 0.5 threshold
fig2 = go.Figure(data=[go.Table(
    header=dict(
        values=['Ticker', '200 day Sortino Ratio (z score)', '200 day Benchmark Minus ETF Sortino Ratio (z score)'],
        fill_color='paleturquoise',
        align='center',
        font=dict(size=12)
    ),
    cells=dict(
        values=[z_score_200.index, z_score_200['200 day Sortino Ratio (z score)'], 
                z_score_200['200 day Benchmark Minus ETF Sortino Ratio (z score)']],
        fill_color=[
            'lightgrey',
            z_score_200['200 day Sortino Ratio (z score)'].apply(
                lambda x: 'lightgreen' if x > 0.5 else ('lightcoral' if x < -0.5 else 'white')),
            z_score_200['200 day Benchmark Minus ETF Sortino Ratio (z score)'].apply(
                lambda x: 'lightgreen' if x > 0.5 else ('lightcoral' if x < -0.5 else 'white'))
        ],
        align='center')
)])

fig2.update_layout(
    title='200 Day Z-Scores for Sortino Ratios',
    height=600,
    margin=dict(l=10, r=10, t=50, b=10)
)
fig2.show()
'''

In [None]:
# Build per-window z-score tables for the rolling Sortino ratios, then merge them.
# NOTE(review): calculate_z_scores and the rolling_sortino_ratios_* inputs are defined in
# earlier cells; presumably each call yields one value per ticker — confirm.

# --- 50-day window ---
z_score_sortino_ratio_50 = calculate_z_scores(rolling_sortino_ratios_etf_50)
z_score_benchmark_minus_etf_50 = calculate_z_scores(rolling_sortino_ratios_benchmark_minus_etf_50)

z_score_50 = pd.DataFrame()
z_score_50['50 day Sortino Ratio (z score)'] = z_score_sortino_ratio_50
z_score_50['50 day Benchmark Minus ETF Sortino Ratio (z score)'] = z_score_benchmark_minus_etf_50
# Order rows by the asset's own z-score, then round to two decimals for display.
z_score_50 = z_score_50.sort_values(
    by='50 day Sortino Ratio (z score)', ascending=True
).round(2)

# --- 200-day window ---
z_score_200 = pd.DataFrame()
z_score_200['200 day Sortino Ratio (z score)'] = calculate_z_scores(rolling_sortino_ratios_etf_200)
z_score_200['200 day Benchmark Minus ETF Sortino Ratio (z score)'] = calculate_z_scores(
    rolling_sortino_ratios_benchmark_minus_etf_200
)
z_score_200 = z_score_200.sort_values(
    by='200 day Sortino Ratio (z score)', ascending=True
).round(2)

# Place both windows side by side (rows are aligned on the index).
z_score_combined = pd.concat([z_score_50, z_score_200], axis=1)

# Column order: the key is (first word, third word) of each label, compared as strings.
# "200" sorts before "50" lexicographically, so the 200-day columns come first, and
# "Benchmark" sorts before "Sortino" within each window.
z_score_combined = z_score_combined.reindex(
    columns=sorted(
        z_score_combined.columns,
        key=lambda label: (label.split()[0], label.split()[2]),
    )
)
# The combined z-score table below is colour-coded with asymmetric thresholds (z > 1 and z < -0.5).


# Create a figure for z_score_combined.
# A dropdown lets the user choose which column the table is sorted by.
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
def plot_z_score_combined(z_score_combined):
    """
    Build an interactive Plotly table of z-scores with a "sort by column" dropdown.

    The table shows the frame's index in a first "Ticker" column followed by every
    column of ``z_score_combined``. It is initially sorted ascending by the first
    column; the dropdown offers one ascending sort per column.

    Cell colouring:
      - columns whose name contains "Benchmark Minus ETF": green when z > 1,
        red when z < -0.5;
      - all other columns: red when z > 1, green when z < -0.5;
      - everything else, including missing values, is white.

    Every dropdown entry carries the complete ``cells`` specification (values, fill
    colours, alignment and number format), because the button replaces the whole
    ``cells`` container and anything left out would fall back to Plotly's defaults.

    Args:
        z_score_combined (pd.DataFrame): One row per ticker (taken from the index) and
            one numeric z-score column per metric. A frame without columns yields a
            ticker-only table with an empty dropdown.

    Returns:
        plotly.graph_objects.Figure: The figure; it is not shown by this function.
    """
    columns = z_score_combined.columns.tolist()

    def get_cell_color(value, column_name):
        """Map a single z-score to its fill colour (see the colouring rules above)."""
        if pd.isna(value):
            return 'white'

        # "Benchmark Minus ETF" columns use the inverse palette of the plain Sortino columns.
        if "Benchmark Minus ETF" in column_name:
            high_color, low_color = 'lightgreen', 'lightcoral'
        else:
            high_color, low_color = 'lightcoral', 'lightgreen'

        if value > 1:
            return high_color
        if value < -0.5:
            return low_color
        return 'white'

    def build_cells(sort_column):
        """Return the full ``cells`` spec for the table sorted ascending by ``sort_column``."""
        if sort_column is None:
            ordered = z_score_combined
        else:
            # Sort once per view and reuse the result for both values and colours.
            ordered = z_score_combined.sort_values(by=sort_column, ascending=True)
        return {
            'values': [ordered.index] + [ordered[c] for c in columns],
            'fill': {
                'color': [
                    'lightgrey',  # Ticker column color
                    *[[get_cell_color(val, c) for val in ordered[c]] for c in columns]
                ]
            },
            'align': 'center',
            # No format for the ticker column, two decimals for every z-score column.
            'format': [None] + ['.2f'] * len(columns),
        }

    fig = go.Figure()
    fig.add_trace(go.Table(
        header=dict(
            values=['Ticker'] + columns,
            fill_color='paleturquoise',
            align='center',
            font=dict(size=12)
        ),
        # The initial view matches the first dropdown entry, which is active by default.
        cells=build_cells(columns[0] if columns else None)
    ))

    # One dropdown entry per column (ascending only).
    buttons = [
        dict(
            args=[{'cells': build_cells(col)}],
            label=f"{col} (Ascending)",
            method="update"
        )
        for col in columns
    ]

    fig.update_layout(
        title='Combined Z-Scores for Sortino Ratios',
        updatemenus=[{
            'buttons': buttons,
            'direction': 'down',
            'showactive': True,
            'x': 0.1,
            'y': 1.15,
            'xanchor': 'left',
            'yanchor': 'top'
        }],
        template='plotly_white',
        height=600,
        margin=dict(l=10, r=10, t=100, b=10)  # Extra top margin leaves room for the dropdown
    )

    # Colour legend, rendered as an annotation above the table.
    legend_text = (
        "Color coding for Asset Sortino Ratio:<br>" +
        "<span style='color:lightcoral'>■</span> z > 1: Significantly above average (potential overvaluation)<br>" +
        "<span style='color:lightgreen'>■</span> z < -0.5: Significantly below average (potential undervaluation)<br>" +
        "<br>Color coding for Benchmark Minus ETF:<br>" +
        "<span style='color:lightgreen'>■</span> z > 1: ETF underperforming benchmark (potential buying opportunity)<br>" +
        "<span style='color:lightcoral'>■</span> z < -0.5: ETF outperforming benchmark (potentially overvalued)<br>"
    )

    fig.add_annotation(
        text=legend_text,
        showarrow=False,
        xref="paper", yref="paper",
        x=1.0, y=1.2,
        xanchor='right',
        yanchor='top',
        font=dict(size=10),
        bgcolor="rgba(255,255,255,0.8)",
        bordercolor="black",
        borderwidth=1
    )

    return fig

# Build the combined z-score table. The function returns the figure without calling show();
# as the last expression of the cell it is presumably rendered by the notebook front end.
plot_z_score_combined(z_score_combined)
