In [None]:
#Notebook description:

# this notebook is used to evaluate the market of assets and find potential assets to invest in
# with the best risk/reward characteristics. This notebook is intended to analyze a broad group of market assets
# and does NOT focus on any particular asset. Computing data for a large number of assets is computationally expensive and thus
# the analysis is relegated to other notebooks


In [None]:
# TODOS:

# 1. Create a table that visualizes every company for each subsector where the columns are subindustries and the rows are companies sorted by market cap



In [None]:
#Load libraries
import logging
logger = logging.getLogger('yfinance')
logger.disabled = True
logger.propagate = False
# Load libraries

import sys
sys.path.append(r"e:\Coding Projects\Investment Analysis")
from Quantapp.Plotter import Plotter
from Quantapp.Computation import Computation, Helper
from Quantapp.EconomicData import EconomicData

import numpy as np
import json
import os
import pandas as pd
import yfinance as yf
from statsmodels.tsa.stattools import coint
from IPython.display import display
from concurrent.futures import ThreadPoolExecutor
from plotly.subplots import make_subplots
from datetime import datetime
import statsmodels.api as sm
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp
import plotly.graph_objects as go
import pandas as pd
import holidays
import plotly.express as px
import concurrent.futures

#shut down warnings
import warnings
warnings.filterwarnings("ignore")

# Look-back windows, expressed in rows of the price series (they are passed to
# pct_change()/rolling() further down).
time_frame_week = 7
time_frame_short = 21
time_frame_mid   = 50
time_frame_long = 365
# Sampling interval and history length handed to yfinance.
interval = '1d'
period     = '10y'

# NOTE(review): this is already a per-day rate, but the rolling Sortino helpers
# in this notebook divide their `risk_free_rate` argument by 252 again - pass the
# annual figure to them, not this value.
# NOTE(review): `risk_free_rate` and `benchmark` are both re-bound to downloaded
# DataFrames by the data-loading cell, so these scalar/str values only exist
# until that cell has run.
risk_free_rate = 0.02 / 252  # Annualized risk-free rate divided by trading days
benchmark = 'SPY'

# Shared instances of the project helper classes (imported from Quantapp above).
qc = Computation()
qp = Plotter()
qe = EconomicData()
helper = Helper()

                

In [None]:
#Load custom functions

def create_sortino_negative_indicators(sortino_diff_50, sortino_diff_200):
    """
    Flag, per ticker, whether the most recent Sortino difference is positive.

    Only the last row of each input is used. A flag is True when that latest
    value is strictly greater than zero. The output column labels describe the
    inputs as "Benchmark - asset" differences.

    Parameters:
        sortino_diff_50 (pd.DataFrame): 50-day Sortino differences, one column per ticker.
        sortino_diff_200 (pd.DataFrame): 200-day Sortino differences, one column per ticker.

    Returns:
        pd.DataFrame: one row per ticker with the columns
            'Ticker',
            'Relative performance: 50 Day Sortino (Benchmark - asset)',
            'Relative performance: 200 Day Sortino (Benchmark - asset)'.
    """
    flag_50 = sortino_diff_50.iloc[-1] > 0
    flag_200 = sortino_diff_200.iloc[-1] > 0

    # The ticker labels live in the index of the two flag Series; reset_index
    # turns them into the first column before the final headers are applied.
    flags = pd.DataFrame({'50_Day': flag_50, '200_Day': flag_200}).reset_index()
    flags.columns = [
        'Ticker',
        'Relative performance: 50 Day Sortino (Benchmark - asset)',
        'Relative performance: 200 Day Sortino (Benchmark - asset)',
    ]
    return flags

def create_sortino_std_deviation_table(rolling_sortino_ratios_50, rolling_sortino_ratios_200):
    """
    Bucket each ticker's latest rolling Sortino value by its z-score against
    that ticker's own full-history mean and standard deviation.

    Integer categories (based on z-score):
      3  => z > 3
      2  => 2 < z <= 3
      1  => 1 < z <= 2
      0  => -1 < z <= 1
     -1  => -2 < z <= -1
     -2  => -3 < z <= -2
     -3  => z <= -3

    A z-score that cannot be computed (NaN latest value, zero or NaN standard
    deviation, or a ticker missing from the 200-day frame) is reported as the
    string 'Insufficient Data', the same marker create_price_std_deviation_table
    uses, instead of being silently bucketed as -3.

    Parameters:
        rolling_sortino_ratios_50 (pd.DataFrame): 50-day rolling values, one column per ticker.
            Its columns define the rows (and row order) of the result.
        rolling_sortino_ratios_200 (pd.DataFrame): 200-day rolling values, one column per ticker.

    Returns:
        pd.DataFrame: columns 'Ticker',
            'Std Dev Direction for 50_Day Sortino Ratio' and
            'Std Dev Direction for 200_Day Sortino Ratio'.
    """
    windows = {
        '50_Day': rolling_sortino_ratios_50,
        '200_Day': rolling_sortino_ratios_200
    }

    def categorize_z(z):
        # NaN compares False against every threshold, so without this guard it
        # would fall through to the final branch and look like an extreme low.
        if pd.isna(z):
            return 'Insufficient Data'
        if z > 3:
            return 3
        elif z > 2:
            return 2
        elif z > 1:
            return 1
        elif z > -1:  # covers -1 < z <= 1
            return 0
        elif z > -2:
            return -1
        elif z > -3:
            return -2
        else:
            return -3

    tickers = rolling_sortino_ratios_50.columns
    deviation_data = {'Ticker': tickers.tolist()}

    for window, df in windows.items():
        z_scores = (df.iloc[-1] - df.mean()) / df.std()

        # Pair values with tickers by label, not by position, so a differently
        # ordered (or incomplete) 200-day frame cannot mislabel rows.
        if not z_scores.index.equals(tickers):
            z_scores = z_scores.reindex(tickers)

        deviation_data[f'Std Dev Direction for {window} Sortino Ratio'] = [
            categorize_z(z) for z in z_scores
        ]

    return pd.DataFrame(deviation_data)

def create_price_std_deviation_table(price_data, window_sizes=[21, 50, 200]):
    """
    Bucket each ticker's latest price by its z-score against the rolling mean
    and rolling standard deviation of the most recent window.

    Integer categories (based on z-score):
      3  => z > 3
      2  => 2 < z <= 3
      1  => 1 < z <= 2
      0  => -1 < z <= 1
     -1  => -2 < z <= -1
     -2  => -3 < z <= -2
     -3  => z <= -3

    A cell holds the string 'Insufficient Data' when the ticker has fewer
    non-NaN prices than the window, or when the latest rolling standard
    deviation is zero or NaN.

    Parameters:
        price_data (pd.DataFrame): prices, one column per ticker.
        window_sizes (list[int]): rolling window lengths; one output column each.

    Returns:
        pd.DataFrame: 'Ticker' plus one 'Std Dev Direction for {window}_Day Price'
        column per window.
    """
    # (exclusive lower bound, bucket) pairs checked from the top bucket down.
    lower_bounds = ((3, 3), (2, 2), (1, 1), (-1, 0), (-2, -1), (-3, -2))

    def _bucket(z):
        for bound, label in lower_bounds:
            if z > bound:
                return label
        return -3

    def _classify(prices, span):
        if len(prices) < span:
            return 'Insufficient Data'
        mean_now = prices.rolling(window=span).mean().iloc[-1]
        sigma_now = prices.rolling(window=span).std().iloc[-1]
        if sigma_now == 0 or pd.isna(sigma_now):
            return 'Insufficient Data'
        return _bucket((prices.iloc[-1] - mean_now) / sigma_now)

    names = price_data.columns
    result = {'Ticker': names.tolist()}
    for span in window_sizes:
        result[f'Std Dev Direction for {span}_Day Price'] = [
            _classify(price_data[name].dropna(), span) for name in names
        ]

    return pd.DataFrame(result)

def plot_combined_table(df, title='Combined Sortino Indicators'):
    """
    Render a colour-coded Plotly table of performance flags and deviation buckets.

    Column handling:
      * 'Ticker'                         -> light grey cells.
      * any column whose name contains
        'Relative performance'           -> truthy cells are light green and shown as
                                            'Underperforming', others white / 'Overperforming'.
      * every other column               -> treated as a deviation category. Both the integer
                                            buckets (-3..3) produced by the std-deviation table
                                            helpers and the older string labels ('>+3 SD',
                                            '+2-3 SD', ...) are coloured; anything else
                                            (e.g. 'Insufficient Data', NaN) stays white.

    Parameters:
        df (pd.DataFrame): Combined DataFrame with 'Ticker', relative-performance flags and
            deviation categories.
        title (str): Title of the table.

    Returns:
        None. The figure is displayed with fig.show().
    """
    # One lookup for both representations of a deviation category.
    deviation_colors = {
        3: 'lightgreen', '>+3 SD': 'lightgreen',
        2: 'yellow', '+2-3 SD': 'yellow',
        1: 'orange', '+1-2 SD': 'orange',
        0: 'white', '+<1 SD': 'white', '-<1 SD': 'white',
        -1: 'lightblue', '-1-2 SD': 'lightblue',
        -2: 'coral', '-2-3 SD': 'coral',
        -3: 'lightcoral', '<-3 SD': 'lightcoral',
    }

    def _deviation_color(value):
        try:
            return deviation_colors.get(value, 'white')
        except TypeError:
            # Unhashable cell content: nothing sensible to colour.
            return 'white'

    # Plotly expects one colour list per column, so build them column-wise.
    fill_colors = []
    for col in df.columns:
        if col == 'Ticker':
            column_colors = ['lightgrey'] * len(df)
        elif 'Relative performance' in col:
            # True marks an underperforming asset (see the display text below).
            column_colors = ['lightgreen' if flag else 'white' for flag in df[col]]
        else:
            column_colors = [_deviation_color(value) for value in df[col]]
        fill_colors.append(column_colors)

    # Replace boolean values with descriptive text for the performance flags
    display_df = df.copy()
    for col in df.columns:
        if 'Relative performance' in col:
            display_df[col] = display_df[col].apply(lambda x: 'Underperforming' if x else 'Overperforming')

    # Create the Plotly table
    fig = go.Figure(data=[go.Table(
        header=dict(
            values=['<b>' + col.replace('_', ' ') + '</b>' for col in display_df.columns],
            fill_color='paleturquoise',
            align='center',
            font=dict(color='black', size=12)
        ),
        cells=dict(
            values=[display_df[col] for col in display_df.columns],
            fill_color=fill_colors,
            align='center',
            font=dict(color='black', size=11)
        )
    )])

    fig.update_layout(
        title=title,
        template='plotly_white',
        height=800,
        margin=dict(l=50, r=50, t=80, b=200)  # Increased bottom margin for legend
    )

    # The legend is a plain annotation placed below the table.
    legend_text = (
        "<b>Legend:</b><br>"
        "<b>Deviation Directions:</b><br>"
        "Light Green: >+3 SD (Significantly Above Mean)<br>"
        "Yellow: +2-3 SD (Above Mean)<br>"
        "Orange: +1-2 SD (Slightly Above Mean)<br>"
        "Light Coral: <-3 SD (Significantly Below Mean)<br>"
        "Coral: -2-3 SD (Below Mean)<br>"
        "Light Blue: -1-2 SD (Slightly Below Mean)<br>"
        "White: Within 1 SD<br><br>"
        "<b>Performance Indicators:</b><br>"
        "Light Green: Underperforming<br>"
        "White: Overperforming"
    )

    fig.add_annotation(
        text=legend_text,
        showarrow=False,
        xref="paper", yref="paper",
        x=0.5, y=-0.3,
        xanchor='center',
        yanchor='top',
        font=dict(color='black', size=12)
    )

    fig.show()

def compute_rolling_sortino_ratios(df, n, risk_free_rate=0.0):
    """
    Rolling n-row Sortino-style ratio for every column of a price DataFrame.

    Steps, per column:
      1. simple returns via pct_change();
      2. excess return = return - risk_free_rate / 252;
      3. numerator   = rolling mean of the excess returns over n rows;
      4. denominator = square root of the mean squared excess return, taken over
         only the negative excess returns inside the window.

    A window that contains no negative excess return has no denominator and
    therefore yields NaN. The ratio is not annualised.

    Parameters:
        df (pd.DataFrame): prices, one column per ticker.
        n (int): rolling window length in rows.
        risk_free_rate (float): rate that is divided by 252 before being
            subtracted from each return (default 0.0).

    Returns:
        pd.DataFrame: same shape as df, holding the rolling ratios.
    """
    def _downside_rms(window_values):
        losses = window_values[window_values < 0]
        return np.sqrt((losses ** 2).mean())

    daily_excess = df.pct_change() - risk_free_rate / 252

    mean_excess = daily_excess.rolling(window=n).mean()
    downside = daily_excess.rolling(window=n).apply(_downside_rms, raw=False)

    return mean_excess / downside

def compute_rolling_sortino_ratios_benchmark_minus_asset(df,benchmark_ticker, n, risk_free_rate=0.0):
    """
    Rolling n-row Sortino-style ratio of the benchmark minus that of each column.

    The per-column ratio is computed exactly as in compute_rolling_sortino_ratios
    (rolling mean of excess returns divided by the root-mean-square of the
    negative excess returns in the window). The benchmark column's ratio is then
    subtracted row-wise and the sign flipped, giving benchmark - asset. A
    positive value therefore means the benchmark's ratio is the higher one; the
    benchmark's own column is zero wherever its ratio is defined.

    Parameters:
        df (pd.DataFrame): prices, one column per ticker; must contain benchmark_ticker.
        benchmark_ticker (str): column of df used as the benchmark.
        n (int): rolling window length in rows.
        risk_free_rate (float): rate that is divided by 252 before being
            subtracted from each return (default 0.0).

    Returns:
        pd.DataFrame: same shape and column labels as df, holding
        benchmark ratio - column ratio.
    """
    def _downside_rms(window_values):
        losses = window_values[window_values < 0]
        return np.sqrt((losses ** 2).mean())

    daily_excess = df.pct_change() - risk_free_rate / 252

    mean_excess = daily_excess.rolling(window=n).mean()
    downside = daily_excess.rolling(window=n).apply(_downside_rms, raw=False)
    ratios = mean_excess / downside

    # asset - benchmark first, then negate to obtain benchmark - asset.
    asset_minus_benchmark = ratios.sub(ratios[benchmark_ticker], axis=0)
    return -asset_minus_benchmark


def plot_time_series(all_series, time_frame='1y', title='Time Series Data of Ticker Symbols'):
    """
    Build a Plotly line chart of the trailing part of a time-indexed DataFrame.

    Parameters:
        all_series (pd.DataFrame): one column per ticker, indexed by date.
        time_frame (str): how far back to plot, counted from the last index
            value: '1y', '3y', '5y' or '10y'.
        title (str): chart title.

    Returns:
        The Plotly figure (dark template, y-axis labelled 'Price', dashed red
        horizontal line at y=0), or None after printing an error when
        time_frame is not one of the supported values.
    """
    supported = (('1y', 1), ('3y', 3), ('5y', 5), ('10y', 10))
    years_back = next((years for label, years in supported if time_frame == label), None)

    if years_back is None:
        # Error handling for invalid time frame
        print("Error: Invalid time frame")
        return

    window_start = all_series.index[-1] - pd.DateOffset(years=years_back)
    all_series = all_series.loc[all_series.index >= window_start]

    fig = px.line(all_series, title=title)

    # Reference line at zero
    fig.add_hline(y=0, line_dash='dash', line_color='red')

    x_axis = dict(tickangle=-45, showgrid=True, zeroline=True)
    y_axis = dict(showgrid=True, zeroline=True)
    fig.update_layout(
        xaxis_title='Date',
        yaxis_title='Price',
        template='plotly_dark',
        xaxis=x_axis,
        yaxis=y_axis
    )

    return fig
    
def generate_series(tickers, columns=['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume'], period=period, interval=interval):
    """
    Download data with yfinance and return the requested fields.

    Every '.' in a requested ticker is replaced with '-' before the download.
    On the way back, only the dots the caller actually supplied are restored,
    so a symbol that was requested with a hyphen keeps its hyphen.

    Parameters:
    - tickers: a single ticker symbol or any iterable of ticker symbols.
    - columns: a single field name or a list of field names
      (default: 'Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume').
    - period: history length passed to yfinance (default: the module-level `period`).
    - interval: sampling interval passed to yfinance (default: the module-level `interval`).

    Returns:
    - one ticker, one field:      pd.Series named after the ticker;
    - one ticker, many fields:    pd.DataFrame with the fields as columns;
    - many tickers, one field:    pd.DataFrame with the tickers as columns;
    - many tickers, many fields:  pd.DataFrame with (ticker, field) MultiIndex columns;
    - an empty pd.DataFrame when the download raises or a requested field is
      not present in the downloaded data (a message is printed in both cases).
    """
    # Ensure tickers and columns are lists
    tickers = [tickers] if isinstance(tickers, str) else list(tickers)
    columns = [columns] if isinstance(columns, str) else list(columns)

    yahoo_symbols = [ticker.replace('.', '-') for ticker in tickers]
    requested_by_symbol = {symbol.upper(): ticker for symbol, ticker in zip(yahoo_symbols, tickers)}

    def _restore_label(symbol):
        # Put a '.' back only where the caller's spelling had one.
        symbol = str(symbol)
        requested = requested_by_symbol.get(symbol.upper())
        if requested is None or len(requested) != len(symbol):
            return symbol.replace('-', '.')
        return ''.join('.' if wanted == '.' else got for got, wanted in zip(symbol, requested))

    try:
        df = yf.download(yahoo_symbols, period=period, interval=interval, progress=False)
    except Exception as e:
        print(f"An error occurred while fetching data: {e}")
        return pd.DataFrame()

    # Check if the specified columns exist in the DataFrame
    available = df.columns.get_level_values(0)
    missing_columns = [col for col in columns if col not in available]
    if missing_columns:
        print(f"Error: The following columns are not available: {missing_columns}")
        print(f"Possible columns are: {available.unique().tolist()}")
        return pd.DataFrame()

    df = df[columns]
    has_ticker_level = isinstance(df.columns, pd.MultiIndex)

    if len(tickers) == 1:
        if has_ticker_level:
            # The result can carry a (field, ticker) column MultiIndex even for a
            # single symbol; drop the ticker level so the columns are plain field names.
            df = df.copy()
            df.columns = df.columns.get_level_values(0)
        if len(columns) == 1:
            return df[columns[0]].rename(_restore_label(yahoo_symbols[0]))
        return df

    if len(columns) == 1:
        # One field for many tickers: tickers become the column labels
        df = df[columns[0]]
        df.columns = [_restore_label(col) for col in df.columns]
        return df

    # Many tickers and many fields: reorder the levels to (ticker, field).
    # NOTE(review): ticker labels in this branch stay in their downloaded '-' form,
    # as before - confirm whether callers expect the '.' spelling here as well.
    if has_ticker_level:
        df.columns = pd.MultiIndex.from_tuples([(col[1], col[0]) for col in df.columns.values])
    else:
        df.columns = pd.MultiIndex.from_tuples([(col.split('.')[0], col.split('.')[1]) for col in df.columns])

    return df

def filter_assets_by_positive_spread_std(asset_spreads):
    """
    Test whether the latest spread sits at least one standard deviation above
    the mean of the non-negative spreads.

    Only values >= 0 contribute to the mean and standard deviation; the last
    element of asset_spreads (whatever its sign) is what gets compared.

    Parameters:
        asset_spreads: pandas Series (or DataFrame) of spreads ordered in time.

    Returns:
        The result of `latest >= mean + std`: a boolean for a Series input,
        a boolean Series (one entry per column) for a DataFrame input.
    """
    non_negative = asset_spreads[asset_spreads >= 0]
    cutoff = non_negative.mean() + non_negative.std()
    return asset_spreads.iloc[-1] >= cutoff

def filter_assets_below_negative_std(asset_spreads, std_multiplier=0.75):
    """
    Flag spreads that lie well below the typical negative spread.

    The threshold is `mean - std_multiplier * std`, where mean and std are
    computed over the negative values only. Each element of the input is then
    compared against that threshold.

    Parameters:
        asset_spreads (pd.Series): spreads to evaluate.
        std_multiplier (float): number of standard deviations below the mean of
            the negative spreads that marks the threshold (default 0.75).

    Returns:
        pd.Series of bool with the same index as asset_spreads. When the input
        has no negative value, every entry is False, so the result can always
        be used directly as a mask on the input.

    Raises:
        TypeError: if asset_spreads is not a pandas Series.
    """
    if not isinstance(asset_spreads, pd.Series):
        raise TypeError("asset_spreads must be a pandas Series")

    negative_spreads = asset_spreads[asset_spreads < 0]
    if negative_spreads.empty:
        # Nothing is negative, so nothing can be below a negative threshold.
        return pd.Series(False, index=asset_spreads.index, dtype=bool)

    threshold_negative = negative_spreads.mean() - std_multiplier * negative_spreads.std()
    return asset_spreads < threshold_negative

def get_sector_info(ticker):
    """
    Look up the sector and industry yfinance reports for a ticker.

    Best effort: any exception raised while fetching or reading the info
    mapping is swallowed and 'N/A' is returned for both fields.

    Parameters:
        ticker (str): ticker symbol passed to yf.Ticker.

    Returns:
        dict: {'Ticker': ticker, 'Sector': ..., 'Sub-Industry': ...}, where
        'Sector' comes from the info key 'sector' and 'Sub-Industry' from the
        info key 'industry' ('N/A' when a key is absent).
    """
    try:
        # Read the info mapping once and take both fields from it.
        info = yf.Ticker(ticker).info
        return {
            'Ticker': ticker,
            'Sector': info.get('sector', 'N/A'),
            'Sub-Industry': info.get('industry', 'N/A'),
        }
    except Exception:
        return {'Ticker': ticker, 'Sector': 'N/A', 'Sub-Industry': 'N/A'}

def fetch_ticker_info(ticker):
    """
    Debug helper: print the sector summary and the full yfinance info mapping
    for a ticker. Returns None.

    Parameters:
        ticker (str): ticker symbol to inspect.
    """
    sector_summary = get_sector_info(ticker)
    print(sector_summary)

    full_info = yf.Ticker(ticker).info
    print(full_info)
    # Returning (sector, sub-industry, market cap) was sketched here but is disabled:
    #market_cap = yf.Ticker(ticker).info.get('marketCap')
    #return info['Sector'], info['Sub-Industry'], market_cap

def remove_weekends_and_holidays(df, country='US'):
    """
    Removes weekend and holiday rows from a DataFrame with a DateTime index.

    Holidays are the public holidays of `country` as provided by the `holidays`
    package, generated explicitly for every calendar year present in the index.
    NOTE(review): public holidays are not necessarily exchange holidays -
    confirm this calendar is the one wanted for market data.

    Parameters:
        df (pd.DataFrame): DataFrame with DateTime index (tz-naive or tz-aware).
        country (str): Country code for holidays. Default is 'US'.

    Returns:
        pd.DataFrame: DataFrame without weekend and holiday data.

    Raises:
        TypeError: if the index of df is not a DatetimeIndex.
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        raise TypeError("DataFrame index must be a DateTimeIndex")

    # Remove weekends
    df_weekdays = df[df.index.dayofweek < 5]

    # Compare on tz-naive calendar dates: holiday dates carry no timezone.
    calendar_dates = df_weekdays.index.normalize()
    if calendar_dates.tz is not None:
        calendar_dates = calendar_dates.tz_localize(None)

    # A holiday calendar created without `years` starts out empty and is only
    # filled on individual date look-ups, so the years have to be requested
    # up front for a bulk membership test to see any holidays.
    years = sorted({int(year) for year in calendar_dates.year})
    make_calendar = getattr(holidays, 'country_holidays', None) or holidays.CountryHoliday
    country_holidays = make_calendar(country, years=years)
    holiday_dates = pd.to_datetime(list(country_holidays.keys()))

    # Remove holidays
    df_clean = df_weekdays[~calendar_dates.isin(holiday_dates)]

    return df_clean

def create_and_concat_spreads(dataframes, benchmark_series, time_frame, mode):
    """
    Run qc.create_spreads on each DataFrame and join the results side by side.

    Parameters:
        dataframes: iterable of DataFrames, each passed as the first argument
            of qc.create_spreads.
        benchmark_series: forwarded unchanged as the second argument.
        time_frame: forwarded as the `time_frame` keyword.
        mode: forwarded as the `mode` keyword.

    Returns:
        pd.DataFrame: column-wise concatenation of the individual results; when
        a column label occurs more than once only its first occurrence is kept.
    """
    per_frame_spreads = []
    for frame in dataframes:
        spreads = qc.create_spreads(frame, benchmark_series, time_frame=time_frame, mode=mode)
        per_frame_spreads.append(spreads)

    merged = pd.concat(per_frame_spreads, axis=1)
    first_occurrence = ~merged.columns.duplicated()
    return merged.loc[:, first_occurrence]


In [None]:
#Set parameters
# Sector to analyse; must be one of the names the data-loading cell compares against.
selected_sector = 'Healthcare'
# Ticker whose price history is downloaded as the benchmark.
benchmark_str = 'SPY'


In [None]:
#load data
import yfinance as yf
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Download the benchmark history and the '^IRX' series used as risk-free rate data.
# NOTE(review): `benchmark` and `risk_free_rate` were a ticker string and a float
# in the configuration cell; from here on both names hold downloaded DataFrames.
benchmark = yf.download(benchmark_str, period=period, interval=interval, progress=False)
risk_free_rate = yf.download('^IRX', period=period, interval=interval, progress=False)
market_assets = qe.get_market_assets()
sp500_companies = qe.retrieve_market_data()['SP500']

# selected_sector -> key of the holdings list inside market_assets
holdings_key_by_selection = {
    "S&P 500": 'SPY_HOLDINGS',
    "NASDAQ 100": 'QQQ_HOLDINGS',
    "DOW JONES": 'DIA_HOLDINGS',
    "Communications": 'XLC_HOLDINGS',
    "Consumer Staples": 'XLP_HOLDINGS',
    "Consumer discretionary": 'XLY_HOLDINGS',
    "Energy": 'XLE_HOLDINGS',
    "Financial": 'XLF_HOLDINGS',
    "Healthcare": 'XLV_HOLDINGS',
    "Industrial": 'XLI_HOLDINGS',
    "Information Technology": 'XLK_HOLDINGS',
    "Materials": 'XLB_HOLDINGS',
    "Real Estate": 'XLRE_HOLDINGS',
    "Utilities": 'XLU_HOLDINGS',
}

# Closing prices of the selected group's holdings; None for an unknown selection
holdings_key = holdings_key_by_selection.get(selected_sector)
if holdings_key is None:
    holdings_df = None
else:
    holdings_df = generate_series(market_assets[holdings_key], columns=['Close'])

# Companies of the selected sector, sorted by sub-industry.
# .copy() makes this an independent frame, so adding the 'Market Cap' column
# below never acts on a view of sp500_companies.
sector_companies = sp500_companies[sp500_companies['Sector'] == selected_sector].sort_values('Sub-Industry').copy()
sector_prices= generate_series(sector_companies['Symbol'], period=period, interval=interval, columns=['Close'])

# Percentage change over each look-back window (in rows)
sector_returns_short = sector_prices.pct_change(time_frame_short)
sector_returns_mid = sector_prices.pct_change(time_frame_mid)
sector_returns_long = sector_prices.pct_change(time_frame_long)


# Define the list of symbols
symbols = sector_companies['Symbol'].tolist()

# Query Yahoo with the same '.' -> '-' spelling generate_series uses, so dotted
# share-class symbols resolve here as well. yf.Tickers keys its members by the
# upper-cased symbol, hence the .upper().
yahoo_symbol_for = {symbol: symbol.replace('.', '-').upper() for symbol in symbols}

# Initialize a dictionary to store market caps (keyed by the original symbol)
market_caps = {}

# Use yfinance's Tickers class to fetch multiple tickers at once
tickers = yf.Tickers(' '.join(yahoo_symbol_for.values()))

# Retrieve market cap for each symbol; a failed lookup is reported and stored as None
for symbol in symbols:
    try:
        info = tickers.tickers[yahoo_symbol_for[symbol]].info
        market_caps[symbol] = info.get('marketCap', None)
    except Exception as e:
        print(f"Error retrieving market cap for {symbol}: {e}")
        market_caps[symbol] = None

# Add the Market Cap to the companies DataFrame
sector_companies['Market Cap'] = sector_companies['Symbol'].map(market_caps)

# Optional: Handle missing market caps
missing_market_cap = sector_companies['Market Cap'].isnull().sum()
if missing_market_cap > 0:
    print(f"Warning: {missing_market_cap} companies have missing market cap data.")

# All sector names present in the S&P 500 table
sectors = sp500_companies['Sector'].unique()

# Multi-horizon percentage changes for every company, grouped by sub-industry.
# Result: sub_industry_returns[sub_industry] -> {'Short' | 'Mid' | 'Long': DataFrame}.
# NOTE(review): the later "dictionary where the key is the sub-industry" cell resets
# sub_industry_returns to {} and rebuilds it (with more keys) from the same
# period='max' downloads, so this loop looks redundant - confirm before removing.
sub_industry_returns = {}
for sub_industry in sector_companies['Sub-Industry'].unique():
    sub_industry_companies = sector_companies[sector_companies['Sub-Industry'] == sub_industry]
    
    # Using yf.Tickers instead of generate_series
    tickers = yf.Tickers(' '.join(sub_industry_companies['Symbol'].tolist()))
    # One 'Close' column per company, full available history
    sub_industry_prices = pd.DataFrame({
        ticker: tickers.tickers[ticker].history(period='max')['Close']
        for ticker in sub_industry_companies['Symbol']
    })
    
    # Percentage change over each look-back window (in rows)
    sub_industry_returns_short = sub_industry_prices.pct_change(time_frame_short)
    sub_industry_returns_mid = sub_industry_prices.pct_change(time_frame_mid)
    sub_industry_returns_long = sub_industry_prices.pct_change(time_frame_long)
    sub_industry_returns[sub_industry] = {
        'Short': sub_industry_returns_short,
        'Mid': sub_industry_returns_mid,
        'Long': sub_industry_returns_long
    }


In [None]:
# Visualize the companies by sub-industry

# sector_companies columns are ['Symbol', 'Sector', 'Sub-Industry', 'Market Cap']

# Map each sub-industry to its companies, largest market cap first
sub_industry_companies = {
    sub_industry: sector_companies[sector_companies['Sub-Industry'] == sub_industry]
        .sort_values('Market Cap', ascending=False)
    for sub_industry in sector_companies['Sub-Industry'].unique()
}

# Ticker symbols per sub-industry, sub-industries in alphabetical order
table_dict = {
    sub_industry: sub_industry_companies[sub_industry]['Symbol'].tolist()
    for sub_industry in sorted(sub_industry_companies.keys())
}

# Longest column determines the number of rows (0 when nothing was selected)
max_companies = max((len(tickers) for tickers in table_dict.values()), default=0)

# One column per sub-industry. Building the frame from Series pads the shorter
# columns with NaN in one step; the previous `table[col][:n] = values` form was a
# chained assignment, which pandas does not guarantee to write through to `table`.
table = pd.DataFrame(
    {sub_industry: pd.Series(tickers, dtype=object) for sub_industry, tickers in table_dict.items()},
    index=range(max_companies),
    columns=list(table_dict.keys())
)

# Replace NaN values with empty strings
table = table.fillna('')

# Display the table
fig = go.Figure(data=[go.Table(
    header=dict(
        values=['<b>' + col + '</b>' for col in table.columns],
        fill_color='paleturquoise',
        align='center'
    ),
    cells=dict(
        values=[table[col].tolist() for col in table.columns],
        fill_color='lavender',
        align='center'
    )
)])

fig.update_layout(
    title="Companies by Sub-Industry (Sorted by Market Cap)",
    height=500
)

fig.show()



In [None]:
#Pie Chart for Market Cap Analysis

# Aggregate market caps by Sub-Industry for the second pie chart
# (one row per sub-industry: 'Sub-Industry', summed 'Market Cap')
agg_market_cap = sector_companies.groupby('Sub-Industry')['Market Cap'].sum().reset_index()

# Create subplots: 1 row, 2 columns
# Both cells use the 'domain' subplot type; the go.Pie traces are added to them below.
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{'type':'domain'}, {'type':'domain'}]],
    subplot_titles=("Market Cap Distribution", "Aggregate Market Cap by Sub-Industry"),
    horizontal_spacing=0.1
)

# Function to prepare data based on selection
def prepare_pie_data(sub_industry):
    """
    Collect the labels, values and title for the per-company market-cap pie.

    Reads the module-level `sector_companies` frame.

    Parameters:
        sub_industry (str): a 'Sub-Industry' value to filter on, or 'All' to
            use every company.

    Returns:
        tuple: (labels, values, title). labels is the 'Symbol' column and values
        the 'Market Cap' column of the selected rows; when no row is selected
        the placeholders ['No Data'] and [1] are returned instead.
    """
    show_everything = sub_industry == 'All'

    if show_everything:
        selection = sector_companies
        title = "Market Cap Distribution: All"
    else:
        matches = sector_companies['Sub-Industry'] == sub_industry
        selection = sector_companies[matches]
        title = f"Market Cap Distribution: {sub_industry}"

    # Placeholder slice so the pie still has something to draw
    if selection.empty:
        return ['No Data'], [1], title

    return selection['Symbol'], selection['Market Cap'], title

# Initial data for 'All'
# (initial_title is returned by prepare_pie_data but not used below)
initial_labels, initial_values, initial_title = prepare_pie_data('All')

# Add the first pie chart (Market Cap Distribution)
# This is trace index 0 - the one the dropdown buttons restyle.
fig.add_trace(
    go.Pie(
        labels=initial_labels,
        values=initial_values,
        name='Market Cap Distribution',
        textinfo='label+percent',
        hoverinfo='label+value+percent'
    ),
    row=1, col=1
)

# Add the second pie chart (Aggregate Market Cap by Sub-Industry)
# This trace is never touched by the dropdown.
fig.add_trace(
    go.Pie(
        labels=agg_market_cap['Sub-Industry'],
        values=agg_market_cap['Market Cap'],
        name='Aggregate Market Cap by Sub-Industry',
        textinfo='label+percent',
        hoverinfo='label+value+percent',
        marker=dict(colors=px.colors.qualitative.Pastel)
    ),
    row=1, col=2
)

# Create dropdown buttons for filtering the first pie chart
# 'All' first, then every non-null sub-industry in alphabetical order
sub_industries = ['All'] + sorted(sector_companies['Sub-Industry'].dropna().unique().tolist())

buttons = []
for sub_industry in sub_industries:
    # The data for every option is computed up front and embedded in its button
    # (the returned title is not used).
    labels, values, title = prepare_pie_data(sub_industry)
    
    # Update the first pie chart only (trace index 0)
    button = dict(
        method="restyle",
        label=sub_industry,
        args=[
            {
                # Wrapped in a list: one entry per trace being restyled
                "labels": [labels],
                "values": [values],
            },
            [0]  # Trace index 0 corresponds to the first pie chart
        ]
    )
    buttons.append(button)

# Add dropdown menu to the top right corner of the chart
fig.update_layout(
    updatemenus=[
        dict(
            active=0,
            buttons=buttons,
            x=1,             # Set x to 1 for right alignment
            y=1,             # Set y to 1 for top alignment
            xanchor='right', # Anchor the dropdown to the right
            yanchor='top',   # Anchor the dropdown to the top
            showactive=True,
            direction="down",
            pad={"r": 10, "t": 10},
            bgcolor='rgba(255, 255, 255, 0)', # Transparent background
            bordercolor='rgba(0,0,0,0)'        # Transparent border
        )
    ],
    title_text="Market Cap Analysis",
    height=600
)

# Customize each pie chart
# Applied to both pie traces: label and one-decimal percentage drawn inside the slices.
fig.update_traces(textposition='inside', texttemplate='%{label}: %{percent:.1%}',
                  selector=dict(type='pie'))

# Show the figure
fig.show()

In [None]:
# Create a dictionary where the key is the sub-industry and the value is a dict of
# return and rolling Sharpe-style metrics for the companies of that sub-industry.

def _rolling_sharpe(daily_returns, window):
    """Rolling mean / rolling std of the returns; infinite and missing results become 0."""
    rolling = daily_returns.rolling(window=window)
    ratio = rolling.mean() / rolling.std()
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(0)


sub_industry_returns = {}
for sub_industry in sector_companies['Sub-Industry'].unique():
    sub_industry_companies = sector_companies[sector_companies['Sub-Industry'] == sub_industry]

    # Using yf.Tickers to fetch data for the current sub-industry tickers
    tickers = yf.Tickers(' '.join(sub_industry_companies['Symbol'].tolist()))
    sub_industry_prices = pd.DataFrame({
        ticker: tickers.tickers[ticker].history(period='max')['Close']
        for ticker in sub_industry_companies['Symbol']
        if ticker in tickers.tickers
    })

    # Strip the timezone once, on the prices, so every frame derived below
    # (returns, averages, Sharpe ratios) shares the same tz-naive index and can
    # be combined with the tz-naive sector data without a join error.
    if isinstance(sub_industry_prices.index, pd.DatetimeIndex) and sub_industry_prices.index.tz is not None:
        sub_industry_prices.index = sub_industry_prices.index.tz_localize(None)

    # Ensure that all tickers have been fetched successfully
    fetched_tickers = sub_industry_prices.columns.tolist()
    missing_tickers = set(sub_industry_companies['Symbol']) - set(fetched_tickers)
    if missing_tickers:
        print(f"Warning: Missing data for tickers: {missing_tickers}")

    # Calculate percentage change for different time frames
    returns       = sub_industry_prices.pct_change()
    sub_industry_returns_short = sub_industry_prices.pct_change(time_frame_short)
    sub_industry_returns_mid = sub_industry_prices.pct_change(time_frame_mid)
    sub_industry_returns_long = sub_industry_prices.pct_change(time_frame_long)

    # Ensure returns index is a DatetimeIndex
    if not isinstance(returns.index, pd.DatetimeIndex):
        returns.index = pd.to_datetime(returns.index)
        print('Converted index to datetime')
    # Handle timezone inconsistencies if present
    if returns.index.tz is not None:
        returns.index = returns.index.tz_localize(None)

    returns = helper.simplify_datetime_index(returns).copy()

    # Rolling mean/std ratio of the one-row returns for each look-back window
    sharpe_ratio_short = _rolling_sharpe(returns, time_frame_short)
    sharpe_ratio_mid = _rolling_sharpe(returns, time_frame_mid)
    sharpe_ratio_long = _rolling_sharpe(returns, time_frame_long)

    # Simple (equal-weight) average across the companies for each time frame
    short_average = sub_industry_returns_short.mean(axis=1)
    mid_average = sub_industry_returns_mid.mean(axis=1)
    long_average = sub_industry_returns_long.mean(axis=1)

    sub_industry_returns[sub_industry] = {
        'Short': sub_industry_returns_short,
        'Mid': sub_industry_returns_mid,
        'Long': sub_industry_returns_long,
        'Short_average': short_average,
        'Mid_average': mid_average,
        'Long_average': long_average,
        'Sharpe_Ratio_Short': sharpe_ratio_short,
        'Sharpe_Ratio_Mid': sharpe_ratio_mid,
        'Sharpe_Ratio_Long': sharpe_ratio_long,
        'Sharpe_Ratio_Short_average': sharpe_ratio_short.mean(axis=1),
        'Sharpe_Ratio_Mid_average': sharpe_ratio_mid.mean(axis=1),
        'Sharpe_Ratio_Long_average': sharpe_ratio_long.mean(axis=1)
    }
    

# Collect each sub-industry's average return series into one DataFrame per
# horizon (columns = sub-industry names).
# Building each frame in a single constructor call aligns the series on the
# union of their dates. Inserting them one by one into an empty frame would
# instead reindex every later series to the dates of the first sub-industry.
short_avg_df = pd.DataFrame({
    str(sub_industry): metrics['Short_average']
    for sub_industry, metrics in sub_industry_returns.items()
})
mid_avg_df = pd.DataFrame({
    str(sub_industry): metrics['Mid_average']
    for sub_industry, metrics in sub_industry_returns.items()
})
long_avg_df = pd.DataFrame({
    str(sub_industry): metrics['Long_average']
    for sub_industry, metrics in sub_industry_returns.items()
})






In [None]:
# average returns by sub-industry for the last 10 years
import datetime as dt


def _sub_industry_line_figure(frame):
    """Return a plotly figure with one line trace per column of `frame`, plotted against its index."""
    figure = go.Figure()
    for column in frame.columns:
        figure.add_trace(go.Scatter(
            x=frame.index,
            y=frame[column],
            mode='lines',
            name=column
        ))
    return figure


# --- Select the trailing ten-year window -------------------------------------
# The last row of long_avg_df is taken as "today"; everything older than ten
# years before it is discarded.
most_recent_date = long_avg_df.index[-1]
cutoff_date = most_recent_date - pd.DateOffset(years=10)
long_avg_df_10y = long_avg_df[long_avg_df.index >= cutoff_date].copy()

# --- Put both series on the same tz-naive daily index -------------------------
# Going through `.date` drops the time of day AND any timezone, so the result
# is a tz-naive DatetimeIndex. The sector average must get the same treatment:
# pd.to_datetime() on an index that is already a DatetimeIndex keeps its
# timezone, and aligning it with the tz-naive sub-industry frame raises
# "TypeError: Cannot join tz-naive with tz-aware DatetimeIndex".
long_avg_df_10y.index = pd.to_datetime(long_avg_df_10y.index.date)

sector_avg = sector_returns_long.mean(axis=1)
sector_avg.index = pd.to_datetime(sector_avg.index.date)

# Restrict the sector average to the same ten-year window so that both charts
# really show "Last 10 Years", as their titles say.
window_start = long_avg_df_10y.index[-1] - pd.DateOffset(years=10)
sector_avg_10y = sector_avg[sector_avg.index >= window_start]

# NOTE(review): the titles below say "200 Day", while the config cell sets
# time_frame_long = 365. Confirm which window long_avg_df was computed with.

# --- Figure 1: sub-industry long averages vs. the sector average --------------
fig = _sub_industry_line_figure(long_avg_df_10y)

# Sector average drawn thicker and dotted so it stands out from the sub-industry lines
fig.add_trace(go.Scatter(
    x=sector_avg_10y.index,
    y=sector_avg_10y,
    mode='lines',
    name='Sector Average',
    line=dict(width=4, color='firebrick', dash='dot')
))

# Zero reference line
fig.add_hline(y=0, line_dash='dash', line_color='red')

fig.update_layout(
    title="200 Day Average Returns by Sub-Industry (Last 10 Years)",
    xaxis_title="Date",
    yaxis_title="Long Average Return",
    legend_title="Sub-Industry",
    height=800,
    template="plotly_white"
)

fig.show()

# --- Figure 2: each sub-industry minus the sector average ---------------------
# Row-wise subtraction (axis=0 aligns the Series on the date index).
long_avg_diff = long_avg_df_10y.subtract(sector_avg_10y, axis=0)

fig = _sub_industry_line_figure(long_avg_diff)

# Zero reference line: above it the sub-industry beats the sector average
fig.add_hline(y=0, line_dash='dash', line_color='red')

fig.update_layout(
    title="Difference in 200 Day Average Returns from Sector Average (Last 10 Years)",
    xaxis_title="Date",
    yaxis_title="Difference in Long Average Return",
    legend_title="Sub-Industry",
    height=800,
    template="plotly_white"
)

fig.show()

In [None]:
# Disabled drafts: this whole cell is a single triple-quoted string, so none of
# the code inside it runs. It holds two dropdown-figure variants built from
# sub_industry_returns:
#   1. per-ticker 'Long' returns together with the sub-industry 'Long_average'
#      and a zero line;
#   2. per-ticker 'Long' returns minus the sub-industry 'Long_average'.
# Because the string is the cell's last expression, the notebook echoes it as
# the cell output.
'''
import plotly.graph_objects as go

# Initialize the figure
fig = go.Figure()

# Dictionary to hold trace indices for each sub-industry
sub_industry_trace_ids = {}

# Counter to keep track of trace indices
current_trace = 0

# Iterate through each sub-industry to add traces
for sub_industry, metrics in sub_industry_returns.items():
    # Extract individual stock long-term returns and simple average
    individual_returns = metrics['Long']
    average = metrics['Long_average']
    
    # List to store trace indices for the current sub-industry
    trace_indices = []
    
    # Add a trace for each individual stock in the sub-industry
    for ticker in individual_returns.columns:
        fig.add_trace(go.Scatter(
            x=individual_returns.index,
            y=individual_returns[ticker],
            mode='lines',
            name=f"{ticker}",
            visible=False  # Initially hide all traces
        ))
        trace_indices.append(current_trace)
        current_trace += 1
    
    # Add a trace for the Average benchmark
    fig.add_trace(go.Scatter(
        x=average.index,
        y=average,
        mode='lines',
        name=f"{sub_industry} - Average Benchmark",
        line=dict(width=3, color='black'),
        visible=False  # Initially hide all traces
    ))
    trace_indices.append(current_trace)
    current_trace += 1
    
    # Add a zero line
    fig.add_trace(go.Scatter(
        x=individual_returns.index,
        y=[0]*len(individual_returns),
        mode='lines',
        name=f"Zero line - {sub_industry}",
        line=dict(width=1, color='gray', dash='dash'),
        visible=False
    ))
    trace_indices.append(current_trace)
    current_trace += 1
    
    # Store the trace indices for the current sub-industry
    sub_industry_trace_ids[sub_industry] = trace_indices

# Create dropdown buttons for each sub-industry
buttons = []
for sub_industry, trace_indices in sub_industry_trace_ids.items():
    # Create a visibility list: True for traces of the current sub-industry, False otherwise
    visibility = [False] * current_trace
    for idx in trace_indices:
        visibility[idx] = True
    
    # Define the button for the current sub-industry
    button = dict(
        label=sub_industry,
        method='update',
        args=[
            {'visible': visibility},
            {'title': f"Long Returns vs. Average Benchmark for {sub_industry}"}
        ]
    )
    buttons.append(button)

# Optionally, add an 'All' button to show all sub-industries simultaneously
buttons.insert(0, dict(
    label='All',
    method='update',
    args=[
        {'visible': [True] * current_trace},
        {'title': "Long Returns vs. Average Benchmarks for All Sub-Industries"}
    ]
))

# Update the layout with the dropdown menu
fig.update_layout(
    updatemenus=[
        dict(
            active=0,
            buttons=buttons,
            x=1.15,
            y=1,
            xanchor='right',
            yanchor='top'
        )
    ],
    title="Long Returns vs. Average Benchmark",
    xaxis_title="Date",
    yaxis_title="Long Return",
    height=600*2
)

# Display the figure
fig.show()





#----------------------





import plotly.graph_objects as go

# Initialize the figure
fig = go.Figure()

# Dictionary to hold trace indices for each sub-industry
sub_industry_trace_ids = {}

# Counter to keep track of trace indices
current_trace = 0

# Iterate through each sub-industry to add traces
for sub_industry, metrics in sub_industry_returns.items():
    # Extract individual stock long-term returns and simple average
    individual_returns = metrics['Long']
    average = metrics['Long_average']
    
    # List to store trace indices for the current sub-industry
    trace_indices = []
    
    # Add a trace for each individual stock in the sub-industry
    for ticker in individual_returns.columns:
        # Calculate difference between company return and industry average
        difference = individual_returns[ticker] - average
        
        fig.add_trace(go.Scatter(
            x=individual_returns.index,
            y=difference,
            mode='lines',
            name=f"{ticker}",
            visible=False  # Initially hide all traces
        ))
        trace_indices.append(current_trace)
        current_trace += 1
    
    # Add a reference line at zero (industry average benchmark)
    fig.add_trace(go.Scatter(
        x=average.index,
        y=[0] * len(average),
        mode='lines',
        name=f"{sub_industry} - Industry Average",
        line=dict(width=2, color='black', dash='dash'),
        visible=False  # Initially hide all traces
    ))
    trace_indices.append(current_trace)
    current_trace += 1
    
    # Store the trace indices for the current sub-industry
    sub_industry_trace_ids[sub_industry] = trace_indices

# Create dropdown buttons for each sub-industry
buttons = []
for sub_industry, trace_indices in sub_industry_trace_ids.items():
    # Create a visibility list: True for traces of the current sub-industry, False otherwise
    visibility = [False] * current_trace
    for idx in trace_indices:
        visibility[idx] = True
    
    # Define the button for the current sub-industry
    button = dict(
        label=sub_industry,
        method='update',
        args=[
            {'visible': visibility},
            {'title': f"Return Difference from {sub_industry} Average"}
        ]
    )
    buttons.append(button)

# Optionally, add an 'All' button to show all sub-industries simultaneously
buttons.insert(0, dict(
    label='All',
    method='update',
    args=[
        {'visible': [True] * current_trace},
        {'title': "Return Differences from Sub-Industry Averages"}
    ]
))

# Update the layout with the dropdown menu
fig.update_layout(
    updatemenus=[
        dict(
            active=0,
            buttons=buttons,
            x=1.15,
            y=1,
            xanchor='right',
            yanchor='top'
        )
    ],
    title="Return Difference from Industry Average",
    xaxis_title="Date",
    yaxis_title="Return Difference",
    height=600*2
)

# Display the figure
fig.show()'''

In [None]:
#sharpe ratios for each company in each subindustry
import plotly.graph_objects as go
import numpy as np

# One figure holds the traces of every sub-industry; a dropdown switches which
# group is visible. The first sub-industry that has data is shown on load, and
# the dropdown's initial selection points at it, so the figure never opens
# blank while the menu claims "All".
fig = go.Figure()

# sub-industry name -> positions of its traces inside fig.data
sub_industry_trace_ids = {}

for sub_industry, metrics in sub_industry_returns.items():
    individual_sharpes = metrics['Sharpe_Ratio_Long']

    # Nothing to draw (and index[0] / index[-1] below would raise) for an empty frame
    if individual_sharpes.empty:
        continue

    # Cross-sectional average per date, then its mean/std over time for the bands
    average_sharpe = individual_sharpes.mean(axis=1)
    mean_sharpe = average_sharpe.mean()
    std_sharpe = average_sharpe.std()

    # Only the first sub-industry added starts visible
    show_initially = not sub_industry_trace_ids
    first_trace = len(fig.data)

    # Horizontal reference lines only need the two end points of the date range
    x_span = [individual_sharpes.index[0], individual_sharpes.index[-1]]

    # One line per ticker
    for ticker in individual_sharpes.columns:
        fig.add_trace(go.Scatter(
            x=individual_sharpes.index,
            y=individual_sharpes[ticker],
            mode='lines',
            name=f"{ticker}",
            visible=show_initially,
            legendgroup='stocks'
        ))

    # Sub-industry average over time
    fig.add_trace(go.Scatter(
        x=average_sharpe.index,
        y=average_sharpe,
        mode='lines',
        name=f"{sub_industry} - Average Sharpe Ratio",
        line=dict(width=3, color='black'),
        visible=show_initially
    ))

    # Long-run mean of that average
    fig.add_trace(go.Scatter(
        x=x_span,
        y=[mean_sharpe, mean_sharpe],
        mode='lines',
        name=f"{sub_industry} - Mean: {mean_sharpe:.2f}",
        line=dict(width=2, color='red', dash='dash'),
        visible=show_initially
    ))

    # +/- 1, 2, 3 standard-deviation bands around the mean
    for std_multiple in [1, 2, 3]:
        upper_band = mean_sharpe + std_multiple * std_sharpe
        lower_band = mean_sharpe - std_multiple * std_sharpe

        fig.add_trace(go.Scatter(
            x=x_span,
            y=[upper_band, upper_band],
            mode='lines',
            name=f"+{std_multiple}σ: {upper_band:.2f}",
            line=dict(width=1, color='green', dash='dot'),
            visible=show_initially
        ))

        fig.add_trace(go.Scatter(
            x=x_span,
            y=[lower_band, lower_band],
            mode='lines',
            name=f"-{std_multiple}σ: {lower_band:.2f}",
            # The -3σ band is highlighted in red, the others in orange
            line=dict(width=1, color='orange' if std_multiple < 3 else 'red', dash='dot'),
            visible=show_initially
        ))

    # Every trace added since first_trace belongs to this sub-industry
    sub_industry_trace_ids[sub_industry] = list(range(first_trace, len(fig.data)))

# Total number of traces in the figure
current_trace = len(fig.data)

# Button 0 shows everything; buttons 1..n show exactly one sub-industry each
buttons = [dict(
    label='All',
    method='update',
    args=[
        {'visible': [True] * current_trace},
        {'title': "Long Sharpe Ratios for All Sub-Industries"}
    ]
)]
for sub_industry, trace_indices in sub_industry_trace_ids.items():
    shown = set(trace_indices)
    visibility = [idx in shown for idx in range(current_trace)]
    buttons.append(dict(
        label=sub_industry,
        method='update',
        args=[
            {'visible': visibility},
            {'title': f"Long Sharpe Ratios for {sub_industry}"}
        ]
    ))

# Keep the dropdown's initial selection and the title in sync with what is
# actually visible on load.
default_sub_industry = next(iter(sub_industry_trace_ids), None)
if default_sub_industry is None:
    active_button = 0
    initial_title = "Long Sharpe Ratios by Sub-Industry"
else:
    active_button = 1
    initial_title = f"Long Sharpe Ratios for {default_sub_industry}"

fig.update_layout(
    updatemenus=[
        dict(
            active=active_button,
            buttons=buttons,
            x=1.15,
            y=1.1,  # Above the plot area so the dropdown does not overlap the traces
            xanchor='right',
            yanchor='top'
        )
    ],
    title=initial_title,
    xaxis_title="Date",
    yaxis_title="Sharpe Ratio",
    height=1000,
    template="plotly_white",
    hovermode="x unified",
    legend=dict(
        groupclick="toggleitem"
    )
)

# Zero reference line spanning the full plot width (x in paper coordinates)
fig.add_shape(
    type="line",
    xref="paper",
    x0=0, x1=1,
    y0=0, y1=0,
    line=dict(color="gray", width=1, dash="dot")
)

fig.show()


In [None]:
# Computations planned for this section:
#   1. return spreads between the benchmark and all assets
#   2. Sortino ratios for all assets
#   3. spreads between the assets' Sortino ratios and the benchmark's
# -----------------------------------------------------------

# Benchmark returns at monthly, weekly and daily frequency
benchmark_monthly_returns = qc.calculate_returns(benchmark, frequency='monthly')
benchmark_weekly_returns = qc.calculate_returns(benchmark, frequency='weekly')
benchmark_daily_returns = qc.calculate_returns(benchmark, frequency='daily')

# Benchmark-minus-equity spreads over the one-week time frame.
# holdings_df is used as a mapping here; only its values are passed on.
# NOTE(review): `mode` must already be defined by an earlier cell; it is not set in this one.
benchmark_minus_equities_week = create_and_concat_spreads(
    list(holdings_df.values()),
    benchmark,
    time_frame=time_frame_week,
    mode=mode,
)

# Disabled computations: everything between the triple quotes is a string
# literal and is not executed. It holds the short/mid/long benchmark-minus-equity
# spread calls and the rolling Sortino ratio calls for n = 21, 50 and 200.
'''
# Create and concatenate spreads for the short time frame
benchmark_minus_equities_short = create_and_concat_spreads(
    list(holdings_df.values()), benchmark, time_frame=time_frame_short, mode=mode
)

benchmark_minus_equities_mid = create_and_concat_spreads(
    list(holdings_df.values()), benchmark, time_frame=time_frame_mid, mode=mode
)

benchmark_minus_equities_long = create_and_concat_spreads(
    list(holdings_df.values()), benchmark, time_frame=time_frame_long, mode=mode
)


rolling_sortino_ratios_equities_21 = compute_rolling_sortino_ratios(equity_prices, n=21)    
rolling_sortino_ratios_equities_50 = compute_rolling_sortino_ratios(equity_prices, n=50)
rolling_sortino_ratios_equities_200 = compute_rolling_sortino_ratios(equity_prices, n=200)


rolling_sortino_ratios_benchmark_minus_equities_21  = compute_rolling_sortino_ratios_benchmark_minus_asset(equity_prices,'SPY', n=21)
rolling_sortino_ratios_benchmark_minus_equities_50  = compute_rolling_sortino_ratios_benchmark_minus_asset(equity_prices,'SPY', n=50)
rolling_sortino_ratios_benchmark_minus_equities_200  = compute_rolling_sortino_ratios_benchmark_minus_asset(equity_prices,'SPY', n=200)

'''