In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
# IDs that are selected as the equity universe when the master frame is filtered later on.
IDS = [
    'AMT', 'XOM', 'T', 'SO', 'PFE', 'NVDA', 'NFLX', 'JPM',
    'JNJ', 'GLD', 'MARK', 'GE', 'COST', 'AMZN', 'TSLA', 'AAPL',
]

This is the preliminary data analysis to create modelling assumptions, engineer new features, and find out how we can create a target to generate trading signals.

In [None]:
# Load the raw dataset; the first CSV column is used as the frame's index.
data=pd.read_csv('../../assets/data.csv', index_col=[0])
# Columns kept for every ID when the per-ID frames are built.
COLS=['DATE','CLOSE','HIGH','VOLUME','VOLATILITY_90D'] # I am only interested in the raw features

In [None]:
# One frame per ID, restricted to the raw feature columns.
# A single .loc call plus .copy() gives every entry its own data, so later column
# assignments on these frames (e.g. parsing DATE) are unambiguous and never write
# through to `data`.
ID_DATA: dict[str, pd.DataFrame] = {
    i: data.loc[data.ID == i, COLS].copy() for i in data.ID.unique()
}

Peeking at the data, we see that the features are only available at the end of the trading day; the `HIGH` is only known when you collect all price information, `CLOSE` is the last observation of the day. Similarly, `VOLATILITY_90D` is a rolling calculation of returns so that information is also only known at the end of the trading day.

So **Assumption #1** is that every trading decision can only be made at the end of the trading day, because the raw features are only observable at the (daily) market close. The same holds for any engineered features, since they are computed from the raw features. Any trade that relies on intra-day information would therefore introduce look-ahead bias.

In [None]:
ID_DATA['AAPL']

In [None]:
# Macro indicator label -> Yahoo Finance symbol that is downloaded for it.
tickers = {
    "Inflation_Expectation": "TIP",  # iShares TIPS Bond ETF as a proxy for inflation expectations
    "Unemployment_Proxy": "SIVR",  # Aberdeen Standard Physical Silver Shares ETF (sometimes used as economic health indicator)
    "US_Economy": "SPY",  # SPDR S&P 500 ETF as a proxy for overall US economic health
    "Govt_Debt_Proxy": "TLT",  # iShares 20+ Year Treasury Bond ETF as a proxy for government debt
    "Treasury_10Y": "^TNX",  # 10-Year Treasury Yield
    "Treasury_5Y": "^FVX",  # 5-Year Treasury Yield
    "Treasury_2Y": "^IRX",  # NOTE(review): ^IRX is the 13-week Treasury bill yield index, not a 2-year yield; key name kept because later cells refer to it
    "US_Dollar": "DX-Y.NYB",  # US Dollar Index
    "Gold": "GC=F",  # Gold Futures
    "Oil": "CL=F",  # Crude Oil Futures
    "VIX": "^VIX",  # CBOE Volatility Index
    "Real_Estate": "IYR",  # iShares U.S. Real Estate ETF
    "Consumer_Sentiment": "XLY"  # Consumer Discretionary Select Sector SPDR Fund
}

In [None]:
# Download every macro proxy and stack the results into one long frame with an
# 'ID' column holding the indicator label.
# A dedicated loop variable is used so the CSV frame bound to `data` is not overwritten.
groups = []
for indicator, ticker in tickers.items():
    try:
        # auto_adjust=False keeps the unadjusted OHLC columns (and 'Adj Close')
        # regardless of the default of the installed yfinance version.
        macro_frame = yf.download(ticker, start="2010-01-01", end="2024-08-20", auto_adjust=False)
        if not macro_frame.empty:
            # Some yfinance versions return (field, ticker) MultiIndex columns even for
            # a single symbol; keep only the field level so 'Close' etc. stay addressable.
            if isinstance(macro_frame.columns, pd.MultiIndex):
                macro_frame.columns = macro_frame.columns.get_level_values(0)
            macro_frame.insert(0, "ID", indicator)
            # errors='ignore': a missing 'Volume'/'Adj Close' column must not discard the series.
            macro_frame = macro_frame.drop(columns=['Volume', 'Adj Close'], errors='ignore')
            groups.append(macro_frame)
        else:
            print(f"No data available for {indicator} ({ticker})")
    except Exception as e:
        # Best effort: report the failing series and continue with the remaining ones.
        print(f"Error downloading data for {indicator} ({ticker}): {str(e)}")

if not groups:
    # pd.concat([]) would raise a ValueError with a far less helpful message.
    raise ValueError("No macro series could be downloaded; cannot build macro_data")

macro_data = pd.concat(groups)

In [None]:
# Wide frame of 'Close' values with one column per Treasury series.
yield_ids = ['Treasury_10Y', 'Treasury_2Y', 'Treasury_5Y']
feature1, feature2, feature3 = yield_ids
treasury_rows = macro_data.loc[macro_data.ID.isin(yield_ids)]
df_pivot = treasury_rows.pivot(columns='ID', values='Close')

plt.figure(figsize=(12, 6))
# One line per series, drawn in the same order as feature1..feature3.
for yield_id in yield_ids:
    plt.plot(df_pivot.index, df_pivot[yield_id], label=yield_id)

plt.title("Yields Over Time")
plt.xlabel("Date")
plt.ylabel("Treasury Bond Yield (%)")
plt.legend(loc='upper left')

plt.tight_layout()
plt.show()

In [None]:
# Register the long-format macro frame next to the per-ID frames.
ID_DATA["MACRO"]=macro_data

In [None]:
# Symbol used as the market benchmark when sector betas are estimated.
market_index = "^GSPC"  # S&P 500
# Sector label -> ticker of the ETF whose returns represent that sector.
sector_etfs = {
    "Technology": "XLK",
    "Financial": "XLF",
    "Healthcare": "XLV",
    "Consumer_Discretionary": "XLY",
    "Consumer_Staples": "XLP",
    "Energy": "XLE",
    "Utilities": "XLU",
    "Materials": "XLB",
    "Industrial": "XLI",
    "Real_Estate": "XLRE",
    "Communication_Services": "XLC"
}

In [None]:
def download_and_calculate_returns(tickers, start_date, end_date):
    """Download adjusted closes and convert them to simple daily returns.

    Parameters
    ----------
    tickers : dict
        Mapping whose values are the symbols to download; the module-level
        ``market_index`` symbol is always added.
    start_date, end_date : str
        Date range passed straight to ``yf.download``.

    Returns
    -------
    pd.DataFrame
        Percentage changes of 'Adj Close', one column per symbol. Rows with a
        missing value in *any* column are dropped, so the sample only covers
        dates on which every symbol has a return.
    """
    # De-duplicate while preserving order in case the market index is already listed.
    symbols = list(dict.fromkeys(list(tickers.values()) + [market_index]))
    # auto_adjust=False guarantees that an 'Adj Close' field exists whatever the
    # default of the installed yfinance version is.
    prices = yf.download(symbols, start=start_date, end=end_date, auto_adjust=False)['Adj Close']
    return prices.pct_change().dropna()

def calculate_betas(returns, market_index, sectors=None):
    """Estimate one full-sample beta per sector against the market.

    Parameters
    ----------
    returns : pd.DataFrame
        Return series, one column per ticker, including ``market_index``.
    market_index : str
        Column of ``returns`` used as the regressor (market return).
    sectors : dict, optional
        Mapping of sector label -> ticker column in ``returns``. Defaults to
        the module-level ``sector_etfs`` mapping, which is what the function
        used implicitly before this parameter existed.

    Returns
    -------
    pd.Series
        Slope of the OLS regression of each sector's returns on the market
        returns, indexed by sector label.
    """
    if sectors is None:
        sectors = sector_etfs

    market_returns = returns[market_index]
    betas = {
        sector: stats.linregress(market_returns, returns[ticker]).slope
        for sector, ticker in sectors.items()
    }
    return pd.Series(betas)

In [None]:
# Set date range
start_date = "2010-01-01"
end_date = "2024-08-20"

# Download data and calculate returns
returns = download_and_calculate_returns(sector_etfs, start_date, end_date)

# Calculate betas
sector_betas = calculate_betas(returns, market_index)

# Display sector betas
print("Sector Betas:")
print(sector_betas)

# Rolling betas over a half-year window: 252 trading days per year // 2 = 126 days
# (this matches the "6-month" wording of the plot title below).
window = 252//2
market_returns = returns[market_index]
# The market variance does not depend on the sector, so compute it once.
market_variance = market_returns.rolling(window=window).var()
# beta_t = Cov_t(sector, market) / Var_t(market), each over the trailing window.
rolling_betas = pd.DataFrame(
    {
        sector: returns[ticker].rolling(window=window).cov(market_returns) / market_variance
        for sector, ticker in sector_etfs.items()
    },
    index=returns.index,
)

plt.figure(figsize=(12, 6))
for sector in sector_etfs.keys():
    plt.plot(rolling_betas.index, rolling_betas[sector], label=sector)

plt.title("Rolling Sector Betas (6-month Window)")
plt.xlabel("Date")
plt.ylabel("Beta")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Store both beta estimates in the shared dict: the full-sample betas (a Series)
# and the rolling betas (a DataFrame with one column per sector).
ID_DATA["STATIC_BETA"]=sector_betas
ID_DATA["BETA_6M"]=rolling_betas

In [None]:
ID_DATA.keys()

In [None]:
# Restrict every frame in ID_DATA to the common analysis window [start, end].
start='2020-01-01'
end='2024-05-03'

# Iterate over a snapshot because entries are replaced inside the loop.
for k, v in list(ID_DATA.items()):
    print(k)

    # Series entries (the static betas) carry no dates and are left untouched.
    if isinstance(v, pd.Series):
        continue

    print(v.columns.to_list())

    date_col = next((c for c in ('DATE', 'Date') if c in v.columns), None)

    if date_col is not None:
        # Parse the dates on a new frame instead of assigning into what may be a slice.
        dates = pd.to_datetime(v[date_col])
        in_window = (dates >= start) & (dates <= end)
        ID_DATA[k] = v.assign(**{date_col: dates}).loc[in_window].copy()

    elif isinstance(v.index, pd.DatetimeIndex):
        # Frames that still carry their dates in the index (they are only moved to a
        # column by a later reset_index) used to fall through to the message below
        # and were never filtered.
        in_window = (v.index >= start) & (v.index <= end)
        ID_DATA[k] = v.loc[in_window].copy()

    else:
        print('no date col found')

In [None]:
# Move the index of the macro frame into a regular column, in place
# (presumably the dates, under the index's name — TODO confirm).
ID_DATA['MACRO'].reset_index(inplace=True)

In [None]:
# Move the index of the rolling-beta frame into a regular column, in place
# (presumably the dates, under the index's name — TODO confirm).
ID_DATA['BETA_6M'].reset_index(inplace=True)

In [None]:
# Take the static betas out of the dict (and keep them in their own variable) so
# that only DataFrame entries remain in ID_DATA.
static_beta = ID_DATA.pop('STATIC_BETA')

In [None]:
# Split the long macro frame into one entry per indicator, keyed by the
# upper-cased indicator label and without the now-redundant ID column.
macro_long = ID_DATA['MACRO']

for macro_id in macro_long.ID.unique():
    rows_for_id = macro_long.loc[macro_long.ID == macro_id]
    ID_DATA[macro_id.upper()] = rows_for_id.drop(['ID'], axis=1)

In [None]:
# The combined macro frame has been split into per-indicator entries; drop it.
del ID_DATA['MACRO']

In [None]:
import pandas as pd

def create_master_dataframe(data_dict):
    """Stack per-ID frames into one long frame with a fixed column layout.

    Parameters
    ----------
    data_dict : dict[str, pd.DataFrame]
        Mapping of ID -> frame. A frame may carry its dates in a 'DATE'
        column (matched case-insensitively) or in its index.

    Returns
    -------
    pd.DataFrame
        Indexed by 'DATE', with the columns
        ``['ID', 'OPEN', 'HIGH', 'LOW', 'CLOSE', 'VOLUME', 'VOLATILITY_90D']``.
        Columns a source frame does not have are filled with missing values.
        An empty ``data_dict`` yields an empty frame with the same layout.

    Notes
    -----
    Side effect: the column labels of every input frame are upper-cased in
    place. This is kept deliberately, because a later cell looks up the
    rolling-beta columns by their upper-cased sector names.
    """
    expected_columns = ['OPEN', 'HIGH', 'LOW', 'CLOSE', 'VOLUME', 'VOLATILITY_90D']
    column_order = ['ID'] + expected_columns

    frames = []
    for frame_id, frame in data_dict.items():
        # Standardize column names (in place, see Notes).
        frame.columns = frame.columns.str.upper()

        if frame.empty:
            continue  # contributes no rows

        # Use the DATE column as index when present; otherwise the existing index
        # is taken to hold the dates.
        dated = frame.set_index('DATE') if 'DATE' in frame.columns else frame

        # reindex keeps the expected columns and adds the missing ones as NaN in a
        # single vectorised step, replacing the former per-row Python loop.
        tidy = dated.reindex(columns=expected_columns).rename_axis('DATE')
        tidy.insert(0, 'ID', frame_id)
        frames.append(tidy)

    if not frames:
        return pd.DataFrame(columns=column_order, index=pd.Index([], name='DATE'))

    return pd.concat(frames)[column_order]

In [None]:
# Build the long master frame from all per-ID frames.
# NOTE: create_master_dataframe also upper-cases the column names of every frame
# stored in ID_DATA in place.
df = create_master_dataframe(ID_DATA)

In [None]:
df

In [None]:
# Drop LOW and OPEN, which are not among the raw feature columns listed in COLS.
df=df.drop(['LOW','OPEN'],axis=1)

In [None]:
df.loc[df.ID=='CONSUMER_SENTIMENT'].HIGH

In [None]:
# Rows belonging to the equity universe. The explicit copy makes equity_data an
# independent frame, so the in-place edits applied to it in later cells neither
# warn about writing to a slice nor depend on pandas' view/copy rules.
equity_data = df.loc[df.ID.isin(IDS)].copy()

In [None]:
df.isna().sum()

In [None]:
# NOTE(review): 'VOlUME' (lower-case "l") differs from 'VOLUME', so this adds a new
# column holding the zero-filled volumes instead of updating the existing one.
df['VOlUME']=df.VOLUME.fillna(0)

In [None]:
# Remove the 'VOlUME' column (lower-case "l") from the frame again.
del df['VOlUME']

In [None]:
# Rows without a VOLUME value get 0. Assign the result back rather than calling
# fillna(inplace=True) on the selected column: that chained in-place form is
# deprecated and does not update `df` once pandas' copy-on-write is active.
df['VOLUME'] = df['VOLUME'].fillna(0)

In [None]:
df

In [None]:
print(sector_betas.index, df.ID.unique())

In [None]:
# Sector label (same keys as sector_etfs) -> tickers from IDS assigned to that sector.
# NOTE(review): 'MARK' is listed in IDS but appears in no sector here, so it is never
# given a sector beta.
sector_ticker_dict = {
    'Technology': ['NVDA', 'AAPL'],
    'Financial': ['JPM'],
    'Healthcare': ['PFE', 'JNJ'],
    'Consumer_Discretionary': ['AMZN', 'TSLA'],
    'Consumer_Staples': ['COST'],
    'Energy': ['XOM'],
    'Utilities': ['SO'],
    'Materials': ['GLD'],
    'Industrial': ['GE'],
    'Real_Estate': ['AMT'],
    'Communication_Services': ['T', 'NFLX']
}

In [None]:
# Move the index of both frames back into a regular column, in place.
# NOTE(review): not idempotent — re-running this cell inserts the fresh integer
# index as an additional column.
df.reset_index(inplace=True)
equity_data.reset_index(inplace=True)

In [None]:
df[df.ID=='TREASURY_5Y'].HIGH

In [None]:

# List of macro features to add
macro_features = ['INFLATION_EXPECTATION', 'UNEMPLOYMENT_PROXY', 'US_ECONOMY', 'GOVT_DEBT_PROXY', 
                  'TREASURY_10Y', 'TREASURY_5Y', 'TREASURY_2Y', 'US_DOLLAR', 'GOLD', 'OIL', 
                  'VIX', 'REAL_ESTATE', 'CONSUMER_SENTIMENT']

# Long-format macro observations; only HIGH is carried forward for each indicator.
# Rows and columns are selected in one .loc call and copied, so the DATE column can
# be reassigned later without writing into a slice of `df`.
macro_data = df.loc[df.ID.isin(macro_features), ['DATE', 'ID', 'HIGH']].copy()
macro_data

In [None]:
# Parse the dates on new frames so the objects produced by earlier cells are not
# modified (and no write-to-slice warning is triggered).
equity_data = equity_data.assign(DATE=pd.to_datetime(equity_data['DATE']))
macro_data = macro_data.assign(DATE=pd.to_datetime(macro_data['DATE']))

# Reshape macro_data from long to wide format: one 'MACRO_<ID>' column per indicator
macro_data_wide = macro_data.pivot(index='DATE', columns='ID', values='HIGH').add_prefix('MACRO_')
macro_data_wide.columns.name = None  # Remove the name from the columns index

# List of macro features (should match the unique IDs in macro_data)
macro_features = ['INFLATION_EXPECTATION', 'UNEMPLOYMENT_PROXY', 'US_ECONOMY', 'GOVT_DEBT_PROXY', 
                  'TREASURY_10Y', 'TREASURY_5Y', 'TREASURY_2Y', 'US_DOLLAR', 'GOLD', 'OIL', 
                  'VIX', 'REAL_ESTATE', 'CONSUMER_SENTIMENT']

# Ensure all expected features are present
for feature in macro_features:
    if f'MACRO_{feature}' not in macro_data_wide.columns:
        print(f"Warning: {feature} not found in macro data")

def add_macro_features(group):
    """Left-join the wide macro columns onto ``group`` by its 'DATE' column.

    Every row of ``group`` is kept; dates without macro observations get
    missing values in the MACRO_* columns.
    """
    return pd.merge(group, macro_data_wide, left_on='DATE', right_index=True, how='left')

# The join key is the date only, so one merge over the whole frame gives the same
# rows as merging ID by ID. This avoids groupby.apply, whose handling of the
# grouping column is deprecated. The stable sort reproduces the ID-grouped row
# order that the per-group version produced.
equity_data_with_macro = (
    add_macro_features(equity_data)
    .sort_values('ID', kind='stable')
    .reset_index(drop=True)
)

In [None]:
equity_data_with_macro[equity_data_with_macro.ID=='AAPL'].MACRO_INFLATION_EXPECTATION

In [None]:
equity_data_with_macro[equity_data_with_macro.ID=='NVDA'].MACRO_INFLATION_EXPECTATION

In [None]:
# Drop, in place, every row that has a missing value in any column.
equity_data_with_macro.dropna(inplace=True)

In [None]:
ID_DATA['BETA_6M'].columns

In [None]:
equity_data_with_macro

In [None]:
# Drop the first and the last observation of every ID.
# NOTE(review): this cell used to shift whole rows by one position within each ID
# and then cut off each group's last row. Because all columns (ID and DATE
# included) moved together, nothing was lagged relative to anything else; the only
# effect was this trimming, which is now spelled out. If a next-day target was
# intended, shift only the target column instead.
obs_rank = equity_data_with_macro.groupby('ID').cumcount()
obs_count = equity_data_with_macro.groupby('ID')['ID'].transform('size')
equity_data_with_macro = (
    equity_data_with_macro.loc[(obs_rank > 0) & (obs_rank < obs_count - 1)]
    .sort_values('ID', kind='stable')
    .reset_index(drop=True)
)

# Rolling sector betas indexed by (timezone-naive) date, built on a new frame so
# the entry stored in ID_DATA is left as it is.
beta_frame = ID_DATA['BETA_6M']
beta_date_col = next((c for c in ('DATE', 'Date') if c in beta_frame.columns), None)
if beta_date_col is not None:
    beta_frame = beta_frame.set_index(beta_date_col)
beta_dates = pd.to_datetime(beta_frame.index)
if beta_dates.tz is not None:
    beta_dates = beta_dates.tz_localize(None)
beta_frame = beta_frame.set_axis(beta_dates, axis=0)

# Attach to every equity row the beta of its sector *on that row's date*.
# Filling the column positionally (resizing the beta array to the number of rows)
# ignores the dates and pairs observations with betas from unrelated days.
equity_data_with_macro['BETA_TS'] = np.nan
row_dates = pd.to_datetime(equity_data_with_macro['DATE'])

# `sector_tickers` rather than `tickers`, so the macro ticker mapping keeps its name.
for sector, sector_tickers in sector_ticker_dict.items():
    # Columns are upper-cased once the master frame has been built; accept either form.
    beta_column = sector.upper() if sector.upper() in beta_frame.columns else sector
    beta_by_date = beta_frame[beta_column].astype(float)
    for ticker in sector_tickers:
        ticker_mask = equity_data_with_macro['ID'] == ticker

        if ticker_mask.any():
            # Dates without a beta (e.g. inside the rolling warm-up) stay NaN.
            equity_data_with_macro.loc[ticker_mask, 'BETA_TS'] = row_dates[ticker_mask].map(beta_by_date)
        else:
            print(f"No data found for ticker {ticker}")

# Check the result
print(equity_data_with_macro[['ID', 'BETA_TS']].head(10))
print(equity_data_with_macro['BETA_TS'].isna().sum())

In [None]:
equity_data_with_macro.loc[equity_data_with_macro.ID=='AAPL']

In [None]:
# Drop, in place, every row that has a missing value in any column; this includes
# rows whose BETA_TS was never filled.
equity_data_with_macro.dropna(inplace=True)

In [None]:
equity_data_with_macro