In [None]:
import os
import holidays 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import pandas_ta as ta
import numpy as np
from statsmodels.tsa.stattools import adfuller

# Cutoff date: later cells keep only macro and Fed rows dated strictly after it.
data = '2012-12-30'

# Relative folders holding the input CSVs: stock/macro files and Fed indicator files.
path_stock = "../../data/stock"
path_fed = "../../data/fed"

# Top 10 Tech Companies based on Market Cap

In [None]:
def _read_stock_csv(ticker):
    """Read the raw price CSV of one ticker from the stock data folder."""
    return pd.read_csv(f"{path_stock}/{ticker}_stock.csv")


# One raw frame per company; files are named "<TICKER>_stock.csv".
AAPL_df = _read_stock_csv("AAPL")
MSFT_df = _read_stock_csv("MSFT")
GOOGL_df = _read_stock_csv("GOOGL")
NVDA_df = _read_stock_csv("NVDA")
AMZN_df = _read_stock_csv("AMZN")
META_df = _read_stock_csv("META")
TSLA_df = _read_stock_csv("TSLA")
AVGO_df = _read_stock_csv("AVGO")
AMD_df = _read_stock_csv("AMD")
CRM_df = _read_stock_csv("CRM")

## Tech companies stock DataFrame processing
- Remove null rows / leftover header rows
- Engineer some new features
- Rename the columns
- Convert the date column to a datetime type

In [None]:
import pandas as pd

def process_stock_data(df, ticker_symbol):
    """
    Clean one raw stock price table and derive per-ticker features.

    The input frame is never modified: all work happens on a new frame, so the
    cell that calls this function can be re-run without errors.

    Args:
        df (pd.DataFrame): Raw stock data with the columns 'Price' (which holds
            the date), 'Close', 'High', 'Low', 'Open', 'Volume' and, optionally,
            'Ticker'.
        ticker_symbol (str): Stock ticker symbol (e.g., 'AAPL'); appended to
            every output column name except 'date'.

    Returns:
        pd.DataFrame: New frame with a datetime 'date' column, the float columns
        close_/high_/low_/open_/volume_<ticker>, and the derived columns
        delta_price_/avg_price_/price_ratio_/invest_<ticker>.

    Raises:
        KeyError: If one of the required price/volume columns is missing.
        ValueError: If a price/volume value cannot be converted to float.
    """
    numeric_columns = ['Close', 'High', 'Low', 'Open', 'Volume']

    # dropna() and astype() both return new frames, so the caller's data stays intact.
    out = df.dropna().astype({column: float for column in numeric_columns})

    # Derived features (created in this order so the output column order is stable):
    out["delta_price"] = out["High"] - out["Low"]  # Daily high-low range.
    out["avg_price"] = (out["Close"] + out["High"] + out["Low"] + out["Open"]) / 4  # Mean of the four daily prices.
    out["price_ratio"] = out["delta_price"] / out["avg_price"]  # Daily range relative to the average price.
    out["invest"] = out["Volume"] * out["avg_price"]  # Volume times average price (approximate traded value).

    # The raw 'Price' column carries the date; every other column gets the ticker as suffix.
    out = out.rename(columns={
        "Price": "date",
        "Close": f"close_{ticker_symbol}",
        "High": f"high_{ticker_symbol}",
        "Low": f"low_{ticker_symbol}",
        "Open": f"open_{ticker_symbol}",
        "Volume": f"volume_{ticker_symbol}",
        "delta_price": f"delta_price_{ticker_symbol}",
        "avg_price": f"avg_price_{ticker_symbol}",
        "price_ratio": f"price_ratio_{ticker_symbol}",
        "invest": f"invest_{ticker_symbol}",
    })

    out['date'] = pd.to_datetime(out['date'])  # Proper datetimes for time-series handling and merging.

    # The ticker is already encoded in the column names, so the 'Ticker' column (if any) is redundant.
    out = out.drop(columns='Ticker', errors='ignore')

    # Rows were dropped above; hand back a clean 0..n-1 index.
    return out.reset_index(drop=True)

### Tech companies stock clean Data Frame 

In [None]:
# Clean every raw frame with process_stock_data, one ticker at a time.
(
    AAPL_clean_df, MSFT_clean_df, GOOGL_clean_df, NVDA_clean_df, AMZN_clean_df,
    META_clean_df, TSLA_clean_df, AVGO_clean_df, AMD_clean_df, CRM_clean_df,
) = [
    process_stock_data(raw_frame, ticker)
    for ticker, raw_frame in (
        ('AAPL', AAPL_df),
        ('MSFT', MSFT_df),
        ('GOOGL', GOOGL_df),
        ('NVDA', NVDA_df),
        ('AMZN', AMZN_df),
        ('META', META_df),
        ('TSLA', TSLA_df),
        ('AVGO', AVGO_df),
        ('AMD', AMD_df),
        ('CRM', CRM_df),
    )
]

### Find the max and min of the date column in each company's stock DataFrame

In [None]:
# Cleaned frames keyed by ticker.
dataframes = {
    "AAPL": AAPL_clean_df,
    "MSFT": MSFT_clean_df,
    "GOOGL": GOOGL_clean_df,
    "NVDA": NVDA_clean_df,
    "AMZN": AMZN_clean_df,
    "META": META_clean_df,
    "TSLA": TSLA_clean_df,
    "AVGO": AVGO_clean_df,
    "AMD": AMD_clean_df,
    "CRM": CRM_clean_df,
}

# First and last available date per ticker; a frame without a 'date' column only triggers a warning.
stock_data_ranges = {}
for ticker, frame in dataframes.items():
    if 'date' not in frame.columns:
        print(f"Warning: 'date' column not found in {ticker}_clean_df")
        continue
    dates = frame['date']
    stock_data_ranges[ticker] = {'min_date': dates.min(), 'max_date': dates.max()}

# One row per ticker, indexed by the ticker symbol.
date_range_df = pd.DataFrame.from_dict(stock_data_ranges, orient='index').rename_axis('Stock')

print(date_range_df)

In the result above, the META data starts in 2012, while almost all of the other stocks start in 2000.

## Macro Indicators from Yahoo Finance:
- Indices
- Commodities
- Sector ETFs (Proxies)
- Other Market Metrics

In [None]:
# Load the macro indicators, rename 'Date' to 'date' to match the stock frames,
# then parse the column as datetime.
macro_df = pd.read_csv(f"{path_stock}/macro_indicators_full.csv").rename(columns={"Date": "date"})
macro_df['date'] = pd.to_datetime(macro_df['date'])

In [None]:
# Missing values per macro column over the full history.
macro_df.isna().sum()

DataFrame `macro_df` has some missing values, which need to be checked against the time range.

Let's keep only the rows dated after the cutoff stored in `data` (set to '2012-12-30' in the first cell). This is after the start of the META stock data.

In [None]:
# Keep only rows dated strictly after the cutoff in `data`, note the covered span,
# then re-count the missing values inside that window.
macro_after_cutoff = macro_df['date'] > data
macro_df_filter = macro_df.loc[macro_after_cutoff]
macro_filter_dates = macro_df_filter['date']
min_date_macro_df_filter = macro_filter_dates.min()
max_date_macro_df_filter = macro_filter_dates.max()
macro_df_filter.isna().sum()

In [None]:
#macro_df_filter = macro_df_filter.drop('Brent_Crude_Futures',axis=1)

In [None]:
df = macro_df_filter
data_name = 'macro_df_filter'
# 1. Matrix Plot: Visualize the pattern of missingness
# msno.matrix opens its own figure, so a preceding plt.figure() only produced an
# extra empty figure and its size was never applied. The size is passed to
# msno.matrix directly and the title is set on the axes it returns.
ax = msno.matrix(df, figsize=(10, 6))
ax.set_title(f'Missing Value Matrix - {data_name}')
plt.show()

In [None]:
# Drop every row that still has at least one missing macro indicator, then confirm nothing is left missing.
macro_clean_df = macro_df_filter.dropna(axis=0, how='any')
macro_clean_df.isna().sum()

# Fed Data frame

In [None]:
# Load the Fed indicators; the first CSV column arrives as 'Unnamed: 0', so it is
# renamed to 'date' and then parsed as datetime.
fed_df = pd.read_csv(f"{path_fed}/combined_economic_indicators.csv").rename(columns={'Unnamed: 0': 'date'})
fed_df['date'] = pd.to_datetime(fed_df['date'])

In [None]:
# Missing values per Fed column over the full history.
fed_df.isna().sum()

In [None]:
# Same cutoff as for the macro data: keep rows dated strictly after `data`,
# note the covered span, then re-count the missing values.
fed_after_cutoff = fed_df['date'] > data
fed_df_filter = fed_df.loc[fed_after_cutoff]
fed_filter_dates = fed_df_filter['date']
min_date_fed_df_filter = fed_filter_dates.min()
max_date_fed_df_filter = fed_filter_dates.max()
fed_df_filter.isna().sum()

In [None]:
df = fed_df_filter
data_name = 'fed_df_filter'
# 1. Matrix Plot: Visualize the pattern of missingness
# msno.matrix opens its own figure, so a preceding plt.figure() only produced an
# extra empty figure and its size was never applied. The size is passed to
# msno.matrix directly and the title is set on the axes it returns.
ax = msno.matrix(df, figsize=(10, 6))
ax.set_title(f'Missing Value Matrix - {data_name}')
plt.show()

In [None]:
# Fed columns carried forward into the merged data set.
fed_keep_columns = [
    'date', 'cpi', 'fed_rate', 'consumer_confidence', 'vix', 'oil', 'nonfarm_payrolls',
    'treasury_yield', 'industrial_production', 'retail_sales', 'pmi',
    'day_of_week', 'is_holiday', 'is_working_day',
]

# Keep only complete rows of those columns, then confirm nothing is left missing.
fed_clean_df = fed_df_filter.loc[:, fed_keep_columns].dropna()
fed_clean_df.isna().sum()

# Merging the Data Frames

In [None]:
# 1. Combine the ten stock frames.
# AAPL is the base; every other frame is inner-joined on 'date', so only dates
# present for all ten tickers survive.
stock_dfs = [MSFT_clean_df, GOOGL_clean_df, NVDA_clean_df, AMZN_clean_df,
             META_clean_df, TSLA_clean_df, AVGO_clean_df, AMD_clean_df, CRM_clean_df]

merged_stock_data = AAPL_clean_df.copy()
for other_stock in stock_dfs:
    merged_stock_data = merged_stock_data.merge(other_stock, on='date', how='inner')

# 2. Add the macro indicators, then the Fed indicators (both inner joins on 'date').
merged_data = merged_stock_data.merge(macro_clean_df, on='date', how='inner')
final_merged_df = merged_data.merge(fed_clean_df, on='date', how='inner')

# Shorter aliases for the three merge stages.
stock_df, stock_macro_df, stock_macro_fed_df = merged_stock_data, merged_data, final_merged_df

# Top 10 Companies Investment over Time

In [None]:
# Daily invest_<ticker> series of all ten companies, plus the date for the x-axis.
invest_columns = [
    'invest_AAPL', 'invest_MSFT', 'invest_GOOGL', 'invest_NVDA', 'invest_AMZN',
    'invest_META', 'invest_TSLA', 'invest_AVGO', 'invest_AMD', 'invest_CRM',
]
stock_plot = stock_macro_fed_df[['date'] + invest_columns]

fig, ax = plt.subplots(figsize=(12, 6))  # Wide figure so the date axis stays readable

# One line per company; 'date' itself is only used as the x-axis.
for column in stock_plot.columns.drop('date'):
    ax.plot(stock_plot['date'], stock_plot[column], label=column)

ax.set_xlabel('Date')
ax.set_ylabel('Investment Value')
ax.set_title('Investment over Time')
ax.legend(loc='upper left')  # Legend distinguishes the companies
ax.grid(True)
fig.tight_layout()  # Prevent overlapping labels
plt.show()


# Federal Indicators Over Time 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import StandardScaler

fed_indicator_columns = [
    'cpi', 'fed_rate', 'consumer_confidence', 'vix', 'oil',
    'nonfarm_payrolls', 'treasury_yield', 'industrial_production', 'retail_sales', 'pmi'
]

# Build an independent, date-indexed frame in one chain. Assigning into a column
# subset (the previous approach) raises SettingWithCopyWarning. A missing 'date'
# column already fails with KeyError at the selection, so no separate check (or
# kernel-killing exit()) is needed.
fed_date = (
    stock_macro_fed_df[['date'] + fed_indicator_columns]
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)

# --- Assess Data Ranges and Scales ---
print("Data Ranges and Scales:")
for column in fed_date.columns:
    low, high = fed_date[column].min(), fed_date[column].max()
    print(f"- {column}: Range [{low:.2f}, {high:.2f}], Scale: {high - low:.2f}")

# --- Standardize every indicator (mean 0, std 1) so they share one scale ---
scaler = StandardScaler()
fed_date_scaled = pd.DataFrame(
    scaler.fit_transform(fed_date),
    index=fed_date.index,
    columns=fed_date.columns,
)

# --- Plot the raw values first, then the standardized values ---
for frame, y_label, title in (
    (fed_date, 'Value', 'Federal Indicators Over Time (Raw Scale)'),
    (fed_date_scaled, 'Standardized Value (Mean=0, Std=1)', 'Federal Indicators Over Time (Standardized Scale)'),
):
    fig, ax = plt.subplots(figsize=(15, 8))  # Large figure for readability
    for column in frame.columns:
        ax.plot(frame.index, frame[column], label=column)
    ax.set_xlabel('Date')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend(loc='upper left')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

# Macro Indicators Over Time 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import StandardScaler

macro_indicator_columns = [
    'S&P500_Index', 'Dow_Jones_Index', 'NASDAQ_Composite',
    'Russell2000_Index', 'VIX_Index', 'Dollar_Index_DXY', 'Gold_Futures',
    'WTI_Oil_Futures', 'Copper_Futures', 'Brent_Crude_Futures',
    'Tech_Sector_ETF', 'Energy_Sector_ETF', 'Financial_Sector_ETF',
    'ConsumerDiscretionary_ETF', 'Lithium_ETF', 'Semiconductor_ETF',
    'Electricity_Proxy'
]

# Build an independent, date-indexed frame in one chain. Assigning into a column
# subset (the previous approach) raises SettingWithCopyWarning. A missing 'date'
# column already fails with KeyError at the selection, so no separate check (or
# kernel-killing exit()) is needed.
macro_date = (
    stock_macro_fed_df[['date'] + macro_indicator_columns]
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)

# --- Assess Data Ranges and Scales ---
print("Data Ranges and Scales:")
for column in macro_date.columns:
    low, high = macro_date[column].min(), macro_date[column].max()
    print(f"- {column}: Range [{low:.2f}, {high:.2f}], Scale: {high - low:.2f}")

# --- Standardize every indicator (mean 0, std 1) so they share one scale ---
scaler = StandardScaler()
macro_date_scaled = pd.DataFrame(
    scaler.fit_transform(macro_date),
    index=macro_date.index,
    columns=macro_date.columns,
)

# --- Plot the raw values first, then the standardized values ---
for frame, y_label, title in (
    (macro_date, 'Value', 'Macro Indicators Over Time (Raw Scale)'),
    (macro_date_scaled, 'Standardized Value (Mean=0, Std=1)', 'Macro Indicators Over Time (Standardized Scale)'),
):
    fig, ax = plt.subplots(figsize=(20, 10))  # Large figure for readability
    for column in frame.columns:
        ax.plot(frame.index, frame[column], label=column)
    ax.set_xlabel('Date')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend(loc='upper left')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

# Feature Engineering

In [None]:
# -------------------------------------------------------
# 📦 Feature Engineering
# Builds the modelling frame `df` from `stock_macro_fed_df`: investment shares,
# calendar features, differences, rolling statistics, lags and technical
# indicators. Columns are added to `stock_macro_fed_df` itself in steps 1-2 and
# to a date-indexed copy (`df`) from step 3 onwards.
# -------------------------------------------------------

# 🛠️ 1. Investment Total and Individual Stock Investment Ratios
# -------------------------------------------------------

# ➡️ Calculate total daily investment across top 10 tech stocks
#    (sum of the ten invest_<ticker> columns)
stock_macro_fed_df["invest_total"] = (
    stock_macro_fed_df["invest_AAPL"] + stock_macro_fed_df["invest_MSFT"] + stock_macro_fed_df["invest_GOOGL"] +
    stock_macro_fed_df["invest_NVDA"] + stock_macro_fed_df["invest_AMZN"] + stock_macro_fed_df["invest_META"] +
    stock_macro_fed_df["invest_TSLA"] + stock_macro_fed_df["invest_AVGO"] + stock_macro_fed_df["invest_AMD"] +
    stock_macro_fed_df["invest_CRM"]
)

# ➡️ Calculate each company's investment share (ratio) of that daily total
for stock in ['AAPL', 'MSFT', 'GOOGL', 'NVDA', 'AMZN', 'META', 'TSLA', 'AVGO', 'AMD', 'CRM']:
    stock_macro_fed_df[f"invest_{stock}_ratio"] = stock_macro_fed_df[f"invest_{stock}"] / stock_macro_fed_df["invest_total"]

# -------------------------------------------------------
# 🛠️ 2. Time Features Extraction
# -------------------------------------------------------

# ➡️ Extract time-based features from 'date'
# NOTE: this replaces any existing 'day_of_week' column (the Fed selection that
# is merged in earlier includes one) with integer values.
stock_macro_fed_df['day_of_week'] = stock_macro_fed_df['date'].dt.dayofweek        # 0 = Monday, 6 = Sunday
stock_macro_fed_df['month'] = stock_macro_fed_df['date'].dt.month                  # 1 = January, 12 = December
stock_macro_fed_df['week_number'] = stock_macro_fed_df['date'].dt.isocalendar().week  # ISO week number (1-53)
stock_macro_fed_df['is_month_end'] = stock_macro_fed_df['date'].dt.is_month_end.astype(int)  # 1 if the date is the calendar month end (not necessarily the last trading day)

# -------------------------------------------------------
# 🛠️ 3. Set Up DataFrame for Feature Engineering
# -------------------------------------------------------

# ➡️ Make a working copy (from here on only `df` is modified)
df = stock_macro_fed_df.copy()

# ➡️ Ensure 'date' is datetime type and set it as index
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

# -------------------------------------------------------
# 🛠️ 4. First Differencing for Macroeconomic and Indices Variables
# -------------------------------------------------------

# ➡️ Columns related to macroeconomics, indices, ETFs
macro_and_indices_cols = [
    'cpi', 'fed_rate', 'consumer_confidence', 'vix', 'oil', 'nonfarm_payrolls', 'treasury_yield',
    'industrial_production', 'retail_sales', 'pmi',
    'S&P500_Index', 'Dow_Jones_Index', 'NASDAQ_Composite', 'Russell2000_Index', 'VIX_Index',
    'Dollar_Index_DXY', 'Gold_Futures', 'WTI_Oil_Futures', 'Copper_Futures', 'Brent_Crude_Futures',
    'Tech_Sector_ETF', 'Energy_Sector_ETF', 'Financial_Sector_ETF', 'ConsumerDiscretionary_ETF',
    'Lithium_ETF', 'Semiconductor_ETF', 'Electricity_Proxy'
]

# ➡️ Apply first differencing to remove trend (columns missing from df are skipped silently)
for col in macro_and_indices_cols:
    if col in df.columns:
        df[f'{col}_diff'] = df[col].diff()

# -------------------------------------------------------
# 🛠️ 5. Stock Price Feature Engineering
# -------------------------------------------------------

stocks = ['AAPL', 'MSFT', 'GOOGL', 'NVDA', 'AMZN', 'META', 'TSLA', 'AVGO', 'AMD', 'CRM']

for stock in stocks:
    for field in ['close', 'open', 'high', 'low']:
        col = f'{field}_{stock}'
        if col in df.columns:
            # ➡️ Differencing for stationarity
            df[f'{col}_diff'] = df[col].diff()
            # ➡️ Rolling mean and std dev over 5-row and 20-row windows
            df[f'{col}_rolling_mean_5'] = df[col].rolling(window=5).mean()
            df[f'{col}_rolling_std_5'] = df[col].rolling(window=5).std()
            df[f'{col}_rolling_mean_20'] = df[col].rolling(window=20).mean()
            df[f'{col}_rolling_std_20'] = df[col].rolling(window=20).std()
            # ➡️ Create lag features (value 1, 3, 5 and 10 rows earlier)
            for lag in [1, 3, 5, 10]:
                df[f'{col}_lag_{lag}'] = df[col].shift(lag)

    # ➡️ Technical Indicators (RSI and MACD) computed with pandas_ta on the close price
    close_col = f'close_{stock}'
    if close_col in df.columns:
        df[f'{stock}_RSI'] = ta.rsi(df[close_col], length=14)
        macd = ta.macd(df[close_col])
        # The MACD columns are only added when ta.macd returned a result.
        if macd is not None:
            df[f'{stock}_MACD'] = macd['MACD_12_26_9']
            df[f'{stock}_MACD_signal'] = macd['MACDs_12_26_9']
            df[f'{stock}_MACD_hist'] = macd['MACDh_12_26_9']

    # ➡️ Volume-based Features
    # NOTE: '<volume>_diff' is the difference of the log1p-transformed volume,
    # not of the raw volume.
    vol_col = f'volume_{stock}'
    if vol_col in df.columns:
        df[f'{vol_col}_log'] = np.log1p(df[vol_col])
        df[f'{vol_col}_diff'] = df[f'{vol_col}_log'].diff()

    # ➡️ Price Delta, Avg, Ratio, Investment Differencing
    for suffix in ['delta_price', 'avg_price', 'price_ratio', 'invest']:
        derived_col = f'{suffix}_{stock}'
        if derived_col in df.columns:
            df[f'{derived_col}_diff'] = df[derived_col].diff()

# -------------------------------------------------------
# 🛠️ 6. Portfolio Level Investment Features
# -------------------------------------------------------

portfolio_cols = [
    'invest_total', 'invest_AAPL_ratio', 'invest_MSFT_ratio', 'invest_GOOGL_ratio',
    'invest_NVDA_ratio', 'invest_AMZN_ratio', 'invest_META_ratio',
    'invest_TSLA_ratio', 'invest_AVGO_ratio', 'invest_AMD_ratio', 'invest_CRM_ratio'
]

# ➡️ Apply first differencing to the investment total and the investment ratios
for col in portfolio_cols:
    if col in df.columns:
        df[f'{col}_diff'] = df[col].diff()

# -------------------------------------------------------
# 🛠️ 7. Additional Time and Investment Trend Features
# -------------------------------------------------------


# ➡️ Quarter, Year, Quarter-End, Year-End flags
#    ('month' was already created in step 2 and is recomputed here from the index)
df['quarter'] = df.index.quarter
df['year'] = df.index.year
df['month'] = df.index.month
df['is_quarter_end'] = df.index.is_quarter_end.astype(int)
df['is_year_end'] = df.index.is_year_end.astype(int)

#(Optional) Standardize Day of Week
# If 'day_of_week' is a string like "Monday", map to numeric
# NOTE(review): 'day_of_week' was overwritten in step 2 with integer values from
# dt.dayofweek, so this object-dtype branch looks unreachable in this notebook.
day_mapping = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2,
               'Thursday': 3, 'Friday': 4}
if df['day_of_week'].dtype == 'object':
    df['day_of_week_num'] = df['day_of_week'].map(day_mapping)


# ➡️ Lagged and Rolling Investment Total
#    ('invest_total_diff' was already created in step 6; it is recomputed here)
df['invest_total_lag_1'] = df['invest_total'].shift(1)
df['invest_total_diff'] = df['invest_total'].diff()
df['invest_total_rolling_mean_5'] = df['invest_total'].rolling(window=5).mean()

# -------------------------------------------------------
# 🧹 8. Final Cleaning Step
# -------------------------------------------------------

# ➡️ Drop rows with NaN values caused by differencing, rolling, and shifting
#    (any row with a NaN in any column is removed)
df.dropna(inplace=True)


# Correlation Matrix

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Close prices, macro indicators and Fed indicators whose pairwise correlation is inspected.
check_corr_df = df[[
    'close_AAPL', 'close_MSFT', 'close_GOOGL', 'close_NVDA', 'close_AMZN',
    'close_META', 'close_TSLA', 'close_AVGO', 'close_AMD', 'close_CRM',
    'S&P500_Index', 'Dow_Jones_Index', 'NASDAQ_Composite',
    'Russell2000_Index', 'VIX_Index', 'Dollar_Index_DXY', 'Gold_Futures',
    'WTI_Oil_Futures', 'Copper_Futures', 'Brent_Crude_Futures',
    'Tech_Sector_ETF', 'Energy_Sector_ETF', 'Financial_Sector_ETF',
    'ConsumerDiscretionary_ETF', 'Lithium_ETF', 'Semiconductor_ETF',
    'Electricity_Proxy',
    'cpi', 'fed_rate', 'consumer_confidence', 'vix', 'oil',
    'nonfarm_payrolls', 'treasury_yield', 'industrial_production',
    'retail_sales', 'pmi'
]]

corr_df = check_corr_df.corr()

# 1. Upper-triangle mask (diagonal included) for corr_df.
# Kept as a module-level name for any cell that still reads it; the function
# below no longer depends on it.
mask = np.triu(np.ones_like(corr_df, dtype=bool))

# 2. Lower-triangle heatmap plus a styled table of the correlation matrix
def styled_lower_triangle_heatmap(df, title="Correlation Matrix"):
    """
    Draw a lower-triangle heatmap of a correlation matrix and return a styled table.

    The upper-triangle mask is derived from `df` itself, so the function works
    for a correlation matrix of any size instead of relying on a global mask.

    Args:
        df (pd.DataFrame): The (square) correlation matrix.
        title (str, optional): The title of the heatmap. Defaults to "Correlation Matrix".

    Returns:
        pandas Styler: The full correlation matrix formatted to two decimals with
        a diverging colour gradient. Only the heatmap hides the upper triangle.
    """
    # True marks the cells to hide: the diagonal and everything above it.
    upper_triangle = np.triu(np.ones_like(df, dtype=bool))
    palette = sns.diverging_palette(220, 10, as_cmap=True)  # Diverging palette for signed correlations

    styled_df = df.style.format(precision=2).background_gradient(
        cmap=palette,
        axis=None,  # One colour scale across the whole table
    ).set_properties(**{'font-size': '8pt'})  # Adjust font size as needed

    plt.figure(figsize=(20, 20))  # Adjust figure size as needed
    sns.heatmap(
        df,
        annot=True,  # Display correlation values in the heatmap
        fmt=".2f",   # Format of the annotations
        cmap=palette,
        mask=upper_triangle,  # Hide the upper triangle
        cbar=True,  # Show the color bar
        square=True,  # Ensure the heatmap cells are square
    )
    plt.title(title, fontsize=16)
    plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for readability
    plt.yticks(rotation=0)
    plt.show()

    return styled_df

styled_corr_df = styled_lower_triangle_heatmap(corr_df, title="Lower Triangle Correlation Matrix")

# 3. Show the styled table. It contains the full matrix, so the heading says so.
print("\nStyled Correlation Matrix (full matrix):\n")
display(styled_corr_df)  # Use display() in a notebook environment

# 📦 Advanced Time Series Decomposition (Trend, Seasonality, Residuals)

In [None]:
# -------------------------------------------------------
# 📦 Advanced Time Series Decomposition: Trend, Seasonality, Residual
# -------------------------------------------------------

from statsmodels.tsa.seasonal import STL

# Series to split into trend / seasonal / residual parts (extend as needed).
columns_to_decompose = [
    'close_AAPL', 'close_MSFT', 'close_GOOGL', 'close_NVDA', 'close_AMZN',
    'close_META', 'close_TSLA', 'close_AVGO', 'close_AMD', 'close_CRM',
    'S&P500_Index', 'Dow_Jones_Index', 'NASDAQ_Composite',
    'Russell2000_Index', 'VIX_Index', 'Dollar_Index_DXY', 'Gold_Futures',
    'WTI_Oil_Futures', 'Copper_Futures', 'Brent_Crude_Futures',
    'Tech_Sector_ETF', 'Energy_Sector_ETF', 'Financial_Sector_ETF',
    'ConsumerDiscretionary_ETF', 'Lithium_ETF', 'Semiconductor_ETF',
    'Electricity_Proxy',
    'cpi', 'fed_rate', 'consumer_confidence', 'vix', 'oil',
    'nonfarm_payrolls', 'treasury_yield', 'industrial_production',
    'retail_sales', 'pmi'
]

# Seasonal period in rows: 252 trading days is roughly one year.
seasonal_period = 252

for col in columns_to_decompose:
    if col not in df.columns:
        print(f"⚠️ Column not found: {col}")
        continue

    try:
        result = STL(df[col].dropna(), period=seasonal_period).fit()

        # Store each component next to the original series as '<col>_<component>'.
        for component, values in (
            ('trend', result.trend),
            ('seasonal', result.seasonal),
            ('residual', result.resid),
        ):
            df[f'{col}_{component}'] = values

        print(f"✅ Decomposed: {col}")

    except Exception as e:
        # A failure on one series is reported and must not stop the remaining ones.
        print(f"❌ Error decomposing {col}: {e}")

# Safety net: remove any row that ended up with a NaN in any column.
# (NOTE(review): STL is not expected to introduce NaNs itself; confirm this step is still needed.)
df.dropna(inplace=True)
# (Because decomposition will create NaNs at the start and end due to windowing)

## 📊 STL Decomposition Summary

This section decomposes the selected time series into three components using **STL (Seasonal-Trend Decomposition using Loess)**:

**Formula:**

$$
\text{Original}_t = \text{Trend}_t + \text{Seasonality}_t + \text{Residual}_t
$$

- **Original**: The raw time series (e.g., `close_AAPL`)
- **Trend**: Long-term smooth movement (extracted via Loess)
- **Seasonality**: Repeating patterns (e.g., yearly cycles, period = 252 trading days)
- **Residual**: Leftover noise after removing trend and seasonality

Each plot visualizes:
1. Original time series
2. Trend component
3. Seasonal component
4. Residual (random noise)

This decomposition helps us:
- Understand underlying structure
- Remove noise or seasonality before modeling
- Check if residuals are stationary for forecasting



In [None]:
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import STL

# Subset of series to decompose again and plot (add more as needed).
columns_to_decompose = [
    'close_AAPL', 'close_MSFT', 'close_GOOGL',
    'S&P500_Index', 'Gold_Futures', 'cpi',
]

# Seasonal period in rows: 252 trading days is roughly one year.
seasonal_period = 252

# One subplot per entry: (y-axis label, legend label, line colour, column suffix).
panel_specs = [
    ('Original', 'Original', 'blue', ''),
    ('Trend', 'Trend', 'green', '_trend'),
    ('Seasonal', 'Seasonality', 'orange', '_seasonal'),
    ('Residual', 'Residual', 'red', '_residual'),
]

for col in columns_to_decompose:
    if col not in df.columns:
        print(f"⚠️ Column not found: {col}")
        continue

    try:
        result = STL(df[col].dropna(), period=seasonal_period).fit()

        # Store (or refresh) the three components next to the original series.
        df[f'{col}_trend'] = result.trend
        df[f'{col}_seasonal'] = result.seasonal
        df[f'{col}_residual'] = result.resid

        print(f"✅ Decomposed and plotting: {col}")

        # Four stacked panels sharing the date axis.
        fig, axes = plt.subplots(4, 1, figsize=(12, 10), sharex=True)
        fig.suptitle(f"Decomposition of {col}", fontsize=16)

        for ax, (y_label, legend_label, colour, suffix) in zip(axes, panel_specs):
            ax.plot(df.index, df[f'{col}{suffix}'], label=legend_label, color=colour)
            ax.set_ylabel(y_label)
            ax.legend()

        axes[-1].set_xlabel('Date')
        plt.tight_layout()
        plt.show()

    except Exception as e:
        # A failure on one series is reported and must not stop the remaining ones.
        print(f"❌ Error decomposing {col}: {e}")


# Stationarity Testing (ADF + KPSS) for Time Series Modeling

In [None]:
# -------------------------------------------------------
# 📦 Double Stationarity Test: ADF + KPSS for All Columns
# -------------------------------------------------------

from statsmodels.tsa.stattools import adfuller, kpss

# --------------------------
# 🛠️ Parameters
# --------------------------

# ➡️ Columns to test
# Every numeric column of df is tested.
columns_to_test = df.select_dtypes(include='number').columns.tolist()

# One result record per successfully tested column.
stationarity_results = []

for col in columns_to_test:
    try:
        series = df[col].dropna()

        # ADF: null hypothesis = unit root, so p < 0.05 counts as stationary.
        adf_pvalue = adfuller(series, autolag='AIC')[1]
        adf_stationary = adf_pvalue < 0.05

        # KPSS: null hypothesis = stationary, so p > 0.05 counts as stationary.
        kpss_pvalue = kpss(series, regression='c', nlags="auto")[1]
        kpss_stationary = kpss_pvalue > 0.05

        stationarity_results.append({
            'Feature': col,
            'ADF p-value': round(adf_pvalue, 5),
            'ADF Stationary': adf_stationary,
            'KPSS p-value': round(kpss_pvalue, 5),
            'KPSS Stationary': kpss_stationary,
            # A column counts as stationary only when both tests agree.
            'is_Stationary': bool(adf_stationary and kpss_stationary),
        })

    except Exception as e:
        # Columns that cannot be tested are reported and left out of the results.
        print(f"⚠️ Error testing {col}: {e}")

stationarity_df = pd.DataFrame(stationarity_results)

# Stationary features first.
stationarity_df.sort_values('is_Stationary', ascending=False, ignore_index=True)



In [None]:
# How many tested features are stationary (True) vs. not (False) before any fix.
stationarity_df['is_Stationary'].value_counts()

# Advanced Stationarity Fix: Second Diff, Residuals, Log Returns

In [None]:
# -------------------------------------------------------
# 🧠 Advanced Stationarity Fix: Second Diff, Residuals, Log Returns
# -------------------------------------------------------

from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.tsa.seasonal import STL
import numpy as np

# -----------------------------------------------
# 1. Detect non-stationary features from previous results
# -----------------------------------------------

# Names of the features that failed the combined ADF + KPSS check above.
failed_check = stationarity_df['is_Stationary'].eq(False)
non_stationary_cols = stationarity_df.loc[failed_check, 'Feature'].tolist()

# Collects one result record per re-tested feature.
fixed_results = []

# -----------------------------------------------
# 2. Define Helper: Run ADF + KPSS and interpret
# -----------------------------------------------

def test_stationarity(series):
    try:
        adf_p = adfuller(series.dropna())[1]
    except:
        adf_p = np.nan
    try:
        kpss_p = kpss(series.dropna(), regression='c', nlags="auto")[1]
    except:
        kpss_p = np.nan
    adf_stat = adf_p < 0.05 if not np.isnan(adf_p) else False
    kpss_stat = kpss_p > 0.05 if not np.isnan(kpss_p) else False
    conclusion = True if adf_stat and kpss_stat else False
    return adf_p, adf_stat, kpss_p, kpss_stat, conclusion

# -----------------------------------------------
# 3. Loop through each non-stationary feature and fix
# -----------------------------------------------

for col in non_stationary_cols:
    try:
        series = df[col].dropna()

        # Strictly positive series get log returns; everything else is differenced twice.
        if (series > 0).all():
            transformed, method = np.log(series / series.shift(1)), "log_return"
        else:
            transformed, method = series.diff().diff(), "second_diff"

        # Fall back to the STL residual when the transform is degenerate:
        # zero variance, or more than half of its values missing.
        is_degenerate = transformed.dropna().std() == 0 or transformed.isna().mean() > 0.5
        if is_degenerate:
            transformed, method = STL(series, period=252).fit().resid, "stl_residual"

        # Re-run both stationarity tests on the transformed series.
        adf_p, adf_ok, kpss_p, kpss_ok, conclusion = test_stationarity(transformed)

        fixed_results.append({
            'Feature': col,
            'Fix Method': method,
            'ADF p-value': None if adf_p is None else round(adf_p, 5),
            'ADF Stationary': adf_ok,
            'KPSS p-value': None if kpss_p is None else round(kpss_p, 5),
            'KPSS Stationary': kpss_ok,
            'is_Stationary': conclusion
        })

        # Keep the transformed series in df as '<col>_<method>'.
        df[f'{col}_{method}'] = transformed

    except Exception as e:
        # A failure on one feature is reported and must not stop the remaining ones.
        print(f"⚠️ Error processing {col}: {e}")

# -----------------------------------------------
# 4. Display Fixed Result Summary
# -----------------------------------------------

fixed_stationarity_df = pd.DataFrame(fixed_results)
fixed_stationarity_df.sort_values('is_Stationary', ascending=False, ignore_index=True)


In [None]:
# Stationary vs. non-stationary counts from the first test round (before the fixes), for comparison.
stationarity_df['is_Stationary'].value_counts()

In [None]:
# Stationary vs. non-stationary counts among the re-tested (transformed) features.
fixed_stationarity_df['is_Stationary'].value_counts()

In [None]:
# Save the processed, date-indexed frame to CSV.
# The path is defined once so the confirmation message always reports the file
# that was actually written.
output_path = "../../data/processed_combined_data.csv"
df.to_csv(output_path)
print(f"✅ Saved processed combined data to '{output_path}'")