<a href="https://colab.research.google.com/github/henryonomakpo/The-Impact-of-ESG-Ratings-on-EV-Manufacturing-Industry/blob/main/E_commerce_ESG_Influence_on_Stock_Returns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# yfinance, statsmodels, pandas, numpy, scikit-learn, xlsxwriter, linearmodels
!pip install yesg
!pip install yfinance
!pip install statsmodels
!pip install pandas
!pip install numpy
!pip install scikit-learn
!pip install xlsxwriter
!pip install linearmodels

Collecting yesg
  Downloading yesg-2.1.1.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: yesg
  Building wheel for yesg (setup.py) ... [?25l[?25hdone
  Created wheel for yesg: filename=yesg-2.1.1-py3-none-any.whl size=6105 sha256=43af6485ba679a75e078af4ae81e484e9ecb5475423e237318949f12746cd1b2
  Stored in directory: /root/.cache/pip/wheels/78/8d/48/f5e8ff0315a46301e15c68371e297b460b33e1c846117725bc
Successfully built yesg
Installing collected packages: yesg
Successfully installed yesg-2.1.1
Collecting xlsxwriter
  Downloading XlsxWriter-3.2.3-py3-none-any.whl.metadata (2.7 kB)
Downloading XlsxWriter-3.2.3-py3-none-any.whl (169 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.4/169.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.3
Collecting linearmodels
  Downloading linearmodels-6.1-cp311-cp311-manylinux_2_17_x86_64.m

### Fetch ESG dataset and save

In [9]:
# Required libraries: yesg, pandas
# Optional for Google Drive: google.colab
# !pip install yesg pandas

import yesg
import pandas as pd
import time # To add delays between API calls

# The Drive helper can only be imported inside Google Colab; remember whether it worked.
try:
    from google.colab import drive
except ImportError:
    google_colab_available = False
    print("Google Colab environment not detected. Will save CSV locally.")
else:
    google_colab_available = True

print("--- ESG Data Fetching Script for E-commerce Firms ---")

# --- Configuration ---

# E-commerce firms whose historic ESG scores are requested
TICKERS_ECOMMERCE = [
    'AMZN',    # Amazon
    'BABA',    # Alibaba (NYSE)
    'JD',      # JD.com
    'EBAY',    # eBay
    'WMT',     # Walmart
    'SE',      # Sea Limited (Shopee)
    'MELI',    # MercadoLibre
    'PDD',     # PDD Holdings
    'ETSY',    # Etsy
    'ZAL.DE',  # Zalando
    'ALE.WA',  # Allegro
    'TGT',     # Target
    '4755.T',  # Rakuten (Tokyo Stock Exchange)
]

# Output locations: Google Drive when it can be mounted, otherwise the current directory
OUTPUT_FILENAME = 'historic_esg_scores_ecommerce.csv'
DRIVE_MOUNT_PATH = '/content/drive'
OUTPUT_PATH_DRIVE = '/'.join([DRIVE_MOUNT_PATH, 'My Drive', OUTPUT_FILENAME])  # Standard Google Drive path
OUTPUT_PATH_LOCAL = OUTPUT_FILENAME

# Pause (seconds) before each API call to avoid potential blocking
API_DELAY = 0.6

# --- Mount Google Drive (if in Colab) ---
# Start from the local fallback and switch to Drive only after a successful mount.
drive_mounted = False
save_path = OUTPUT_PATH_LOCAL
if google_colab_available:
    try:
        print(f"\nAttempting to mount Google Drive at {DRIVE_MOUNT_PATH}...")
        drive.mount(DRIVE_MOUNT_PATH)
    except Exception as e:
        print(f"Failed to mount Google Drive: {e}")
        print(f"Output CSV will be saved locally as '{OUTPUT_PATH_LOCAL}'.")
    else:
        drive_mounted = True
        print("Google Drive mounted successfully.")
        save_path = OUTPUT_PATH_DRIVE

# --- Data Fetching Loop ---
print(f"\nTickers to fetch ESG data for: {TICKERS_ECOMMERCE}")
print("Starting ESG data download loop...")
print("WARNING: 'yesg' library relies on Yahoo Finance and may be outdated or have limited data coverage.")

# Per-ticker frames plus bookkeeping of which tickers worked and which did not
all_esg_data_list, successful_tickers, failed_tickers = [], [], []

for ticker in TICKERS_ECOMMERCE:
    print(f"  -> Processing: {ticker}")
    try:
        # Throttle first, then request the full ESG history for this ticker
        time.sleep(API_DELAY)
        esg_scores_df = yesg.get_historic_esg(ticker)

        # Anything other than a non-empty DataFrame counts as "no data"
        if not isinstance(esg_scores_df, pd.DataFrame) or esg_scores_df.empty:
            print(f"    -> No valid ESG data found/returned for {ticker}")
            failed_tickers.append(ticker)
            continue

        # Tag rows with their ticker and turn the date index into an ordinary column
        esg_scores_df = esg_scores_df.assign(Ticker=ticker).reset_index()
        all_esg_data_list.append(esg_scores_df)
        successful_tickers.append(ticker)
        print(f"    -> Success: Found {len(esg_scores_df)} ESG data points for {ticker}.")
    except Exception as e:
        # Best effort: one failing ticker must not stop the remaining downloads
        print(f"    -> ERROR fetching/processing ESG data for {ticker}: {e}")
        failed_tickers.append(ticker)

# --- Combine and Save Data ---
if all_esg_data_list:
    print("\nCombining collected ESG data...")
    # Stack the per-ticker frames into one long table
    final_esg_data = pd.concat(all_esg_data_list, ignore_index=True)

    # Standardize the date column name (it's often 'Date' or 'index' after reset_index)
    if 'index' in final_esg_data.columns and 'Date' not in final_esg_data.columns:
        final_esg_data = final_esg_data.rename(columns={'index': 'Date'})
    elif 'Date' not in final_esg_data.columns:
        print("Warning: Could not identify the primary date column after fetching. Please inspect the output.")

    # Attempt to convert Date column to datetime objects for consistency
    if 'Date' in final_esg_data.columns:
        try:
            final_esg_data['Date'] = pd.to_datetime(final_esg_data['Date'])
            print("  -> Date column converted to datetime.")
        except Exception as e:
            print(f"Warning: Could not convert 'Date' column to datetime format: {e}")

    # Display first few rows and info of the final DataFrame
    print("\nPreview of combined ESG data:")
    print(final_esg_data.head())
    print("\nData Info:")
    final_esg_data.info()

    # Save the combined data. The analysis cell further down reads the CSV by its bare
    # filename from the working directory, so when the primary target is Google Drive a
    # local copy is written as well; otherwise a successful Drive mount would leave the
    # next cell without its input file.
    save_targets = [save_path]
    if save_path != OUTPUT_PATH_LOCAL:
        save_targets.append(OUTPUT_PATH_LOCAL)
    for target_path in save_targets:
        try:
            print(f"\nSaving ESG data to: {target_path} ...")
            final_esg_data.to_csv(target_path, index=False)
            print("ESG data saved successfully.")
        except Exception as e:
            # Report and continue so a failure on one target does not prevent the other
            print(f"\nERROR saving ESG data to CSV at '{target_path}': {e}")

else:
    # Message if no data was collected at all
    print("\nNo ESG data was successfully collected for any ticker. No CSV file created.")

# --- Final Summary ---
print("\n--- ESG Fetching Summary ---")
print(f"Successfully fetched ESG for ({len(successful_tickers)} tickers): {successful_tickers}")
print(f"Failed or no ESG data for ({len(failed_tickers)} tickers): {failed_tickers}")
print("--- Script Finished ---")

--- ESG Data Fetching Script for E-commerce Firms ---

Attempting to mount Google Drive at /content/drive...
Failed to mount Google Drive: Error: credential propagation was unsuccessful
Output CSV will be saved locally as 'historic_esg_scores_ecommerce.csv'.

Tickers to fetch ESG data for: ['AMZN', 'BABA', 'JD', 'EBAY', 'WMT', 'SE', 'MELI', 'PDD', 'ETSY', 'ZAL.DE', 'ALE.WA', 'TGT', '4755.T']
Starting ESG data download loop...
  -> Processing: AMZN
    -> Success: Found 128 ESG data points for AMZN.
  -> Processing: BABA
    -> Success: Found 128 ESG data points for BABA.
  -> Processing: JD
    -> Success: Found 98 ESG data points for JD.
  -> Processing: EBAY
    -> Success: Found 128 ESG data points for EBAY.
  -> Processing: WMT
    -> Success: Found 128 ESG data points for WMT.
  -> Processing: SE
    -> Success: Found 6 ESG data points for SE.
  -> Processing: MELI
    -> Success: Found 7 ESG data points for MELI.
  -> Processing: PDD
    -> Success: Found 7 ESG data points for PD

### Improved Model

### Robust Model
### Data Merging & Prep (Step 4): Carefully merges returns, factors, and loaded/lagged ESG data. It includes logic for forward-filling ESG scores before lagging and handles potential NaNs (either via imputation if IMPUTE_DATA=True or by dropping rows).

### VIF (Step 5): Checks multicollinearity on the final prepared panel data.

### Panel Models (Step 6): Runs Pooled OLS, RE, FE (Entity), and FE (Two-Way) using appropriate formulas based on available variables after VIF checks.

### Specification Tests (Step 7): Performs Hausman and F-tests to compare models.



### Saving Results (Step 9): Saves all relevant outputs (panel summaries, tests, VIF, ML metrics, importance, predictions) to an Excel file.



In [13]:
# !pip install yfinance statsmodels pandas numpy linearmodels

# --- Core Libraries ---
import pandas as pd
import numpy as np
import yfinance as yf
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from linearmodels.panel import PanelOLS, RandomEffects
from linearmodels.panel import compare as model_compare
from linearmodels.panel.results import PanelEffectsResults, RandomEffectsResults
import warnings
import sys
import re
import time
import traceback # For detailed error logging if needed
from datetime import datetime
from dateutil.relativedelta import relativedelta

# --- MICE Imputation Libraries (Kept) ---
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge # Needed for IterativeImputer estimator

# --- Removed Plotting Libraries ---

# --- Settings and Configuration ---
# Silence the warning categories and messages listed below, registered in a fixed order.
for _category in (FutureWarning, UserWarning):
    warnings.filterwarnings("ignore", category=_category)

from statsmodels.tools.sm_exceptions import (ValueWarning, ConvergenceWarning,
                                             HessianInversionWarning, PerfectSeparationWarning,
                                             CollinearityWarning, PerfectSeparationError)
for _category in (ValueWarning, ConvergenceWarning, HessianInversionWarning,
                  PerfectSeparationWarning, CollinearityWarning):
    warnings.simplefilter('ignore', _category)

warnings.filterwarnings("ignore", message="Variables are collinear")
for _message in ("divide by zero encountered in scalar divide",
                 "invalid value encountered in scalar divide",
                 "invalid value encountered in divide"):
    warnings.filterwarnings("ignore", category=RuntimeWarning, message=_message)

from linearmodels.panel.utility import AbsorbingEffectWarning
for _category in (AbsorbingEffectWarning, pd.errors.PerformanceWarning):
    warnings.filterwarnings("ignore", category=_category)
del _category, _message  # loop helpers only; keep the notebook namespace unchanged

# pandas display options used for printed tables
for _option, _value in (('display.width', 140),
                        ('display.max_columns', 18),
                        ('display.float_format', '{:.4f}'.format)):
    pd.set_option(_option, _value)
del _option, _value

# --- Define Tickers for E-commerce Firms ---
TICKERS_ECOMMERCE = [
    'AMZN', 'BABA', 'JD', 'EBAY', 'WMT', 'SE', 'MELI',
    'PDD', 'ETSY', 'ZAL.DE', 'ALE.WA', 'TGT', '4755.T',
]
# Optional ticker -> display-name mapping
TICKER_NAMES = dict(zip(
    TICKERS_ECOMMERCE,
    ['Amazon', 'Alibaba', 'JD.com', 'eBay', 'Walmart', 'Sea Ltd', 'MercadoLibre',
     'PDD Holdings', 'Etsy', 'Zalando', 'Allegro', 'Target', 'Rakuten'],
))

# --- Define ESG Risk Categories (Time-Invariant) ---
# Written per risk level and flattened into the ticker -> level lookup.
esg_risk_categories = {
    member: level
    for level, members in (
        ('High', ('BABA', 'SE', 'PDD', 'ZAL.DE', 'ALE.WA', '4755.T')),
        ('Middle', ('AMZN', 'EBAY', 'WMT', 'TGT', 'JD', 'MELI')),
        ('Low', ('ETSY',)),
    )
    for member in members
}

# --- Define Date Range ---
# Price window and analysis window (ISO date strings)
START_DATE_PRICES, END_DATE_PRICES = "2019-01-01", "2024-12-31"
START_DATE_ANALYSIS, END_DATE_ANALYSIS = "2020-01-01", "2024-12-31"

# --- File Paths ---
FF_FACTORS_PATH = "gd_Developed_5_Factors.csv"
ESG_DATA_PATH = "historic_esg_scores_ecommerce.csv"

# --- Parameters ---
ESG_LAG_MONTHS = 1
VIF_THRESHOLD = 10
IMPUTE_DATA = True
RUN_WITHOUT_IMPUTATION_SENSITIVITY = True

# --- Version Control & Script Info ---
SCRIPT_VERSION = "Panel Only v13 - Final Check & Error Handling" # Updated version again
print(
    f"--- E-commerce ESG Impact Analysis Script Started ({SCRIPT_VERSION}) ---",
    f"Tickers: {TICKERS_ECOMMERCE}",
    f"Analysis Period: {START_DATE_ANALYSIS} to {END_DATE_ANALYSIS}",
    f"ESG Lag: {ESG_LAG_MONTHS} months",
    f"Imputation Enabled (Main Run - MICE): {IMPUTE_DATA}",
    f"Run Sensitivity without Imputation: {RUN_WITHOUT_IMPUTATION_SENSITIVITY}",
    f"Factors Path: {FF_FACTORS_PATH} (Source/Quality Not Verified by Script)",
    f"ESG Data Path: {ESG_DATA_PATH} (Source/Quality Not Verified by Script)",
    sep="\n",
)


# --- Advanced Imputation Function (MICE-like using IterativeImputer) ---
def advanced_imputation(df_input):
    """
    Fill missing values in the numeric columns of a DataFrame with scikit-learn's
    IterativeImputer (a MICE-like scheme) using BayesianRidge as the estimator.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame to impute. It is not modified.

    Returns
    -------
    pandas.DataFrame
        On success, a new frame with the same index and column order as `df_input`:
        imputable numeric columns are filled, numeric columns that are entirely NaN
        stay NaN, and non-numeric columns are carried over unchanged.
        When nothing is imputed (no numeric columns, no missing values, only all-NaN
        numeric columns, or the imputer raised) the very same object `df_input` is
        returned, so a caller can detect a skipped or failed run with an `is` check.
    """
    numeric_cols = df_input.select_dtypes(include=np.number).columns.tolist()
    non_numeric_cols = df_input.select_dtypes(exclude=np.number).columns.tolist()
    if not numeric_cols:
        print("  -> Imputation: No numeric columns found.")
        return df_input

    df_numeric = df_input[numeric_cols].copy()
    df_non_numeric = df_input[non_numeric_cols].copy()
    if df_numeric.isnull().sum().sum() == 0:
        print("  -> Imputation: No missing values detected.")
        return df_input

    print(f"  -> Imputation: Attempting Iterative Imputation (MICE) for {len(numeric_cols)} numeric columns.")

    # Columns without a single observation are set aside and re-attached as NaN afterwards.
    all_nan_cols = df_numeric.columns[df_numeric.isnull().all()].tolist()
    if all_nan_cols:
        print(f"    -> Warning: All-NaN columns cannot be imputed: {all_nan_cols}")
        df_numeric_imputable = df_numeric.drop(columns=all_nan_cols)
    else:
        df_numeric_imputable = df_numeric
    numeric_cols_imputable = df_numeric_imputable.columns.tolist()
    if not numeric_cols_imputable:
        print("    -> Error: No imputable numeric columns remain.")
        return df_input

    # Use at most 5 predictor columns per imputed feature (1 when only one column exists).
    n_features = len(numeric_cols_imputable)
    n_neighbors = min(5, n_features - 1) if n_features > 1 else 1

    imputer = IterativeImputer(estimator=BayesianRidge(), max_iter=10, random_state=42, tol=1e-3,
                               n_nearest_features=n_neighbors, verbose=0, imputation_order='ascending')
    try:
        imputed_values = imputer.fit_transform(df_numeric_imputable)
        df_imputed_numeric = pd.DataFrame(imputed_values, columns=numeric_cols_imputable,
                                          index=df_numeric_imputable.index)
        for col in all_nan_cols:
            df_imputed_numeric[col] = np.nan

        # Re-assemble in the caller's original column order.
        df_out = pd.concat([df_imputed_numeric, df_non_numeric], axis=1)
        df_out = df_out[df_input.columns]
        for col in non_numeric_cols:
            if col in df_out.columns:
                try:
                    df_out[col] = df_out[col].astype(df_input[col].dtype)
                except Exception as type_err:
                    print(f"    -> Warning: Restore dtype failed '{col}': {type_err}")

        print("  -> Imputation: MICE imputation completed.")
        remaining_nan_count = df_out[numeric_cols].isnull().sum().sum()
        if remaining_nan_count > 0:
            print(f"  -> !!! WARNING: {remaining_nan_count} NaNs remain post-imputation. !!!")
        return df_out
    except ValueError as ve:
        print(f"  -> Imputation ERROR (ValueError): {ve}. Check sparse data.")
        return df_input
    except Exception as e:
        print(f"  -> Imputation ERROR (General): {e}.")
        traceback.print_exc(limit=2)
        return df_input

# ==============================================================================
# --- Step 1: Download Stock Returns ---
# ==============================================================================
print("\n--- 1. Downloading and Preparing Stock Returns ---")
stock_monthly_returns = pd.DataFrame(); tickers_available_yf = []


def _extract_price_column(ticker_frame, ticker):
    """Return a one-column price frame named `ticker`, or None when no usable price exists.

    'Adj Close' is preferred; 'Close' is used (with a warning) when 'Adj Close' is
    missing or entirely NaN.
    """
    for price_col in ('Adj Close', 'Close'):
        if price_col not in ticker_frame.columns:
            continue
        prices = ticker_frame[[price_col]].copy()
        if prices.empty or prices[price_col].isnull().all():
            continue
        if price_col == 'Close':
            print(f"  -> Warning: Using 'Close' for {ticker}.")
        prices.columns = [ticker]
        return prices
    print(f"  -> Warning: No valid price for {ticker}. Skipping.")
    return None


try:
    tickers_to_download = TICKERS_ECOMMERCE.copy()
    # yfinance treats `end` as exclusive; ask for one extra day so END_DATE_PRICES itself is included.
    download_end = (pd.to_datetime(END_DATE_PRICES) + pd.Timedelta(days=1)).strftime("%Y-%m-%d")
    all_stock_data = yf.download(tickers_to_download, start=START_DATE_PRICES, end=download_end,
                                 progress=False, auto_adjust=False, actions=False, ignore_tz=True,
                                 group_by='ticker')
    if all_stock_data.empty: raise ValueError("No stock price data downloaded.")

    price_data_list = []; available_tickers_in_download = []
    # Branch on the actual column layout rather than on the number of tickers requested.
    if isinstance(all_stock_data.columns, pd.MultiIndex):
        valid_tickers = all_stock_data.columns.get_level_values(0).unique().tolist()
        for ticker in tickers_to_download:
            if ticker not in valid_tickers:
                print(f"  -> Warning: Ticker {ticker} not in yfinance result.")
                continue
            try:
                df_ticker = _extract_price_column(all_stock_data[ticker], ticker)
            except KeyError:
                print(f"  -> Warning: Data for {ticker} not in MultiIndex.")
                continue
            if df_ticker is not None:
                price_data_list.append(df_ticker); available_tickers_in_download.append(ticker)
    elif len(tickers_to_download) == 1:
        # Flat columns: the frame belongs to the single requested ticker.
        ticker = tickers_to_download[0]
        df_ticker = _extract_price_column(all_stock_data, ticker)
        if df_ticker is not None:
            price_data_list.append(df_ticker); available_tickers_in_download.append(ticker)
    else:
        raise TypeError(f"Unexpected yfinance structure: {type(all_stock_data)}")
    if not price_data_list: raise ValueError("No valid price data collected.")

    # Forward-fill only: it bridges non-trading days across exchanges. Back-filling is
    # deliberately avoided because it would copy the first traded price into the months
    # before a firm was listed and so fabricate 0% returns from future information.
    price_data = pd.concat(price_data_list, axis=1)
    price_data = price_data.ffill().dropna(axis=1, how='all')
    if price_data.empty: raise ValueError("Price data empty after cleaning.")
    tickers_available_yf = sorted(list(price_data.columns))
    print(f"  -> Stock price data processed for {len(tickers_available_yf)} tickers: {tickers_available_yf}")

    # Month-end prices -> simple monthly returns
    price_data.index = pd.to_datetime(price_data.index)
    monthly_prices = price_data.resample('ME').last()
    stock_monthly_returns = monthly_prices.pct_change()

    # Keep a buffer of ESG_LAG_MONTHS + 2 months before the analysis start.
    buffer_start_date = (pd.to_datetime(START_DATE_ANALYSIS) - pd.DateOffset(months=ESG_LAG_MONTHS + 2))
    stock_monthly_returns = stock_monthly_returns.loc[buffer_start_date:END_DATE_PRICES]
    if stock_monthly_returns.empty or stock_monthly_returns.isnull().all().all(): raise ValueError("Monthly returns empty/all NaN after date filtering.")
    print(f"  -> Stock monthly returns prepared: {stock_monthly_returns.index.min().date()} to {stock_monthly_returns.index.max().date()}")
except Exception as e: print(f" FATAL ERROR processing stock returns: {e}"); traceback.print_exc(); sys.exit()

# ==============================================================================
# --- Step 2: Load and Prepare Factors Data ---
# ==============================================================================
print("\n--- 2. Loading and Preparing Factors Data ---")
ff_factors_monthly = pd.DataFrame(); rf_col = None; available_factors_list = []
try:
    # Any read problem is re-raised as ValueError so the handler at the bottom reports it uniformly.
    try:
        ff_factors_monthly_raw = pd.read_csv(FF_FACTORS_PATH, index_col=0)
    except FileNotFoundError: raise ValueError(f"Factor file not found: '{FF_FACTORS_PATH}'.")
    except IndexError: raise ValueError(f"Factor file '{FF_FACTORS_PATH}' might be empty or lack an index column.")
    except Exception as e: raise ValueError(f"Could not read factor file '{FF_FACTORS_PATH}': {e}")

    # Normalise column names, e.g. 'Mkt-RF' -> 'mkt_rf'
    ff_factors_monthly_raw.columns = [col.strip().lower().replace('-', '_').replace('.', '') for col in ff_factors_monthly_raw.columns]

    if not pd.api.types.is_datetime64_any_dtype(ff_factors_monthly_raw.index):
        original_index_name = ff_factors_monthly_raw.index.name
        print(f"  -> Attempting to convert factor index '{original_index_name}' to datetime...")
        original_index_values = ff_factors_monthly_raw.index
        # Always parse from stripped text: an integer index such as 202001 would otherwise be
        # read by pd.to_datetime as nanoseconds since 1970 and "succeed" with bogus dates.
        index_as_text = original_index_values.astype(str).str.strip()
        converted = False

        # Compact numeric stamps are handled first with the one format that fits their length;
        # the generic format loop below would try '%Y%m%d' before '%Y%m' on 6-digit values.
        compact_format = None
        if len(index_as_text) > 0:
            if index_as_text.str.fullmatch(r"\d{6}").all(): compact_format = "%Y%m"
            elif index_as_text.str.fullmatch(r"\d{8}").all(): compact_format = "%Y%m%d"
        if compact_format is not None:
            compact_index = pd.to_datetime(index_as_text, errors='coerce', format=compact_format)
            if not compact_index.isna().all():
                ff_factors_monthly_raw.index = compact_index
                print(f"    -> Converted index using format: '{compact_format}'.")
                converted = True

        if not converted:
            try:
                ff_factors_monthly_raw.index = pd.to_datetime(index_as_text, errors='coerce')
                if not ff_factors_monthly_raw.index.isna().all():
                    print("    -> Converted index using pandas default.")
                    converted = True
            except Exception:
                ff_factors_monthly_raw.index = original_index_values
                print("    -> Pandas default parser failed or encountered an error.")

        if not converted:
            print("    -> Trying specific formats...")
            date_formats_to_try = ["%m/%d/%y", "%Y%m%d", "%Y%m", "%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y"]
            for fmt in date_formats_to_try:
                try:
                    converted_index = pd.to_datetime(index_as_text, errors='coerce', format=fmt)
                    if not converted_index.isna().all() and (index_as_text != '').any():
                         ff_factors_monthly_raw.index = converted_index
                         print(f"    -> Converted index using format: '{fmt}'.")
                         converted = True; break
                except Exception:
                     continue
        if not converted:
            problematic_indices = original_index_values[pd.to_datetime(index_as_text, errors='coerce').isna()].tolist()
            raise ValueError(f"Failed to convert factor index. Problematic values start with: {problematic_indices[:10]}... Inspect '{original_index_name}' in '{FF_FACTORS_PATH}'.")

        # Rows whose index could not be parsed are dropped.
        invalid_dates_mask = pd.isna(ff_factors_monthly_raw.index)
        if invalid_dates_mask.any():
            num_invalid = invalid_dates_mask.sum(); print(f"    -> Warning: Dropped {num_invalid} rows with NaT index.")
            ff_factors_monthly_raw = ff_factors_monthly_raw[~invalid_dates_mask]
        ff_factors_monthly_raw.index.name = original_index_name

    if ff_factors_monthly_raw.index.empty: raise ValueError("Factor index empty after cleaning.")
    # Snap every date to its month end and keep the last row per month.
    ff_factors_monthly_raw.index = ff_factors_monthly_raw.index + pd.offsets.MonthEnd(0)
    ff_factors_monthly_raw = ff_factors_monthly_raw[~ff_factors_monthly_raw.index.duplicated(keep='last')]
    print("  -> Converting factor columns to numeric...")
    for col in ff_factors_monthly_raw.columns:
        if col != ff_factors_monthly_raw.index.name: ff_factors_monthly_raw[col] = pd.to_numeric(ff_factors_monthly_raw[col], errors='coerce')

    # Same buffer as the other steps: ESG_LAG_MONTHS + 2 months before the analysis start.
    buffer_start_date = (pd.to_datetime(START_DATE_ANALYSIS) - pd.DateOffset(months=ESG_LAG_MONTHS + 2))
    ff_factors_monthly_filtered = ff_factors_monthly_raw.loc[buffer_start_date:END_DATE_PRICES].copy()
    if ff_factors_monthly_filtered.empty: raise ValueError(f"No factor data in range ({buffer_start_date.date()} to {END_DATE_PRICES}).")

    if IMPUTE_DATA and ff_factors_monthly_filtered.isnull().any().any():
        print("  -> Imputing missing values in factors data..."); factor_numeric_cols = ff_factors_monthly_filtered.select_dtypes(include=np.number).columns
        if not factor_numeric_cols.empty:
             ff_factors_monthly = advanced_imputation(ff_factors_monthly_filtered)
             # advanced_imputation hands back the very same object when it did not impute.
             if ff_factors_monthly is ff_factors_monthly_filtered: print("    -> Warning: Factor imputation failed/skipped."); ff_factors_monthly = ff_factors_monthly_filtered.copy()
             elif ff_factors_monthly[factor_numeric_cols].isnull().any().any(): print("    -> Warning: NaNs may remain post-imputation.")
        else: print("    -> No numeric factors found for imputation."); ff_factors_monthly = ff_factors_monthly_filtered.copy()
    else: ff_factors_monthly = ff_factors_monthly_filtered.copy()
    if not IMPUTE_DATA: print("  -> Imputation disabled for factors.")
    elif not ff_factors_monthly_filtered.isnull().any().any(): print("  -> No missing factors detected.")

    # Percent -> decimal. The decision is taken once for the whole file: if any factor column
    # has a median absolute value above 1.0 the file is treated as percent and every factor
    # column is rescaled. Deciding column by column would leave low-magnitude columns
    # (typically 'rf') in percent while 'mkt_rf' is converted, mixing units.
    factor_cols_to_check = ["mkt_rf", "smb", "hml", "rmw", "cma", "rf", "mom"]
    print("  -> Checking factor scales...")
    numeric_scale_cols = []
    for col in factor_cols_to_check:
        if col in ff_factors_monthly.columns and pd.api.types.is_numeric_dtype(ff_factors_monthly[col]):
             numeric_scale_cols.append(col)
        elif col in ff_factors_monthly.columns: print(f"    -> Warning: Factor '{col}' not numeric.")
    scale_medians = [ff_factors_monthly[col].abs().median() for col in numeric_scale_cols]
    factors_in_percent = any((not pd.isna(median_abs_val)) and median_abs_val > 1.0 for median_abs_val in scale_medians)
    if factors_in_percent:
        for col in numeric_scale_cols:
            ff_factors_monthly[col] /= 100.0; print(f"    -> Converting '{col}' to decimal.")

    print(f"  -> Columns available before RF check: {ff_factors_monthly.columns.tolist()}") # Debug Print
    rf_col_options = ['rf', 'risk_free_rate']; rf_col = next((col for col in rf_col_options if col in ff_factors_monthly.columns), None)
    if rf_col: print(f"  -> Using '{rf_col}' as RF.")
    if rf_col is None: raise ValueError(f"Critical Error: RF column ({rf_col_options}) not found in columns: {ff_factors_monthly.columns.tolist()}")
    elif ff_factors_monthly[rf_col].isnull().any(): print(f"  -> !!! WARNING: RF column ('{rf_col}') contains NaNs after processing!!!")

    factor_cols_check_for_factors = ["mkt_rf", "smb", "hml", "rmw", "cma", "mom"] # Exclude rf here
    available_factors_list = sorted([f for f in factor_cols_check_for_factors if f in ff_factors_monthly.columns and pd.api.types.is_numeric_dtype(ff_factors_monthly[f])])
    if not available_factors_list: print("  -> !!! WARNING: No standard factors found. !!!")
    else: print(f"  -> Available factors identified: {available_factors_list}")
except ValueError as ve: print(f" FATAL ERROR processing factors: {ve}"); sys.exit()
except Exception as e: print(f" FATAL ERROR processing factors: {e}"); traceback.print_exc(); sys.exit()

# ==============================================================================
# --- Step 3: Load and Prepare ESG Data from CSV ---
# ==============================================================================
print("\n--- 3. Loading and Preparing ESG Data from CSV ---")
esg_panel_raw = pd.DataFrame()
try:
    # -- Read the CSV; read failures surface as ValueError --
    try:
        esg_data_loaded = pd.read_csv(ESG_DATA_PATH)
    except FileNotFoundError:
        raise ValueError(f"ESG file not found: '{ESG_DATA_PATH}'.")
    except Exception as e:
        raise ValueError(f"Could not read ESG file '{ESG_DATA_PATH}': {e}")
    print(f"  -> Raw ESG data loaded. Shape: {esg_data_loaded.shape}")

    # -- Standardise column names: whitespace and '-' become '_', '.' is removed, lower case --
    original_cols = list(esg_data_loaded.columns)
    esg_data_loaded.columns = [
        re.sub(r'\s+', '_', col).replace('-', '_').replace('.', '').lower()
        for col in original_cols
    ]
    standardized_cols = list(esg_data_loaded.columns)
    print(f"  -> Standardized ESG columns: {standardized_cols}")

    required_esg_cols = ['date', 'ticker', 'esg_total_score', 'e_score', 's_score', 'g_score']
    missing_cols = [col for col in required_esg_cols if col not in esg_data_loaded.columns]
    if missing_cols:
        raise ValueError(f"ESG CSV missing required columns: {missing_cols}.")

    # -- Dates: unparseable values become NaT and those rows are discarded --
    esg_data_loaded['date'] = pd.to_datetime(esg_data_loaded['date'], errors='coerce')
    initial_rows = len(esg_data_loaded)
    esg_data_loaded = esg_data_loaded.dropna(subset=['date'])
    if len(esg_data_loaded) < initial_rows:
        print(f"  -> Warning: Dropped {initial_rows - len(esg_data_loaded)} rows due to invalid ESG dates.")
    if esg_data_loaded.empty:
        raise ValueError("ESG data empty after removing invalid dates.")

    # -- Scores: coerce to numeric and count how many values were lost in the process --
    score_cols_std = ['esg_total_score', 'e_score', 's_score', 'g_score']
    print(f"  -> Converting score columns to numeric: {score_cols_std}")
    non_numeric_issues = False
    for col in score_cols_std:
        initial_nan_count = esg_data_loaded[col].isnull().sum()
        esg_data_loaded[col] = pd.to_numeric(esg_data_loaded[col], errors='coerce')
        final_nan_count = esg_data_loaded[col].isnull().sum()
        if final_nan_count > initial_nan_count:
            num_coerced = final_nan_count - initial_nan_count
            print(f"    -> CRITICAL WARNING: Column '{col}' had {num_coerced} non-numeric values converted to NaN.")
            non_numeric_issues = True
    if non_numeric_issues:
        if not IMPUTE_DATA:
            raise ValueError("Non-numeric ESG scores found and imputation disabled. Clean source CSV.")
        print("    -> Imputation will attempt to handle NaNs from non-numeric scores.")

    # -- Tickers: keep only firms present in both the price data and the ESG data --
    esg_data_loaded['ticker'] = esg_data_loaded['ticker'].astype(str).str.upper().str.strip()
    stock_tickers_upper = [t.upper().strip() for t in tickers_available_yf]
    esg_tickers = esg_data_loaded['ticker'].unique()
    common_tickers = sorted(set(stock_tickers_upper).intersection(esg_tickers))
    if not common_tickers:
        raise ValueError("No common tickers found between stock and ESG data.")
    print(f"  -> Common tickers identified: {common_tickers} ({len(common_tickers)} firms)")
    esg_only = sorted(set(esg_tickers).difference(stock_tickers_upper))
    stock_only = sorted(set(stock_tickers_upper).difference(esg_tickers))
    if esg_only:
        print(f"    -> Tickers in ESG only: {esg_only}")
    if stock_only:
        print(f"    -> Tickers in Stock only: {stock_only}")
    esg_data_filtered = esg_data_loaded[esg_data_loaded['ticker'].isin(common_tickers)].copy()

    # -- Date window: buffer before the analysis start up to the month end of the analysis end --
    buffer_start_date = (pd.to_datetime(START_DATE_ANALYSIS) - pd.DateOffset(months=ESG_LAG_MONTHS + 2))
    esg_filter_end_date = pd.to_datetime(END_DATE_ANALYSIS) + pd.offsets.MonthEnd(0)
    esg_data_filtered = esg_data_filtered[esg_data_filtered['date'].between(buffer_start_date, esg_filter_end_date)]
    if esg_data_filtered.empty:
        raise ValueError("No ESG data remains after filtering.")

    # -- Snap to month end; the latest observation within a ticker-month wins --
    esg_data_filtered['date'] = esg_data_filtered['date'] + pd.offsets.MonthEnd(0)
    esg_data_filtered = esg_data_filtered.sort_values(by=['ticker', 'date'])
    esg_data_filtered = esg_data_filtered.drop_duplicates(subset=['ticker', 'date'], keep='last')

    # -- Balanced ticker x month grid, then carry each firm's last known scores forward --
    panel_start_date = esg_data_filtered['date'].min()
    panel_end_date = esg_data_filtered['date'].max()
    print(f"  -> Creating ESG panel from {panel_start_date.date()} to {panel_end_date.date()}")
    full_date_range = pd.date_range(start=panel_start_date, end=panel_end_date, freq='ME')
    multi_index = pd.MultiIndex.from_product([common_tickers, full_date_range], names=['Ticker', 'Date'])
    esg_panel_raw = esg_data_filtered.set_index(['ticker', 'date'])[score_cols_std].reindex(multi_index)
    print(f"  -> Forward-filling ESG scores...")
    esg_panel_raw[score_cols_std] = esg_panel_raw.groupby(level='Ticker')[score_cols_std].ffill()
    if esg_panel_raw[score_cols_std].isnull().values.any():
        nan_counts = esg_panel_raw[score_cols_std].isnull().sum()
        print(f"  -> !!! WARNING: NaNs remain after ffill. Imputation will attempt. Counts:\n{nan_counts[nan_counts > 0]}")
    else:
        print("  -> No NaNs detected post-ffill.")
    esg_panel_raw = esg_panel_raw.reset_index()
    print(f"  -> ESG panel structure created. Shape: {esg_panel_raw.shape}")
except Exception as e:
    print(f" FATAL ERROR processing ESG data: {e}")
    traceback.print_exc()
    sys.exit()

# ==============================================================================
# --- Step 4: Merge Data, Add Categories, Lag ESG, Prepare Panel ---
# ==============================================================================
print("\n--- 4. Merging Data, Add Categories, Lag ESG, and Final Prep ---")
# Step 4 assembles the regression panel from three inputs produced by earlier steps:
#   * stock_monthly_returns (wide: Date index x ticker columns)
#   * ff_factors_monthly    (factor returns, including the risk-free column `rf_col`)
#   * esg_panel_raw         (long: Ticker, Date, ESG score columns)
# Outputs are pre-initialised so that later cells can reference them even if this step aborts early.
panel_data_imputed_main = pd.DataFrame()
panel_data_no_imputation = pd.DataFrame()
initial_missing_stats = {}
final_panel_data = pd.DataFrame()
lagged_category_col = None
available_factors = []
available_lagged_esg_scores = []
final_tickers = []
try:
    tickers_final_list = common_tickers
    print(f"  -> Preparing data for {len(tickers_final_list)} common tickers.")

    # --- Stock returns: normalise labels, keep common tickers, clip to the analysis window ---
    # NOTE: this relabels the columns of the shared `stock_monthly_returns` frame in place.
    stock_monthly_returns.columns = [col.upper().strip() for col in stock_monthly_returns.columns]
    stock_cols_to_use = [col for col in stock_monthly_returns.columns if col in tickers_final_list]
    stock_monthly_returns_analysis = stock_monthly_returns[stock_cols_to_use].loc[START_DATE_ANALYSIS:END_DATE_ANALYSIS]
    if stock_monthly_returns_analysis.empty or stock_monthly_returns_analysis.isnull().all().all():
        raise ValueError("Stock returns empty after filtering.")
    print(f"  -> Stock returns filtered: {stock_monthly_returns_analysis.index.min().date()} to {stock_monthly_returns_analysis.index.max().date()}")
    # Wide -> long (Date, Ticker, Return); dropna=False keeps missing returns as explicit NaN rows.
    returns_long = stock_monthly_returns_analysis.stack(dropna=False).reset_index()
    returns_long.columns = ['Date', 'Ticker', 'Return']
    returns_long['Date'] = pd.to_datetime(returns_long['Date'])

    # --- Factors: ensure a datetime index, clip to the window, expose the date as a 'Date' column ---
    if not pd.api.types.is_datetime64_any_dtype(ff_factors_monthly.index):
        ff_factors_monthly.index = pd.to_datetime(ff_factors_monthly.index)
    factors_analysis = ff_factors_monthly.loc[START_DATE_ANALYSIS:END_DATE_ANALYSIS].copy()
    if factors_analysis.empty:
        raise ValueError("Factors empty after filtering.")
    factors_analysis = factors_analysis.reset_index()
    # After reset_index the former index is the first column, whatever it was called.
    date_col_name_factors = factors_analysis.columns[0]
    factors_analysis = factors_analysis.rename(columns={date_col_name_factors: 'Date'})
    factors_analysis['Date'] = pd.to_datetime(factors_analysis['Date'])
    print(f"  -> Factors filtered: {factors_analysis['Date'].min().date()} to {factors_analysis['Date'].max().date()}")

    # --- ESG: attach the risk category per ticker ---
    esg_panel_analysis = esg_panel_raw.copy()
    esg_risk_categories_upper = {k.upper().strip(): v for k, v in esg_risk_categories.items()}
    esg_panel_analysis['ESG_Category'] = esg_panel_analysis['Ticker'].map(esg_risk_categories_upper)
    unknown_category_tickers = esg_panel_analysis[esg_panel_analysis['ESG_Category'].isnull()]['Ticker'].unique()
    if len(unknown_category_tickers) > 0:
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`: the chained in-place
        # form operates on a temporary under pandas Copy-on-Write and would leave the NaNs untouched.
        esg_panel_analysis['ESG_Category'] = esg_panel_analysis['ESG_Category'].fillna('Unknown')
        print(f"  -> Warn: Tickers mapped to 'Unknown' ESG category: {list(unknown_category_tickers)}")

    # --- ESG: lag scores and category within each ticker so month t sees month t-lag values ---
    print(f"  -> Lagging ESG scores and category by {ESG_LAG_MONTHS} month(s)...")
    esg_cols_to_lag = score_cols_std + ['ESG_Category']
    lagged_col_names = {col: f"{col}_lag{ESG_LAG_MONTHS}" for col in score_cols_std}
    lagged_category_col_name = f"ESG_Category_lag{ESG_LAG_MONTHS}"
    lagged_col_names['ESG_Category'] = lagged_category_col_name
    esg_panel_lagged = esg_panel_analysis.sort_values(by=['Ticker', 'Date']).copy()
    # Target names and source columns are listed in the same order (scores first, category last).
    esg_panel_lagged[list(lagged_col_names.values())] = esg_panel_lagged.groupby('Ticker')[esg_cols_to_lag].shift(ESG_LAG_MONTHS)
    esg_panel_lagged = esg_panel_lagged[['Date', 'Ticker'] + list(lagged_col_names.values())]
    esg_panel_lagged['Date'] = pd.to_datetime(esg_panel_lagged['Date'])

    # --- Merge: returns are the spine; factors join on Date, lagged ESG on (Date, Ticker) ---
    print("  -> Merging Returns, Factors, and Lagged ESG data...")
    panel_data = pd.merge(returns_long, factors_analysis, on='Date', how='left')
    panel_data = pd.merge(panel_data, esg_panel_lagged, on=['Date', 'Ticker'], how='left')
    print(f"  -> Rows after merging: {len(panel_data)}")
    if panel_data.empty:
        raise ValueError("Panel data empty after merging.")
    if rf_col not in panel_data.columns:
        raise ValueError(f"RF column '{rf_col}' missing.")
    if 'Return' not in panel_data.columns:
        raise ValueError("'Return' column missing.")

    # *** Ensure Return and RF columns are numeric BEFORE calculating ExcessReturn ***
    print(f"  -> Checking data types before ExcessReturn calculation: Return={panel_data['Return'].dtype}, {rf_col}={panel_data[rf_col].dtype}")
    panel_data['Return'] = pd.to_numeric(panel_data['Return'], errors='coerce')
    panel_data[rf_col] = pd.to_numeric(panel_data[rf_col], errors='coerce')
    # Report NaNs present after coercion (these may also be NaNs that already existed).
    if panel_data['Return'].isnull().any():
        print(f"    -> Warning: NaNs introduced in 'Return' column after converting to numeric.")
    if panel_data[rf_col].isnull().any():
        print(f"    -> Warning: NaNs introduced in '{rf_col}' column after converting to numeric.")

    # Now calculate ExcessReturn
    panel_data['ExcessReturn'] = panel_data['Return'] - panel_data[rf_col]

    # Check calculation result
    print(f"  -> 'ExcessReturn' calculated. Check presence: {'ExcessReturn' in panel_data.columns}.")
    if 'ExcessReturn' in panel_data.columns:
        print(f"     NaN count in ExcessReturn: {panel_data['ExcessReturn'].isnull().sum()}")
    else:
        raise ValueError("CRITICAL ERROR: 'ExcessReturn' column still NOT FOUND after calculation and type checks!")

    # --- Inventory of usable regressors actually present after the merge ---
    available_factors = sorted([f for f in available_factors_list if f in panel_data.columns])
    available_lagged_esg_scores = sorted([col for col in lagged_col_names.values() if col != lagged_category_col_name and col in panel_data.columns])
    lagged_category_col = lagged_category_col_name if lagged_category_col_name in panel_data.columns else None
    print(f"  -> Available Factors: {available_factors}")
    print(f"  -> Available Lagged ESG Scores: {available_lagged_esg_scores}")
    print(f"  -> Lagged ESG Category column: {lagged_category_col}")

    # --- Missingness report (recorded in initial_missing_stats as percentages) ---
    print("  -> Assessing Missing Values (% before imputation/dropping):")
    missing_count = 0
    essential_cols_check = ['ExcessReturn'] + available_factors + available_lagged_esg_scores + ([lagged_category_col] if lagged_category_col else [])
    for col in essential_cols_check:
        if col in panel_data:
            missing_pct = panel_data[col].isnull().mean() * 100
            if missing_pct > 0:
                num_missing = panel_data[col].isnull().sum()
                print(f"    - {col}: {missing_pct:.1f}% ({num_missing} missing)")
                initial_missing_stats[col] = missing_pct
                missing_count += 1
    if missing_count == 0:
        print("    -> No missing values detected.")
    if any(pct > 25 for pct in initial_missing_stats.values()):
        print("  -> !!! RELIABILITY WARNING: High initial missingness (>25%) found. !!!")

    # --- Main dataset: optionally impute, then drop rows still missing essential columns ---
    panel_data_for_main_analysis = pd.DataFrame()  # Initialize
    if IMPUTE_DATA:
        print("  -> Preparing main dataset WITH IMPUTATION (MICE)...")
        panel_data_imputed_temp = panel_data.copy()
        # Check before imputation step
        if 'ExcessReturn' not in panel_data_imputed_temp.columns:
            raise ValueError("Cannot proceed with imputation: 'ExcessReturn' is missing.")
        cols_to_impute = [
            c for c in ['Return', rf_col, 'ExcessReturn'] + available_factors + available_lagged_esg_scores
            if c in panel_data_imputed_temp.columns
            and pd.api.types.is_numeric_dtype(panel_data_imputed_temp[c])
            and panel_data_imputed_temp[c].isnull().any()
        ]
        if cols_to_impute:
            print(f"    -> Columns identified for imputation: {cols_to_impute}")
            # advanced_imputation is defined elsewhere in the notebook; it receives the whole frame.
            panel_data_imputed_result = advanced_imputation(panel_data_imputed_temp)
            # Keep ExcessReturn consistent with its inputs when either of them was imputed.
            if ('Return' in cols_to_impute or rf_col in cols_to_impute) and \
               ('Return' in panel_data_imputed_result.columns and rf_col in panel_data_imputed_result.columns):
                print("    -> Recalculating ExcessReturn post-imputation.")
                panel_data_imputed_result['ExcessReturn'] = panel_data_imputed_result['Return'] - panel_data_imputed_result[rf_col]
            # Ensure ExcessReturn is still present after imputation/recalculation
            if 'ExcessReturn' not in panel_data_imputed_result.columns:
                raise ValueError("CRITICAL ERROR: 'ExcessReturn' lost during/after imputation.")
            panel_data_for_main_analysis = panel_data_imputed_result
        else:
            print("    -> No numeric columns needed imputation.")
            panel_data_for_main_analysis = panel_data_imputed_temp
    else:
        print("  -> IMPUTATION DISABLED.")
        panel_data_for_main_analysis = panel_data.copy()

    print(" -> Defining essential columns for final main dataset...")
    essential_cols_final = ['ExcessReturn'] + available_factors + available_lagged_esg_scores + ([lagged_category_col] if lagged_category_col else [])
    essential_cols_final = [c for c in essential_cols_final if c in panel_data_for_main_analysis.columns]
    print(f"    -> Essential columns check: {essential_cols_final}")
    if 'ExcessReturn' not in panel_data_for_main_analysis.columns:
        raise ValueError("Critical Error: 'ExcessReturn' missing before final NaN dropping.")
    if 'ExcessReturn' not in essential_cols_final:
        raise ValueError("Critical Error: 'ExcessReturn' missing from essential columns list.")
    initial_rows_before_drop = len(panel_data_for_main_analysis)
    panel_data_imputed_main = panel_data_for_main_analysis.dropna(subset=essential_cols_final)
    dropped_rows = initial_rows_before_drop - len(panel_data_imputed_main)
    if dropped_rows > 0:
        print(f"    -> Dropped {dropped_rows} rows due to NaNs in essential columns.")
    print(f"  -> Main panel rows after NaN drop: {len(panel_data_imputed_main)}")

    # --- Sensitivity dataset: same essential columns, built from the un-imputed merge ---
    if RUN_WITHOUT_IMPUTATION_SENSITIVITY:
        print("  -> Preparing sensitivity dataset (NO IMPUTATION)...")
        essential_cols_sens = ['ExcessReturn'] + available_factors + available_lagged_esg_scores + ([lagged_category_col] if lagged_category_col else [])
        essential_cols_sens = [c for c in panel_data.columns if c in essential_cols_sens]  # Check against original panel_data columns
        if 'ExcessReturn' not in essential_cols_sens:
            print("   -> Warning: 'ExcessReturn' not found for sensitivity check.")
        initial_rows_sens = len(panel_data)
        panel_data_no_imputation = panel_data.dropna(subset=essential_cols_sens)
        dropped_rows_sens = initial_rows_sens - len(panel_data_no_imputation)
        if dropped_rows_sens > 0:
            print(f"    -> Dropped {dropped_rows_sens} rows for sensitivity dataset.")
        # Always report the resulting size (previously only printed when rows had been dropped).
        print(f"  -> Sensitivity panel rows (NaNs dropped): {len(panel_data_no_imputation)}")
        if panel_data_no_imputation.empty:
            print("    -> !!! WARNING: Sensitivity dataset empty. !!!")

    # --- Remove rows whose lagged category is the 'Unknown' placeholder ---
    final_panel_data = panel_data_imputed_main.copy()
    if final_panel_data.empty:
        raise ValueError("Main panel data empty.")
    if lagged_category_col and lagged_category_col in final_panel_data.columns and 'Unknown' in final_panel_data[lagged_category_col].unique():
        print(f"  -> Filtering 'Unknown' category from main panel...")
        initial_rows_main = len(final_panel_data)
        final_panel_data = final_panel_data[final_panel_data[lagged_category_col] != 'Unknown'].copy()
        filtered_count = initial_rows_main - len(final_panel_data)
        print(f"     -> Removed {filtered_count} rows. Main rows remaining: {len(final_panel_data)}")
    if final_panel_data.empty:
        raise ValueError("Main panel empty after filtering 'Unknown'.")
    if RUN_WITHOUT_IMPUTATION_SENSITIVITY and not panel_data_no_imputation.empty:
        if lagged_category_col and lagged_category_col in panel_data_no_imputation.columns and 'Unknown' in panel_data_no_imputation[lagged_category_col].unique():
            print(f"  -> Filtering 'Unknown' category from sensitivity panel...")
            initial_rows_sens = len(panel_data_no_imputation)
            panel_data_no_imputation = panel_data_no_imputation[panel_data_no_imputation[lagged_category_col] != 'Unknown'].copy()
            filtered_count_sens = initial_rows_sens - len(panel_data_no_imputation)
            print(f"     -> Removed {filtered_count_sens} rows. Sensitivity rows remaining: {len(panel_data_no_imputation)}")
            if panel_data_no_imputation.empty:
                print("    -> !!! WARNING: Sensitivity dataset empty after filtering 'Unknown'. !!!")

    # --- Index the main panel by (Ticker, Date) and refresh the regressor inventories ---
    if not {'Ticker', 'Date'}.issubset(final_panel_data.columns):
        raise ValueError("Ticker/Date missing before setting index.")
    final_panel_data['Date'] = pd.to_datetime(final_panel_data['Date'])
    final_panel_data = final_panel_data.set_index(['Ticker', 'Date']).sort_index()
    available_factors = sorted([f for f in available_factors if f in final_panel_data.columns])
    available_lagged_esg_scores = sorted([e for e in available_lagged_esg_scores if e in final_panel_data.columns])
    lagged_category_col = lagged_category_col if lagged_category_col and lagged_category_col in final_panel_data.columns else None
    final_tickers = sorted(final_panel_data.index.get_level_values('Ticker').unique().tolist())
    if 'ExcessReturn' not in final_panel_data.columns:
        raise ValueError("CRITICAL ERROR: 'ExcessReturn' column is definitively missing from final_panel_data before Step 5.")

    # --- Summary of the final main panel ---
    print(f"\n  -> Final Main Panel Ready:")
    print(f"     Observations: {len(final_panel_data)}")
    print(f"     Entities: {len(final_tickers)} {final_tickers}")
    time_periods = final_panel_data.index.get_level_values('Date').nunique()
    min_date = final_panel_data.index.get_level_values('Date').min().date()
    max_date = final_panel_data.index.get_level_values('Date').max().date()
    print(f"     Time Periods: {time_periods} ({min_date} to {max_date})")
    # Balanced means every (ticker, date) pair appears exactly once.
    print(f"     Panel Balanced: {final_panel_data.index.is_unique and len(final_panel_data) == len(final_tickers) * time_periods}")
    print(f"     Available Factors: {available_factors}")
    print(f"     Available Lagged ESG Scores: {available_lagged_esg_scores}")
    print(f"     Lagged ESG Category Column: {lagged_category_col}")
    if lagged_category_col:
        category_counts = final_panel_data[lagged_category_col].value_counts()
        print(f"     Category Counts:\n{category_counts.to_string()}")
        small_cats = category_counts[category_counts < 30]
        if not small_cats.empty:
            print(f"    -> WARN: Small categories (<30 obs): {small_cats.to_dict()}.")

    # --- Index the sensitivity panel the same way ---
    if RUN_WITHOUT_IMPUTATION_SENSITIVITY and not panel_data_no_imputation.empty:
        if {'Ticker', 'Date'}.issubset(panel_data_no_imputation.columns):
            panel_data_no_imputation['Date'] = pd.to_datetime(panel_data_no_imputation['Date'])
            panel_data_no_imputation = panel_data_no_imputation.set_index(['Ticker', 'Date']).sort_index()
            print(f"  -> Sensitivity panel ready (No Imputation): {len(panel_data_no_imputation)} obs")
        else:
            print("    -> Warning: Could not set index for sensitivity data.")
except Exception as e:
    # Any failure here leaves the panel unusable, so report it and stop the run.
    print(f" FATAL ERROR during Data Prep (Step 4): {e}")
    traceback.print_exc()
    sys.exit()

# ==============================================================================
# --- Step 5: Check for Multicollinearity (VIF) ---
# ==============================================================================
print("\n--- 5. Checking for Multicollinearity (VIF) ---")
# VIF tables for the two predictor sets; they stay None when the matching check is skipped or fails.
vif_results_total, vif_results_components = None, None
def calculate_vif(data, predictors, model_name="VIF Check"):
    """Print and return variance inflation factors for a set of predictor columns.

    Args:
        data: DataFrame holding the predictor columns.
        predictors: Column names to check; names absent from ``data`` are
            reported and ignored.
        model_name: Label used in the progress message.

    Returns:
        ``(vif_table, used_predictors)``. ``vif_table`` is a DataFrame with
        columns ``Variable`` and ``VIF``, or ``None`` when the check could not
        be carried out. ``used_predictors`` lists the columns that were kept
        (present in ``data`` and, if constant columns were removed, not
        constant); it is ``[]`` when nothing usable was supplied.
    """
    print(f"\n  Calculating VIF for: {model_name}")
    if not predictors:
        print("    -> No predictors.")
        return None, []

    # Split the requested names into those present in `data` and those absent.
    predictors_in_data, absent = [], []
    for name in predictors:
        (predictors_in_data if name in data.columns else absent).append(name)
    if absent:
        print(f"    -> Warning: Predictors missing for VIF: {absent}")
    if not predictors_in_data:
        print("    -> No valid predictors.")
        return None, []

    # Complete cases only.
    selected = data[predictors_in_data].copy()
    design = selected.dropna()
    n_removed = len(selected) - len(design)
    if n_removed > 0:
        print(f"    -> Dropped {n_removed} rows with NaNs for VIF.")
    if design.empty or len(design) < 2 or design.shape[1] < 1:
        print(f"    -> Not enough data for VIF.")
        return None, predictors_in_data

    # Columns with at most one distinct value are removed before computing VIF.
    constant_cols = design.columns[design.nunique() <= 1].tolist()
    if constant_cols:
        print(f"    -> Warning: Constant columns removed for VIF: {constant_cols}")
        design = design.drop(columns=constant_cols)
        predictors_in_data = design.columns.tolist()
    if design.empty or design.shape[1] < 1:
        print(f"    -> No variables left after removing constant cols.")
        return None, predictors_in_data

    try:
        X_vif = sm.add_constant(design, prepend=True, has_constant='skip')
    except Exception as e:
        print(f"    -> Error adding constant: {e}.")
        return None, predictors_in_data

    if not np.all(np.isfinite(X_vif.values)):
        print("    -> Warning: Non-finite values detected. Attempting removal...")
        X_vif = X_vif.replace([np.inf, -np.inf], np.nan).dropna()
    if X_vif.empty:
        print("    -> Error: Data empty after removing non-finite.")
        return None, predictors_in_data

    try:
        # The intercept column takes part in the auxiliary regressions but is not reported.
        reported = [(pos, label) for pos, label in enumerate(X_vif.columns) if label.lower() != 'const']
        vif_data = pd.DataFrame()
        vif_data["Variable"] = [label for _, label in reported]
        vif_data["VIF"] = [variance_inflation_factor(X_vif.values.astype(float), pos) for pos, _ in reported]
        print(vif_data.sort_values('VIF', ascending=False).to_string(index=False))
        high_vif_vars = vif_data[vif_data["VIF"] > VIF_THRESHOLD]["Variable"].tolist()
        if high_vif_vars:
            print(f"    -> !!! WARNING: High VIF (> {VIF_THRESHOLD}): {high_vif_vars}. !!!")
        else:
            print(f"    -> VIF check passed (threshold={VIF_THRESHOLD}).")
        return vif_data, predictors_in_data
    except (np.linalg.LinAlgError, ValueError) as vif_calc_err:
        print(f"    -> VIF calc failed: {vif_calc_err} (Perfect multicollinearity likely).")
        return None, predictors_in_data
    except Exception as e:
        print(f"    -> VIF error: {e}")
        traceback.print_exc()
        return None, predictors_in_data

# Work out which lagged ESG regressors exist in the final panel, then run one VIF check
# for "factors + total score" and one for "factors + E/S/G component scores".
_lag_suffix = f"_lag{ESG_LAG_MONTHS}"
_panel_columns = final_panel_data.columns

primary_esg_var = None
if f"esg_total_score{_lag_suffix}" in _panel_columns:
    primary_esg_var = f"esg_total_score{_lag_suffix}"
component_esg_vars = [
    f"{pillar}_score{_lag_suffix}"
    for pillar in ("e", "s", "g")
    if f"{pillar}_score{_lag_suffix}" in _panel_columns
]

if not primary_esg_var:
    print("  -> Skipping VIF (Total ESG): Primary ESG var missing.")
else:
    predictors_total_vif = [p for p in available_factors + [primary_esg_var] if p in _panel_columns]
    if not predictors_total_vif:
        print(" -> Skipping VIF (Total ESG): No valid predictors.")
    else:
        vif_results_total, _ = calculate_vif(final_panel_data.reset_index(), predictors_total_vif, "Factors + Total ESG")

if not component_esg_vars:
    print("  -> Skipping VIF (Components): ESG component vars missing.")
else:
    predictors_components_vif = [p for p in available_factors + component_esg_vars if p in _panel_columns]
    if not predictors_components_vif:
        print(" -> Skipping VIF (Components): No valid predictors.")
    else:
        vif_results_components, _ = calculate_vif(final_panel_data.reset_index(), predictors_components_vif, "Factors + ESG Components")

# Report which ESG variables the regression step will be able to use.
if primary_esg_var:
    print(f"\n  -> Primary ESG var for models: '{primary_esg_var}'")
if component_esg_vars:
    print(f"  -> Component ESG vars: {component_esg_vars}")
if not (primary_esg_var or component_esg_vars):
    print("\n!!! WARNING: No lagged ESG vars available. !!!")

# ==============================================================================
# --- Step 6: Panel Regression Analysis ---
# ==============================================================================
print("\n--- 6. Panel Regression Analysis (Main Run - Imputed Data if Enabled) ---")
# Containers filled by run_panel_model: fitted results, summaries and a status string per model key.
regression_results = {}; model_summaries = {}; sensitivity_regression_results = {}; model_formulas_used = {}
sensitivity_summaries = {}; sensitivity_formulas_used = {} # Separate dicts for sensitivity

# Two formulas are built here:
#   * formula_fe_re_simple       - factors + primary ESG score (used for RE and FE models)
#   * formula_pooled_interaction - same, with the ESG score interacted with the lagged ESG category
#                                  (falls back to the simple formula when no usable category exists)
formula_pooled_interaction = None; formula_fe_re_simple = None
if primary_esg_var and available_factors:
    base_factors_str = ' + '.join(available_factors)
    esg_term = primary_esg_var
    formula_fe_re_simple = f"ExcessReturn ~ 1 + {base_factors_str} + {esg_term}"
    print(f"\n  -> Formula for FE & RE models: {formula_fe_re_simple}")
    if lagged_category_col and lagged_category_col in final_panel_data.columns and final_panel_data[lagged_category_col].nunique() > 1:
        # Pick the treatment reference level: exact 'Middle', then a case-insensitive match,
        # then the most frequent category.
        preferred_reference_category = 'Middle'
        print(f"  -> Attempting to set '{preferred_reference_category}' as reference category for Pooled OLS.")
        available_cats = [c for c in final_panel_data[lagged_category_col].unique() if isinstance(c, str)]
        final_reference_category = None
        if preferred_reference_category in available_cats:
            final_reference_category = preferred_reference_category
            print(f"     -> Found exact match: Using '{final_reference_category}'.")
        else:
            ref_cat_lower = preferred_reference_category.lower()
            matching_cats = [c for c in available_cats if c.lower() == ref_cat_lower]
            if matching_cats:
                final_reference_category = matching_cats[0]
                print(f"     -> Found case-insensitive match: Using '{final_reference_category}'.")
            elif available_cats:
                most_frequent_cat = final_panel_data[lagged_category_col].mode()
                if not most_frequent_cat.empty:
                    final_reference_category = most_frequent_cat[0]
                    print(f"     -> '{preferred_reference_category}' not found. Using most frequent category '{final_reference_category}' as fallback reference.")
                else:
                    print(f"     -> Warning: Could not determine most frequent category. Cannot create Pooled OLS interaction formula.")
            else:
                print(f"     -> Warning: No valid string categories found. Cannot create Pooled OLS interaction formula.")
        if final_reference_category:
            interaction_term = f"{esg_term} * C({lagged_category_col}, Treatment(reference='{final_reference_category}'))"
            formula_pooled_interaction = f"ExcessReturn ~ 1 + {base_factors_str} + {interaction_term}"
            print(f"  -> Formula for Pooled OLS (Interaction): {formula_pooled_interaction}")
        else:
            print(f"  -> Warn: Could not determine reference category. Using simple formula for Pooled OLS.")
            formula_pooled_interaction = formula_fe_re_simple
    else:
        if not lagged_category_col or lagged_category_col not in final_panel_data.columns:
            reason = "missing"
        else:
            reason = "has <= 1 unique value"
        print(f"  -> Warn: Category column ('{lagged_category_col}') {reason}. Using simple formula for Pooled OLS.")
        formula_pooled_interaction = formula_fe_re_simple
else:
    # Both checks and the warning belong to this branch. Previously they were dedented to top level
    # and the warning hung off the second check, so a missing ESG variable alone produced no message.
    missing_info = []
    if not primary_esg_var:
        missing_info.append("primary ESG variable")
    if not available_factors:
        missing_info.append("factor variables")
    print(f"\n!!! CRITICAL WARNING: Cannot construct formulas (missing {', '.join(missing_info)}). Regression cannot proceed. !!!")

def run_panel_model(formula, model_type, model_key, data,
                    cov_config={'cov_type':'clustered', 'cluster_entity':True, 'cluster_time': False},
                    results_dict=None, summary_dict=None, formula_dict=None):
    """Fit one panel model and record its outcome under ``model_key``.

    Args:
        formula: Model formula of the form ``"dep ~ exog"``. A falsy value
            causes the fit to be skipped.
        model_type: One of ``'Pooled'``, ``'RE'``, ``'FE_Entity'`` or
            ``'FE_TwoWay'``; any other value is recorded as a failed fit.
        model_key: Key under which the outcome is stored in the three dicts.
        data: Panel DataFrame passed to the model's ``from_formula``.
            ``None`` or an empty frame causes the fit to be skipped.
        cov_config: Keyword arguments for ``fit``. It is copied, never
            mutated. ``'Pooled'`` always uses ``{'cov_type': 'robust'}``;
            ``'RE'`` uses it only when ``cov_type`` contains ``'cluster'``;
            the FE variants override ``cluster_time``.
        results_dict: Receives the fitted results object, or ``None``.
        summary_dict: Receives the results summary, or an error/skip message.
        formula_dict: Receives a status string describing the specification
            and whether the fit succeeded.

    Returns:
        None. Outcomes are written into the dicts; when a dict argument is
        omitted a throwaway dict is used and that information is discarded.
    """
    print(f"\n  --- Fitting {model_key} ({model_type}) ---")
    results_dict = {} if results_dict is None else results_dict
    summary_dict = {} if summary_dict is None else summary_dict
    formula_dict = {} if formula_dict is None else formula_dict

    # Guard clauses: nothing to fit without a formula or without data.
    skip = None
    if not formula:
        skip = ("Skipped: No formula.", "Skipped - No Formula")
    elif data is None or data.empty:
        skip = ("Skipped: Empty data.", "Skipped - Empty Data")
    if skip is not None:
        skip_message, skip_status = skip
        print(f"    -> {skip_message}")
        summary_dict[model_key] = skip_message
        results_dict[model_key] = None
        formula_dict[model_key] = skip_status
        return

    results = None
    error_msg = None
    formula_status = "Attempted"
    try:
        print(f"    Using Formula: {formula}")
        lhs, rhs = formula.split('~', 1)
        dep = lhs.strip()
        exog_formula = rhs.strip()
        final_cov_config = cov_config.copy()
        model = None
        summary_obj = None

        # The status label is set only after the model object was built successfully.
        if model_type == 'Pooled':
            print("     (Note: Using robust covariance for Pooled OLS)")
            model = PanelOLS.from_formula(formula, data=data)
            final_cov_config = {'cov_type': 'robust'}
            formula_status = "Pooled Spec (Robust SE)"
        elif model_type == 'RE':
            model = RandomEffects.from_formula(formula, data=data)
            if 'cluster' in cov_config.get('cov_type','robust').lower():
                print("     (Note: Applying requested clustering to RE model)")
            else:
                final_cov_config = {'cov_type': 'robust'}
            formula_status = "RE Spec"
        elif model_type in ('FE_Entity', 'FE_TwoWay'):
            two_way = model_type == 'FE_TwoWay'
            effects = "EntityEffects + TimeEffects" if two_way else "EntityEffects"
            model = PanelOLS.from_formula(f"{dep} ~ {exog_formula} + {effects}", data=data, drop_absorbed=True)
            final_cov_config['cluster_time'] = two_way
            formula_status = "FE TwoWay Spec" if two_way else "FE Entity Spec"
        else:
            raise ValueError(f"Invalid model_type: {model_type}")

        results = model.fit(**final_cov_config)
        print(f"    -> Fit OK.")

        # Summary generation can fail separately; the fitted results are kept in that case.
        try:
            summary_obj = results.summary
            print(f"    -> Summary generation OK.")
            summary_dict[model_key] = summary_obj
            if model_type.startswith('FE'):
                summary_text = str(summary_obj)
                if 'Absorbed' in summary_text or 'dropped' in summary_text.lower():
                    print("    -> Warning: Summary indicates potential variable absorption/dropping.")
        except (np.linalg.LinAlgError, ValueError) as summary_err:
            error_msg = f"Error: Summary failed - {type(summary_err).__name__}: {summary_err} (Singular matrix likely)."
            print(f"    -> {error_msg}")
            summary_dict[model_key] = error_msg
            results_dict[model_key] = results
            formula_dict[model_key] = formula_status + " (Summary Failed)"
            return
        except Exception as gen_summary_err:
            error_msg = f"Error: Unexpected summary error - {type(gen_summary_err).__name__}: {gen_summary_err}"
            print(f"    -> {error_msg}")
            summary_dict[model_key] = error_msg
            results_dict[model_key] = results
            formula_dict[model_key] = formula_status + " (Summary Failed - Unknown)"
            return
    except (ValueError, np.linalg.LinAlgError, PerfectSeparationError, ZeroDivisionError) as fit_err:
        error_msg = f"Error: Fit failed - {type(fit_err).__name__}: {fit_err}"
        print(f"    -> {error_msg}")
    except Exception as e:
        error_msg = f"Error: Unexpected fit error - {type(e).__name__}: {e}"
        print(f"    -> {error_msg}")
        traceback.print_exc(limit=1)

    if results is None:
        # Fit never produced a results object.
        results_dict[model_key] = None
        summary_dict[model_key] = error_msg
        formula_dict[model_key] = formula_status + " (Fit Failed)"
    elif error_msg is None:
        results_dict[model_key] = results
        formula_dict[model_key] = formula_status + " (Success)"
    # results present together with an error message: nothing further is recorded.

# --- Run Main Models ---
# Pooled OLS uses the interaction formula; RE and both FE variants share the simple formula.
# Every fit reports into the main-run result/summary/status dicts.
if not (formula_pooled_interaction and formula_fe_re_simple):
    print("\n!!! Skipping main panel estimations: Missing required formulas. !!!")
else:
    _main_model_specs = (
        (formula_pooled_interaction, 'Pooled', 'Pooled_Interaction'),
        (formula_fe_re_simple, 'RE', 'RE_Simple'),
        (formula_fe_re_simple, 'FE_Entity', 'FE_Entity_Simple'),
        (formula_fe_re_simple, 'FE_TwoWay', 'FE_TwoWay_Simple'),
    )
    for _spec_formula, _spec_type, _spec_key in _main_model_specs:
        run_panel_model(
            _spec_formula, _spec_type, _spec_key, final_panel_data,
            results_dict=regression_results,
            summary_dict=model_summaries,
            formula_dict=model_formulas_used,
        )

# --- Run Sensitivity Analysis (No Imputation) ---
# Re-fits the Pooled interaction model and the entity-FE model on the un-imputed panel.
# When the run cannot happen, the reason is recorded under both sensitivity model keys.
if not RUN_WITHOUT_IMPUTATION_SENSITIVITY:
    print("\n--- Sensitivity Analysis Skipped (Configured Off). ---")
else:
    print("\n--- 6b. Panel Regression Analysis (Sensitivity Run - NO IMPUTATION) ---")
    if panel_data_no_imputation is None or panel_data_no_imputation.empty:
        print("    -> Skipping Sensitivity: Non-imputed dataset empty.")
        for _sens_key in ('Pooled_Interaction_Sens', 'FE_Entity_Simple_Sens'):
            sensitivity_regression_results[_sens_key] = None
            sensitivity_summaries[_sens_key] = "Skipped - Empty Data"
            sensitivity_formulas_used[_sens_key] = "Skipped - Empty Data"
    elif formula_pooled_interaction and formula_fe_re_simple:
        # The models need a (Ticker, Date) MultiIndex; build it here if it is not already set.
        panel_data_no_imputation_indexed = None
        if isinstance(panel_data_no_imputation.index, pd.MultiIndex):
            panel_data_no_imputation_indexed = panel_data_no_imputation.copy()
        elif {'Ticker', 'Date'}.issubset(panel_data_no_imputation.columns):
            panel_data_no_imputation_idx_temp = panel_data_no_imputation.copy()
            panel_data_no_imputation_idx_temp['Date'] = pd.to_datetime(panel_data_no_imputation_idx_temp['Date'])
            panel_data_no_imputation_indexed = (
                panel_data_no_imputation_idx_temp
                .set_index(['Ticker', 'Date'])
                .sort_index()
            )
        if panel_data_no_imputation_indexed is None:
            print("    -> Error: Cannot set index for sensitivity data. Skipping.")
            for _sens_key in ('Pooled_Interaction_Sens', 'FE_Entity_Simple_Sens'):
                sensitivity_summaries[_sens_key] = "Skipped - Index Error"
            for _sens_key in ('Pooled_Interaction_Sens', 'FE_Entity_Simple_Sens'):
                sensitivity_formulas_used[_sens_key] = "Skipped - Index Error"
        else:
            for _sens_formula, _sens_type, _sens_key in (
                (formula_pooled_interaction, 'Pooled', 'Pooled_Interaction_Sens'),
                (formula_fe_re_simple, 'FE_Entity', 'FE_Entity_Simple_Sens'),
            ):
                run_panel_model(
                    _sens_formula, _sens_type, _sens_key, panel_data_no_imputation_indexed,
                    results_dict=sensitivity_regression_results,
                    summary_dict=sensitivity_summaries,
                    formula_dict=sensitivity_formulas_used,
                )
    else:
        print("    -> Skipping Sensitivity: Missing required formulas.")
        for _sens_key in ('Pooled_Interaction_Sens', 'FE_Entity_Simple_Sens'):
            sensitivity_summaries[_sens_key] = "Skipped - No Formula"
        for _sens_key in ('Pooled_Interaction_Sens', 'FE_Entity_Simple_Sens'):
            sensitivity_formulas_used[_sens_key] = "Skipped - No Formula"

# ==============================================================================
# --- Step 7: Specification Tests & Interpretation ---
# ==============================================================================
print("\n--- 7. Specification Tests & Interpretation (using Main Run results) ---")
# Collected test outcomes and the model key chosen by the tests (none yet).
spec_test_results_list = []
preferred_model_key = None

# Main-run results; .get() yields None for any model that was never stored.
pooled_model = regression_results.get('Pooled_Interaction')
re_model = regression_results.get('RE_Simple')
fe_model = regression_results.get('FE_Entity_Simple')
fe_tw_model = regression_results.get('FE_TwoWay_Simple')

# A model counts as valid only if the stored object is of the expected results class.
is_pooled_valid = isinstance(pooled_model, PanelEffectsResults)
is_re_valid = isinstance(re_model, RandomEffectsResults)
is_fe_valid = isinstance(fe_model, PanelEffectsResults)
is_fe_tw_valid = isinstance(fe_tw_model, PanelEffectsResults)

try:
    print("\n    Comparing FE (Simple) vs RE (Simple) - Hausman Test:")
    if is_fe_valid and is_re_valid:
        try:
            common_params = list(set(fe_model.params.index) & set(re_model.params.index))
            if not common_params: print("      -> Skipping Hausman: No common parameters."); spec_test_results_list.append({'Test': 'Hausman (FE vs RE - Simple)', 'Details': 'No common parameters', 'P-value': '-', 'Conclusion': 'Cannot Run'})
            else:
                print("      -> Performing Hausman test via model comparison..."); comparison_fe_re = model_compare({"FE_Simple": fe_model, "RE_Simple": re_model})
                print(comparison_fe_re); hausman_pval_str = "See Table"; pval_num = np.nan
                try:
                    summary_str = str(comparison_fe_re); match = re.search(r"Hausman\s+([\d\.]+)", summary_str);
                    if match: pval_num = float(match.group(1)); hausman_pval_str = f"{pval_num:.4f}"
                except Exception as parse_err: print(f"       -> Warning: Could not parse Hausman p-value: {parse_err}")
                conclusion_hausman_test = 'Check Table';
                if not pd.isna(pval_num): conclusion_hausman_test = 'Prefer FE if Hausman p < 0.05'
                spec_test_results_list.append({'Test': 'Hausman (FE vs RE - Simple)', 'Details': 'Comparison table printed', 'P-value': hausman_pval_str, 'Conclusion': conclusion_hausman_test})
        except Exception as comp_e: print(f"      -> Error running Hausman comparison: {comp_e}"); spec_test_results_list.append({'Test': 'Hausman (FE vs RE - Simple)', 'Details': f"Error: {comp_e}", 'P-value': '-', 'Conclusion': 'Error'})
    else: details = "RE invalid" if is_fe_valid else "FE invalid" if is_re_valid else "Both invalid"; print(f"\n    Skipping Hausman test: {details}."); spec_test_results_list.append({'Test': 'Hausman (FE vs RE - Simple)', 'Details': details, 'P-value': '-', 'Conclusion': 'Cannot Run'})

    print("\n    F-test for Poolability (Entity Effects):")
    if is_fe_valid:
        try:
            if hasattr(fe_model, 'f_pooled'):
                f_pool = fe_model.f_pooled; stat_val = f_pool.stat; pval_val = f_pool.pval; df_num = getattr(f_pool, 'df_num', '?'); df_denom = getattr(f_pool, 'df_denom', '?')
                print(f"      F={stat_val:.4f}, P-value={pval_val:.4f} (df_num={df_num}, df_denom={df_denom})"); conclusion = 'Reject Pooling (Use FE)' if pval_val < 0.05 else 'Cannot Reject Pooling (Pooled OK)'
                spec_test_results_list.append({'Test': 'F-test (Poolability - Entity)', 'Details': f'F({df_num},{df_denom})={stat_val:.4f}', 'P-value': f'{pval_val:.4f}', 'Conclusion': conclusion})
            else: print("      -> Poolability F-stat (f_pooled) not available."); spec_test_results_list.append({'Test': 'F-test (Poolability - Entity)', 'Details': 'f_pooled unavailable', 'P-value': '-', 'Conclusion': 'Cannot Run'})
        except Exception as ftest_e: print(f"      -> Error accessing Poolability F-test: {ftest_e}"); spec_test_results_list.append({'Test': 'F-test (Poolability - Entity)', 'Details': f"Error: {ftest_e}", 'P-value': '-', 'Conclusion': 'Error'})
    else: print("      -> Skipping Poolability F-test: FE (Simple) model invalid."); spec_test_results_list.append({'Test': 'F-test (Poolability - Entity)', 'Details': 'FE Invalid', 'P-value': '-', 'Conclusion': 'Cannot Run'})

    print("\n    F-test for Time Effects:");
    if is_fe_tw_valid:
         try:
            if hasattr(fe_tw_model, 'f_test_time'):
                f_time = fe_tw_model.f_test_time; stat_val = f_time.stat; pval_val = f_time.pval; df_num = getattr(f_time, 'df_num', '?'); df_denom = getattr(f_time, 'df_denom', '?')
                print(f"      F={stat_val:.4f}, P-value={pval_val:.4f} (df_num={df_num}, df_denom={df_denom})"); conclusion = 'Time Effects Significant (Use Two-Way FE)' if pval_val < 0.05 else 'Time Effects Not Significant (Entity FE OK)'
                spec_test_results_list.append({'Test': 'F-test (Time Effects)', 'Details': f'F({df_num},{df_denom})={stat_val:.4f}', 'P-value': f'{pval_val:.4f}', 'Conclusion': conclusion})
            else: print("      -> Time Effects F-stat (f_test_time) not available."); spec_test_results_list.append({'Test': 'F-test (Time Effects)', 'Details': 'f_test_time unavailable', 'P-value': '-', 'Conclusion': 'Cannot Run'})
         except Exception as ftest_e: print(f"      -> Error accessing Time Effects F-test: {ftest_e}"); spec_test_results_list.append({'Test': 'F-test (Time Effects)', 'Details': f"Error: {ftest_e}", 'P-value': '-', 'Conclusion': 'Error'})
    else: print("      -> Skipping Time Effects F-test: Two-Way FE (Simple) model invalid."); spec_test_results_list.append({'Test': 'F-test (Time Effects)', 'Details': 'Two-Way FE Invalid', 'P-value': '-', 'Conclusion': 'Cannot Run'})
    spec_test_df = pd.DataFrame(spec_test_results_list)
except Exception as e: print(f"\nError during spec tests: {e}"); traceback.print_exc(); spec_test_df = pd.DataFrame()

print("\n--- Preferred Model Selection Logic (Main Run) ---")
# Defaults used when a test row is missing or the spec-test table is empty.
conclusion_pool = 'Cannot Run'
conclusion_time = 'Cannot Run'
conclusion_hausman = 'Cannot Run'
hausman_pvalue_text = None
if not spec_test_df.empty:
    pool_row = spec_test_df[spec_test_df['Test'] == 'F-test (Poolability - Entity)']
    conclusion_pool = pool_row['Conclusion'].iloc[0] if not pool_row.empty else 'Cannot Run'
    time_row = spec_test_df[spec_test_df['Test'] == 'F-test (Time Effects)']
    conclusion_time = time_row['Conclusion'].iloc[0] if not time_row.empty else 'Cannot Run'
    hausman_row = spec_test_df[spec_test_df['Test'] == 'Hausman (FE vs RE - Simple)']
    conclusion_hausman = hausman_row['Conclusion'].iloc[0] if not hausman_row.empty else 'Cannot Run'
    if not hausman_row.empty and 'P-value' in hausman_row.columns:
        hausman_pvalue_text = hausman_row['P-value'].iloc[0]
print(f"  - Poolability Test Result: '{conclusion_pool}'")
print(f"  - Hausman Test (Simple Models) Result: '{conclusion_hausman}'")
print(f"  - Time Effects Test Result: '{conclusion_time}'")

# 'Cannot Reject Pooling (Pooled OK)' contains the substring 'Reject Pooling', so a
# plain `in` check would treat a failure to reject as a rejection; anchor at the start.
pooling_rejected = str(conclusion_pool).startswith('Reject Pooling')

# Decide the Hausman outcome from the recorded p-value rather than from wording that
# may mention 'Prefer FE' without stating a decision. Non-numeric placeholders
# ('-', 'See Table') mean "no decision", i.e. FE is not preferred.
try:
    hausman_prefers_fe = float(hausman_pvalue_text) < 0.05
except (TypeError, ValueError):
    hausman_prefers_fe = False

preferred_model_key = None # Reset
if 'Time Effects Significant' in conclusion_time and is_fe_tw_valid:
    print("  - Logic: Time effects significant & Two-Way FE valid.")
    preferred_model_key = 'FE_TwoWay_Simple'
    print("  -> Tentative Preference: FE Two-Way (Simple).")
elif pooling_rejected and is_fe_valid:
    print("  - Logic: Pooling rejected & Entity FE valid.")
    if hausman_prefers_fe:
        print("  - Logic: Hausman also prefers FE.")
    else:
        print(f"  - Logic: Hausman ({conclusion_hausman}), but Poolability rejects Pooled. Prioritizing FE Entity.")
    # Both Hausman outcomes lead to Entity FE once pooling has been rejected.
    preferred_model_key = 'FE_Entity_Simple'
    print("  -> Tentative Preference: FE Entity (Simple).")
elif is_re_valid and not hausman_prefers_fe:
    print("  - Logic: FE not selected/valid, RE valid, Hausman doesn't strongly prefer FE.")
    preferred_model_key = 'RE_Simple'
    print("  -> Tentative Preference: RE (Simple).")
elif 'Cannot Reject Pooling' in conclusion_pool and is_pooled_valid:
    print("  - Logic: FE/RE models not selected/valid, Pooling allowed, Pooled OLS valid.")
    preferred_model_key = 'Pooled_Interaction'
    print("  -> Tentative Preference: Pooled OLS (Interaction).")

if preferred_model_key is None:
    # Fallback: first model (most to least restrictive) with a usable fit and summary.
    print("\n  - No model selected by primary logic. Applying fallback...")
    fallback_order = ['FE_TwoWay_Simple', 'FE_Entity_Simple', 'RE_Simple', 'Pooled_Interaction']
    for model_key_fb in fallback_order:
        if model_key_fb in regression_results and isinstance(regression_results.get(model_key_fb), (PanelEffectsResults, RandomEffectsResults)):
            summary_val = model_summaries.get(model_key_fb)
            # A string stored as the summary is an error message, not a summary object.
            if not isinstance(summary_val, str):
                preferred_model_key = model_key_fb
                print(f"  -> Fallback Selection: '{preferred_model_key}'.")
                break
            else:
                print(f"      -> Skipping fallback {model_key_fb} (summary failed).")

# Final sanity checks: the chosen key must map to a real fit with a real summary.
if preferred_model_key and (preferred_model_key not in regression_results or not isinstance(regression_results.get(preferred_model_key), (PanelEffectsResults, RandomEffectsResults))):
    print(f"  -> ERROR: Preferred model '{preferred_model_key}' not valid. Resetting.")
    preferred_model_key = None
if preferred_model_key and preferred_model_key in model_summaries and isinstance(model_summaries[preferred_model_key], str):
    print(f"  -> ERROR: Preferred model '{preferred_model_key}' failed summary. Resetting.")
    preferred_model_key = None
if preferred_model_key is None:
    print("\n  -> !!! CRITICAL: No valid model results available after fallback. !!!")
print(f"\n---> Final Preferred Model Selected (Main Run): {preferred_model_key if preferred_model_key else 'None (Review Logs)'} <---")

# Announce which model the interpretation is based on; the generic label
# 'Available Models' is shown when no preferred model key is set.
print(f"\n--- Interpretation (Based on '{preferred_model_key if preferred_model_key else 'Available Models'}') ---")
# Tracks whether an interpretation was printed; later code reads this flag to
# report when no interpretation could be produced.
interpretation_provided = False
def get_coeff_info(results, var_name):
    """Format one coefficient of a fitted model as a single-line string.

    Parameters:
        results: object exposing ``params``, ``pvalues`` and ``std_errors`` as
            label-indexed collections supporting ``.get`` (e.g. pandas Series).
        var_name: label of the coefficient to report.

    Returns:
        ``"Coeff=..., SE=..., Pval=... <marker>"`` with four decimals each, where the
        marker is ``***`` (<0.001), ``**`` (<0.01), ``*`` (<0.05), ``.`` (<0.1) or
        empty; or a "not found or results invalid" message when ``results`` is
        falsy, lacks ``params``, or does not contain ``var_name``.
    """
    missing_message = f"Variable '{var_name}' not found or results invalid."
    if not results or not hasattr(results, 'params'):
        return missing_message
    if var_name not in results.params.index:
        return missing_message

    estimate = results.params.get(var_name)
    p_value = results.pvalues.get(var_name)
    std_error = results.std_errors.get(var_name)

    # First threshold the p-value falls under wins; none (or NaN) leaves the marker empty.
    marker = ''
    for cutoff, symbol in ((0.001, '***'), (0.01, '**'), (0.05, '*'), (0.1, '.')):
        if p_value < cutoff:
            marker = symbol
            break
    return f"Coeff={estimate:.4f}, SE={std_error:.4f}, Pval={p_value:.4f} {marker}"

# Interpret the preferred main-run model when it is a usable fit; otherwise fall back
# to the first usable sensitivity (no-imputation) model, if that run is enabled.
if preferred_model_key and preferred_model_key in regression_results and isinstance(regression_results[preferred_model_key], (PanelEffectsResults, RandomEffectsResults)):
    preferred_model_results = regression_results[preferred_model_key]
    formula_used_key = model_formulas_used.get(preferred_model_key, "Unknown")
    print(f"    Interpreting Preferred Model: '{preferred_model_key}' (Formula Type: {formula_used_key})")
    if primary_esg_var:
        print(f"\n    Interpretation for '{primary_esg_var}':")
        if preferred_model_key == 'Pooled_Interaction' and any(':' in p for p in preferred_model_results.params.index):
            # Recover the reference category from the Treatment(reference='...') term.
            ref_cat_used = 'Unknown'
            formula_string_to_parse = formula_pooled_interaction
            if formula_string_to_parse:
                try:
                    match = re.search(r"Treatment\(reference='([^']+)'\)", formula_string_to_parse)
                    ref_cat_used = match.group(1) if match else 'Unknown'
                except Exception:
                    # Best effort only: an unparsable formula keeps 'Unknown'.
                    pass
            print(f"      Model includes interactions (Ref Cat: '{ref_cat_used}')")
            print(f"      - Baseline Effect ({ref_cat_used}): {get_coeff_info(preferred_model_results, primary_esg_var)}")
            print("      - (Interactions: See Pooled summary)")
        elif preferred_model_key in ['FE_Entity_Simple', 'FE_TwoWay_Simple', 'RE_Simple']:
            effects_description = "Unknown"
            if 'FE_Entity' in preferred_model_key:
                effects_description = "Entity Fixed"
            elif 'FE_TwoWay' in preferred_model_key:
                effects_description = "Entity & Time Fixed"
            elif 'RE' in preferred_model_key:
                effects_description = "Random Entity"
            print(f"      Model estimates avg effect controlling for {effects_description} effects.")
            print(f"      - Average Effect: {get_coeff_info(preferred_model_results, primary_esg_var)}")
        else:
            print(f"      - Unknown structure. Basic Effect: {get_coeff_info(preferred_model_results, primary_esg_var)}")
    else:
        print("    -> Primary ESG variable not specified/found.")
    print("\n    Interpretation Notes:")
    print(f"      - Results based on '{preferred_model_key}'. Check Pval.")
    if IMPUTE_DATA and initial_missing_stats:
        print("      - !!! CAVEAT: Uses imputed data. High initial missingness. Compare w/ Sensitivity. !!!")
    print("      - Consider economic significance.")
    interpretation_provided = True
elif RUN_WITHOUT_IMPUTATION_SENSITIVITY:
    print("\n    -> Main run failed/not selected. Checking Sensitivity Run...")
    sens_key_to_interpret = None
    sens_results = None
    sens_pref_order = ['FE_Entity_Simple_Sens', 'Pooled_Interaction_Sens']
    for key in sens_pref_order:
        result_obj = sensitivity_regression_results.get(key) # Check result object
        summary_obj_or_err = sensitivity_summaries.get(key) # Check summary object/error
        # Usable = a real fit whose stored summary is not an error string.
        if isinstance(result_obj, (PanelEffectsResults, RandomEffectsResults)) and not isinstance(summary_obj_or_err, str):
            sens_key_to_interpret = key
            sens_results = result_obj
            break
    if sens_key_to_interpret and sens_results:
        print(f"    -> Interpreting Sensitivity Model: '{sens_key_to_interpret}' (NO IMPUTATION) as fallback.")
        print("       !!! CAVEATS: Smaller subset of data. !!!")
        if primary_esg_var:
            print(f"       - ESG Effect ('{primary_esg_var}'): {get_coeff_info(sens_results, primary_esg_var)}")
        else:
            print("       - Primary ESG variable not found.")
        # Set the flag for both branches: previously it was only set when the ESG
        # variable was missing, so a successful interpretation was followed by the
        # "No valid model results" message.
        interpretation_provided = True
    else:
        print("    -> Sensitivity models also failed or were skipped.")
if not interpretation_provided:
    print("\n    -> No valid model results found to provide interpretation.")

# ==============================================================================
# --- Step 9: Print Consolidated Results to Console ---
# ==============================================================================
print("\n\n==============================================================================")
print(f"--- 9. Consolidated Analysis Results ({SCRIPT_VERSION}) ---")
print("==============================================================================")


def _print_model_summaries(ordered_names, summaries, formulas, name_in_unknown_msg=False):
    """Print the stored summary (or failure message) for each model, in order.

    Parameters:
        ordered_names: model keys in the order they should be printed; keys that are
            absent from ``summaries`` are skipped silently.
        summaries: dict mapping model key to either an error string or a summary
            object (expected to expose ``tables``).
        formulas: dict mapping model key to the formula label that was used.
        name_in_unknown_msg: when True, the "unknown result status" message also
            names the model (the wording used for the sensitivity run).
    """
    for name in ordered_names:
        if name not in summaries:
            continue
        result_or_error_summary = summaries[name]
        formula_used_key = formulas.get(name, "Unknown")
        print(f"\n--- Model: {name} ---")
        print(f"    Formula Type Used: {formula_used_key}")
        if isinstance(result_or_error_summary, str):
            print(f"    -> Model Failed/Summary Error: {result_or_error_summary}")
        elif hasattr(result_or_error_summary, 'tables'):
            try:
                if isinstance(result_or_error_summary.tables, list):
                    for table in result_or_error_summary.tables:
                        print(table.as_text())
                else:
                    print(str(result_or_error_summary))
            except Exception as e:
                print(f"    -> Error formatting/printing summary tables for '{name}': {e}")
                # The plain-string rendering is a fallback only; it used to run
                # unconditionally, which printed every successful summary twice.
                try:
                    print(str(result_or_error_summary))
                except Exception:
                    print("    -> Could not even convert summary object to string.")
        elif name_in_unknown_msg:
            print(f"    -> Unknown result status stored for {name} (Type: {type(result_or_error_summary)}): {result_or_error_summary}")
        else:
            print(f"    -> Unknown result status stored (Type: {type(result_or_error_summary)}): {result_or_error_summary}")


print(f"\n--- Panel Model Summaries (Main Run - Imputed: {IMPUTE_DATA}) ---")
if not model_summaries:
    print("No models run/attempted.")
else:
    model_order = ['Pooled_Interaction', 'RE_Simple', 'FE_Entity_Simple', 'FE_TwoWay_Simple']
    _print_model_summaries(model_order, model_summaries, model_formulas_used)

if RUN_WITHOUT_IMPUTATION_SENSITIVITY:
    print("\n\n--- Panel Model Summaries (Sensitivity Run - NO IMPUTATION) ---")
    if not sensitivity_summaries:
        print("No sensitivity models run/results available.")
    else:
        sens_model_order = ['Pooled_Interaction_Sens', 'FE_Entity_Simple_Sens']
        _print_model_summaries(sens_model_order, sensitivity_summaries, sensitivity_formulas_used, name_in_unknown_msg=True)

# --- Preferred model recap and its sensitivity counterpart, if one exists ---
print("\n\n--- Preferred Model Selection & Comparison (Main Run) ---")
print(f"  -> Preferred model selected: {preferred_model_key if preferred_model_key else 'None (Review Logs)'}")
print("  -> Review specification tests and theory.")
if preferred_model_key and RUN_WITHOUT_IMPUTATION_SENSITIVITY:
    # Only the pooled and entity-FE models have a no-imputation counterpart.
    sens_key_map = {
        'Pooled_Interaction': 'Pooled_Interaction_Sens',
        'FE_Entity_Simple': 'FE_Entity_Simple_Sens',
        'FE_TwoWay_Simple': None,
        'RE_Simple': None,
    }
    sens_key = sens_key_map.get(preferred_model_key)
    if not sens_key:
        print(f"  -> COMPARISON NOTE: No sensitivity counterpart defined for '{preferred_model_key}'.")
    elif sens_key not in sensitivity_summaries:
        print(f"  -> COMPARISON NOTE: Sensitivity counterpart '{sens_key}' not found in sensitivity results.")
    else:
        sens_result_or_error = sensitivity_summaries[sens_key]
        # A string stored as the summary is the failure/skip reason.
        if isinstance(sens_result_or_error, str):
            print(f"  -> COMPARISON NOTE: Sensitivity counterpart '{sens_key}' failed/skipped ({sens_result_or_error}).")
        else:
            print(f"  -> COMPARISON: Sensitivity '{sens_key}' ran successfully. Compare results.")

# --- Pointer back to the interpretation printed earlier ---
print("\n--- Interpretation Summary (Refer to Step 7) ---")
if not interpretation_provided:
    print("  -> No interpretation provided (model failures).")
else:
    print("  -> Interpretation provided in Step 7.")

# --- Specification test table ---
print("\n--- Specification Tests Summary (Main Run) ---")
if 'spec_test_df' in locals() and isinstance(spec_test_df, pd.DataFrame) and not spec_test_df.empty:
    try:
        print(spec_test_df.to_string(index=False, justify='left', max_colwidth=60))
    except Exception as print_err:
        # Fall back to the default frame rendering if the formatted one fails.
        print(f"  -> Print error: {print_err}")
        print(spec_test_df)
else:
    print("  -> Specification tests unavailable.")

# --- Variance inflation factors, highest first ---
print("\n--- VIF Results (Main Run Data) ---")
print("  (VIF > 10 indicates potential issues)")
if isinstance(vif_results_total, pd.DataFrame):
    print("\n  VIF (Factors + Total ESG):")
    print(vif_results_total.sort_values('VIF', ascending=False).to_string(index=False))
else:
    print("\n  VIF check (Total ESG) N/A or failed.")
if isinstance(vif_results_components, pd.DataFrame):
    print("\n  VIF (Factors + ESG Components):")
    print(vif_results_components.sort_values('VIF', ascending=False).to_string(index=False))
else:
    print("\n  VIF check (Components) N/A or failed.")

print("\n\n--- Overall Reliability Assessment & Disclaimers ---")
print("  - Data Quality: Verify sources (Steps 2 & 3 logs).")
print(f"  - Imputation ({'MICE' if IMPUTE_DATA else 'Disabled'}): Main run {'used' if IMPUTE_DATA else 'did not use'} imputation.")
# Columns whose initial missing percentage exceeded 25; always defined so later
# code can rely on the name regardless of the imputation setting.
high_missing_cols = []
if IMPUTE_DATA and initial_missing_stats:
    high_missing_cols = [col for col, pct in initial_missing_stats.items() if pct > 25]
    if high_missing_cols:
        print(f"    -> !!! CONCERN: High initial missingness (>25%): {high_missing_cols}.")
if IMPUTE_DATA:
    print("    -> Compare Main Run vs. Sensitivity Run (if successful).")

print("  - Model Specification & Validity:")
# A string stored as a summary is an error message, i.e. a failed model.
main_models_failed_count = sum(1 for res in model_summaries.values() if isinstance(res, str))
if not model_summaries:
    # Without this guard 0 == 0 would be reported as "all models failed".
    print("    -> Warning: No main run models were attempted. Review Step 6 logs.")
elif main_models_failed_count == len(model_summaries):
    print("    -> !!! CRITICAL: All main run models failed. Review Step 6 logs. !!!")
elif main_models_failed_count > 0:
    print(f"    -> Warning: {main_models_failed_count} main run model(s) failed/summary error. Review logs.")

# Check specific model failures
if 'Pooled_Interaction' in model_summaries and isinstance(model_summaries['Pooled_Interaction'], str):
    print("    -> Warning: Pooled OLS model failed or had summary error.")
elif 'Pooled_Interaction' in regression_results and isinstance(regression_results['Pooled_Interaction'], PanelEffectsResults):
    try:
        pooled_f_robust = regression_results['Pooled_Interaction'].f_statistic_robust.stat
        # A non-finite or negative F statistic is treated as a sign of a bad SE calculation.
        if not np.isfinite(pooled_f_robust) or pooled_f_robust < 0:
            print("    -> Warning: Pooled OLS robust F-stat appears invalid. Check SE calculation.")
    except Exception:
        # Reported rather than raised: this is a diagnostic, not a required step.
        print("    -> Warning: Could not access Pooled OLS robust F-statistic.")
if 'RE_Simple' in model_summaries and isinstance(model_summaries['RE_Simple'], str):
    print("    -> Warning: RE model failed. Hausman test invalid.")

fe_models_to_check = ['FE_Entity_Simple', 'FE_TwoWay_Simple']
for fe_key in fe_models_to_check:
    if fe_key in model_summaries:
        summary_val = model_summaries[fe_key]
        if isinstance(summary_val, str) and ('Singular' in summary_val or 'Error' in summary_val):
            print(f"    -> Warning: {fe_key} failed summary ({summary_val[:60]}...).")
        elif not isinstance(summary_val, str) and (hasattr(summary_val, 'tables') or hasattr(summary_val, 'summary')):
            # Stored summaries are handled elsewhere as objects exposing `tables`;
            # requiring a `summary` attribute alone meant this check never ran for them.
            summary_str = str(summary_val)
            if 'Absorbed' in summary_str or 'dropped' in summary_str.lower():
                print(f"    -> Warning: {fe_key} summary indicates absorbed/dropped vars.")

print("    -> Note: FE/RE models used simplified spec (no category interactions). Pooled OLS used interactions.")
print("    -> Review model selection (Step 7) and theoretical fit.")
print("  - Data Characteristics: Check VIF results and Step 4 category warnings.")
print("\n  --- Conclusion ---")
print("  -> Treat findings with caution. Prioritize successful Sensitivity Run if Main had issues/imputation.")

print("\n==============================================================================")
print("--- End of Consolidated Results ---")
print("==============================================================================")

print(f"\n--- Script Finished ({SCRIPT_VERSION}) ---")

--- E-commerce ESG Impact Analysis Script Started (Panel Only v13 - Final Check & Error Handling) ---
Tickers: ['AMZN', 'BABA', 'JD', 'EBAY', 'WMT', 'SE', 'MELI', 'PDD', 'ETSY', 'ZAL.DE', 'ALE.WA', 'TGT', '4755.T']
Analysis Period: 2020-01-01 to 2024-12-31
ESG Lag: 1 months
Imputation Enabled (Main Run - MICE): True
Run Sensitivity without Imputation: True
Factors Path: gd_Developed_5_Factors.csv (Source/Quality Not Verified by Script)
ESG Data Path: historic_esg_scores_ecommerce.csv (Source/Quality Not Verified by Script)

--- 1. Downloading and Preparing Stock Returns ---
  -> Stock price data processed for 13 tickers: ['4755.T', 'ALE.WA', 'AMZN', 'BABA', 'EBAY', 'ETSY', 'JD', 'MELI', 'PDD', 'SE', 'TGT', 'WMT', 'ZAL.DE']
  -> Stock monthly returns prepared: 2019-10-31 to 2024-12-31

--- 2. Loading and Preparing Factors Data ---
  -> Attempting to convert factor index 'Date' to datetime...
    -> Converted index using pandas default.
  -> Converting factor columns to numeric...
  -> I

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if {'Ticker', 'Date'}.issubset(panel_data_no_imputation.columns): panel_data_no_imputation['Date'] = pd.to_datetime(panel_data_no_imputation['Date']); panel_data_no_imputation = panel_data_no_imputation.set_index(['Ticker', 'Date']).sort_index(); print(f"  -> Sensitivity panel ready (No Imputation): {len(panel_data_no_imputation)} obs")


    -> Fit OK.
    -> Summary generation OK.

--- 6b. Panel Regression Analysis (Sensitivity Run - NO IMPUTATION) ---

  --- Fitting Pooled_Interaction_Sens (Pooled) ---
    Using Formula: ExcessReturn ~ 1 + cma + hml + mkt_rf + mom + rmw + smb + esg_total_score_lag1 * C(ESG_Category_lag1, Treatment(reference='Middle'))
     (Note: Using robust covariance for Pooled OLS)
    -> Fit OK.
    -> Summary generation OK.

  --- Fitting FE_Entity_Simple_Sens (FE_Entity) ---
    Using Formula: ExcessReturn ~ 1 + cma + hml + mkt_rf + mom + rmw + smb + esg_total_score_lag1
    -> Fit OK.
    -> Summary generation OK.

--- 7. Specification Tests & Interpretation (using Main Run results) ---

    Comparing FE (Simple) vs RE (Simple) - Hausman Test:
      -> Performing Hausman test via model comparison...
                      Model Comparison                     
                                FE_Simple         RE_Simple
-----------------------------------------------------------
Dep. Variable   

In [12]:
    # NOTE(review): this cell contains only an indented fragment; the same try/except
    # appears in the reliability section of the main script cell. Run as a standalone
    # cell, the leading indentation would presumably raise an IndentationError, and
    # `regression_results` must already exist in the kernel. Likely a leftover
    # debugging cell -- confirm and consider deleting it.
    try:
        # Read the robust F-statistic value stored on the pooled model result.
        pooled_f_robust = regression_results['Pooled_Interaction'].f_statistic_robust.stat
        # CORRECTED INDENTATION
        # Warn when the statistic is non-finite or negative.
        if not np.isfinite(pooled_f_robust) or pooled_f_robust < 0 :
             print("    -> Warning: Pooled OLS robust F-stat appears invalid. Check SE calculation.")
    except Exception:
        # Any failure while reading the statistic is reported instead of raised.
        print("    -> Warning: Could not access Pooled OLS robust F-statistic.")
        pass