<a href="https://colab.research.google.com/github/henryonomakpo/The-Impact-of-ESG-Ratings-on-EV-Manufacturing-Industry/blob/main/Automated_Valuation_Models_and_Panel_Regression_for_Housing_Price_Index_and_ESG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Exploratory Data Analysis: Clean ESG_global dataset and save in CSV

In [3]:
pip install openpyxl



In [1]:
import pandas as pd
import re # Import regular expression module for cleaning

def clean_col_name(name):
    """Normalise an indicator label into a snake_case column name.

    The label is lower-cased; runs of separators (space, '/', parentheses,
    ':', ',', '-') become one underscore; every other non-word character
    (for example '%') is removed; edge underscores are trimmed and interior
    runs collapsed. Finally 'percent' is shortened to 'pct' and the filler
    '_of_' is reduced to a single underscore.
    """
    # Separator runs -> single underscore, then drop whatever is still not a
    # word character.
    cleaned = re.sub(r'[ /():,-]+', '_', name.lower())
    cleaned = re.sub(r'[^\w_]+', '', cleaned)
    # Trim the edges first, then squeeze underscore runs left behind by the
    # removal step above.
    cleaned = re.sub(r'_+', '_', cleaned.strip('_'))
    # Literal substitutions, applied once each in this fixed order.
    for old, new in (('percent', 'pct'), ('_of_', '_')):
        cleaned = cleaned.replace(old, new)
    return cleaned

# --- Configuration ---
input_csv_path = '/content/WB_ESG_global.csv'
output_csv_path = 'WB_ESG_wide_format_for_hedonic.csv'
# Field delimiter of the input file. Change it here (e.g. '\t' or ';') if the
# single-column warning below is printed.
separator = ','

# --- Load Data ---
try:
    df = pd.read_csv(input_csv_path, sep=separator)
except FileNotFoundError:
    print(f"Error: File not found at {input_csv_path}")
    exit()
except Exception as e:
    print(f"An error occurred while loading the CSV: {e}")
    print("Please ensure the file path and separator are correct.")
    exit()
else:
    print(f"Successfully loaded {input_csv_path}")

    # A frame with a single column usually means the delimiter did not match
    # the file, so the whole header was read as one field.
    column_count = df.shape[1]
    if column_count <= 1:
        print("\nWarning: DataFrame loaded with only one column.")
        print("Please double-check the separator used in the file.")
        print("Header found:", df.columns.tolist())

    print("Original DataFrame shape:", df.shape)
    print("Original columns:", df.columns.tolist())

# --- Transformation ---

# 1. Melt the Year Columns
id_vars = ['Country Name', 'ESG_indicators']
# Year columns are recognised by name only: all digits and exactly 4 long.
year_columns = [name for name in df.columns if name.isdigit() and len(name) == 4]
if not year_columns:
    print("\nError: No year columns found after loading.")
    print("Please verify the column names in your CSV file header.")
    print("Columns found by pandas:", df.columns.tolist())
    exit()  # Nothing to reshape without year columns

print(f"\nMelting years: {year_columns}")
# One row per (country, indicator, year); the former column header lands in
# 'Year' and the cell content in 'Value'.
df_long = df.melt(
    id_vars=id_vars,
    value_vars=year_columns,
    var_name='Year',
    value_name='Value',
)

# The melted headers are strings; make 'Year' numeric.
df_long = df_long.assign(Year=pd.to_numeric(df_long['Year']))

# 2. Pivot the Indicator Column
print("Pivoting data...")
try:
    # Keep only the first row for each (country, year, indicator) triple so
    # that every cell of the wide table has a single source row.
    df_long = df_long.drop_duplicates(
        subset=['Country Name', 'Year', 'ESG_indicators'],
        keep='first',
    )

    # Indicators become columns; (country, year) identifies a row.
    df_wide = df_long.pivot_table(
        index=['Country Name', 'Year'],
        columns='ESG_indicators',
        values='Value',
        aggfunc='first',
    ).reset_index()
    print("Pivoting successful.")
except Exception as e:
    print(f"An error occurred during pivoting: {e}")
    print("This might happen if there are duplicate combinations of Country Name, Year, and ESG_indicators.")
    exit()

# 3. Clean Column Names
print("Cleaning column names...")
original_columns = df_wide.columns.tolist()
# Map every header to its cleaned form; the two identifier columns keep their
# original spelling.
cleaned_columns = {
    col: (col if col in ('Country Name', 'Year') else clean_col_name(col))
    for col in original_columns
}

df_wide = df_wide.rename(columns=cleaned_columns)
print("Column names cleaned.")
# Preview the first ten headers as a quick sanity check.
print("\nExample cleaned column names:", list(df_wide.columns[:10]))

# --- Save Transformed Data ---
try:
    df_wide.to_csv(output_csv_path, index=False)
except Exception as e:
    # Best effort: report the failure and still show the preview below.
    print(f"An error occurred while saving the CSV: {e}")
else:
    print(f"\nTransformed data saved successfully to {output_csv_path}")
    print("Transformed DataFrame shape:", df_wide.shape)

# --- Display Head ---
print("\nFirst 5 rows of the transformed data:")
print(df_wide.head(5))

Successfully loaded /content/WB_ESG_global.csv
Original DataFrame shape: (6936, 18)
Original columns: ['Country Name', 'ESG_indicators', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']

Melting years: ['2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']
Pivoting data...
Pivoting successful.
Cleaning column names...
Column names cleaned.

Example cleaned column names: ['Country Name', 'Year', 'coastal_protection', 'control_corruption_estimate', 'economic_and_social_rights_performance_score', 'electricity_production_from_coal_sources_total', 'energy_imports_net_energy_use', 'energy_intensity_level_primary_energy_mj_2017_ppp_gdp', 'energy_use_kg_oil_equivalent_per_capita', 'fertility_rate_total_births_per_woman']

Transformed data saved successfully to WB_ESG_wide_format_for_hedonic.csv
Transformed DataFrame shape: (3280, 36)

First 5 

### Explanation of the Reshaping (Step 5):

### Set Index: merged_data.set_index(['Country', 'Year'], inplace=True) turns the 'Country' and 'Year' columns into a MultiIndex. This is necessary for the stack() operation.

### Stack: merged_data.stack() takes all the remaining columns (the indicators + HousePriceIndex) and "stacks" them, moving them from columns into the index. The result (stacked_data) is a Pandas Series with a three-level MultiIndex: (Country, Year, IndicatorName).

### Create New Index Names: A list comprehension [f"{country}_{year}_{indicator}" for (country, year, indicator) in stacked_data.index] iterates through the three levels of the MultiIndex of the stacked_data Series and creates the desired combined string format for each entry (e.g., 'Australia_2008_coastal_protection').

### Assign New Index: stacked_data.index = new_index replaces the MultiIndex of the Series with the new list of combined string names. Now the Series has a simple index where each label is the desired final column name.

### Convert to DataFrame and Transpose:

### stacked_data.to_frame() converts the Series into a DataFrame with one column (column name 0).

### .T transposes this DataFrame. Now the index labels (our desired column names) become the columns, and the single column of values becomes the single row of the DataFrame.

### Reset Index: transposed_df.reset_index(drop=True, inplace=True) discards the row label (0) left over from the to_frame().T step and replaces it with a fresh default index, leaving a clean single-row DataFrame.

### Fill missing values for each country's ESG indicators.
### Merge the datasets.
### Save the final CSV.

In [1]:
import pandas as pd
import io
import numpy as np

# 1. Read the data from the CSV file, trying COMMA first and TAB as fallback.
file_path = '/content/ESG_indicators.csv'
try:
    df_raw = pd.read_csv(file_path, sep=',')
    # pandas does not raise when the separator is wrong: it returns a frame
    # with one wide column. Treat that as a failure so the tab fallback below
    # actually runs for tab-separated files.
    if df_raw.shape[1] <= 1:
        raise ValueError("only one column was parsed; the file does not look comma-separated")
    print(f"Successfully read CSV file: {file_path} (using comma separator).")
except FileNotFoundError:
    print(f"Error: File not found at {file_path}. Please ensure the file exists.")
    exit()
except Exception as e:
    print(f"Error reading CSV file with comma separator: {e}")
    print("Trying to read with tab separator as a fallback...")
    try:
        df_raw = pd.read_csv(file_path, sep='\t')
        print(f"Successfully read CSV file: {file_path} (using TAB separator).")
    except Exception as e2:
        print(f"Error reading CSV file with TAB separator as well: {e2}")
        print("Please verify the separator used in your CSV file (comma or tab).")
        exit()

# Shared post-processing for whichever separator succeeded.
print("Columns found:", df_raw.columns.tolist())
# Headers may carry stray padding (e.g. 'Country '); strip it so the name
# checks further down match.
df_raw.columns = df_raw.columns.str.strip()
print("Columns after stripping whitespace:", df_raw.columns.tolist())


# Work on a copy so df_raw stays untouched.
df = df_raw.copy()

# ----- Data Cleaning and Preparation -----
# Check that the 'Year' column exists (headers are expected to be stripped).
year_col_name = 'Year'  # The expected name
if year_col_name not in df.columns:
    print(f"Error: Column '{year_col_name}' not found in the DataFrame columns: {df.columns.tolist()}")
    print("Please check the actual column name in your CSV header.")
    exit()

# Ensure 'Year' is integer for correct column naming.
try:
    # Parse once and reuse the parsed values for both the row filter and the
    # integer cast. Casting the raw column instead would fail on strings such
    # as "2008.0" even though they passed the numeric filter.
    year_numeric = pd.to_numeric(df[year_col_name], errors='coerce')
    valid_year = year_numeric.notna()
    # .copy() makes the filtered frame independent, so the assignments below
    # do not write into a slice of another frame.
    df = df.loc[valid_year].copy()
    if df.empty:
        print(f"Error: No valid numeric years found in the '{year_col_name}' column after cleaning.")
        exit()
    # to_numpy() assigns by position; the values come from the same mask, so
    # they line up with the kept rows.
    df[year_col_name] = year_numeric[valid_year].astype(int).to_numpy()
except ValueError as e:
    print(f"Error converting '{year_col_name}' column to integer after cleaning: {e}. Please check input data.")
    exit()  # Stop execution if Year cannot be converted

# Identify the key columns and the indicator (value) columns.
country_col_name = 'Country'  # The expected name
if country_col_name not in df.columns:
    print(f"Error: Column '{country_col_name}' not found in the DataFrame columns: {df.columns.tolist()}")
    print("Please check the actual column name in your CSV header.")
    exit()

id_vars = [country_col_name, year_col_name]
# Everything that is not an identifier is treated as an indicator.
value_vars = [name for name in df.columns if name not in id_vars]
if len(value_vars) == 0:
    print("Error: No value columns found (indicators other than Country and Year).")
    exit()

# Force indicators to numeric; unparseable cells become NaN.
for indicator in value_vars:
    df[indicator] = pd.to_numeric(df[indicator], errors='coerce')

# ----- Pivoting -----
# Report (but do not remove) rows that share the same Country-Year key.
is_duplicated = df.duplicated(subset=id_vars, keep=False)
duplicates = df[is_duplicated]
if not duplicates.empty:
    print(f"Warning: {len(duplicates)} duplicate Country-Year entries found. Pivot table will use the first valid occurrence.")

try:
    # One row per country; columns become an (indicator, year) MultiIndex.
    # aggfunc='first' resolves the duplicate keys reported above.
    df_pivoted = pd.pivot_table(
        df,
        index=country_col_name,
        columns=year_col_name,
        values=value_vars,
        aggfunc='first',
    )
except Exception as e:
    print(f"Error during pivoting: {e}")
    exit()

# ----- Flattening Columns -----
# Collapse each (indicator, year) column pair into a single
# "<indicator>_<year>" label.
new_columns = []
for indicator, year in df_pivoted.columns:
    new_columns.append(f"{indicator}_{int(year)}")
df_pivoted.columns = new_columns

# ----- Final Formatting -----
# Turn the 'Country' index back into an ordinary column.
df_pivoted.reset_index(inplace=True)

# Column ordering: 'Country' first, then by indicator name, then by year.
def sort_key(col_name):
    """Return the sort key for a flattened "<indicator>_<year>" column.

    The country column maps to ('', -1). A name whose text after the last
    underscore parses as an integer maps to (indicator, year). Any other
    name maps to (col_name, 0).
    """
    if col_name == country_col_name:
        return ('', -1)  # Ensure Country comes first
    stem, underscore, tail = col_name.rpartition('_')
    if not underscore:
        # No underscore at all: there is no year suffix to split off.
        return (col_name, 0)
    try:
        return (stem, int(tail))
    except ValueError:
        # Suffix is not a year (e.g. "abc_def"): sort by the full name.
        return (col_name, 0)

ordered_columns = sorted(df_pivoted.columns, key=sort_key)
df_pivoted = df_pivoted[ordered_columns]


# ----- Output -----
# Preview: the country column plus the 201x columns of two indicators.
print("\nPivoted DataFrame head (showing selected indicator columns):")
# These prefixes have to match the indicator names present in the input file.
control_corr_prefix = 'control_corruption_estimate'
voice_acc_prefix = 'voice_and_accountability_estimate'

cols_to_show = [country_col_name]
for prefix in (control_corr_prefix, voice_acc_prefix):
    cols_to_show += [c for c in df_pivoted.columns if c.startswith(f"{prefix}_201")]

# Let pandas print every selected column on one wide line.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

try:
    # Keep only the requested columns that are present in the pivoted frame.
    existing_cols_to_show = [c for c in cols_to_show if c in df_pivoted.columns]
    if existing_cols_to_show in ([], [country_col_name]):
        print("Selected columns for display not found/available. Showing first few columns instead:")
        print(df_pivoted.head())
    else:
        # Cap the number of printed columns for readability.
        max_cols_print = 20
        if len(existing_cols_to_show) > max_cols_print:
            print(f"(Displaying first {max_cols_print} selected columns)")
            existing_cols_to_show = existing_cols_to_show[:max_cols_print]
        print(df_pivoted[existing_cols_to_show].head())

except KeyError as e:
    print(f"\nWarning: Could not display selected columns due to error: {e}")
    print("Showing available columns' head instead:")
    print(df_pivoted.head())
except Exception as e:
    print(f"\nAn unexpected error occurred during display: {e}")
    print("Showing available columns' head instead:")
    print(df_pivoted.head())


# Persist the reshaped table.
# NOTE: the file carries a .csv extension but is written TAB-separated; the
# later cells read it back with sep='\t'. Change sep here (and in the readers)
# if comma-separated output is wanted.
output_filename = 'ESG_indicators_wide_by_year.csv'
try:
    df_pivoted.to_csv(output_filename, sep='\t', na_rep='', index=False)
except Exception as e:
    print(f"\nError saving the file '{output_filename}': {e}")
else:
    print(f"\nSuccessfully reshaped data and saved to '{output_filename}' (using TAB separator).")
    print("The output file has countries in rows and separate columns for each Indicator_Year.")

Successfully read CSV file: /content/ESG_indicators.csv (using comma separator).
Columns found: ['Country ', 'Year', 'coastal_protection', 'control_corruption_estimate', 'economic_and_social_rights_performance_score', 'electricity_production_from_coal_sources_total', 'energy_imports_net_energy_use', 'energy_intensity_level_primary_energy_mj_2017_ppp_gdp', 'energy_use_kg_oil_equivalent_per_capita', 'fertility_rate_total_births_per_woman', 'food_production_index_2014_2016_100', 'fossil_fuel_energy_consumption_total', 'gdp_growth_annual', 'gini_index', 'government_expenditure_on_education_total_government_expenditure', 'hospital_beds_per_1_000_people', 'income_share_held_by_lowest_20', 'individuals_using_the_internet_population', 'land_surface_temperature', 'level_water_stress_freshwater_withdrawal_as_a_proportion_available_freshwater_resources', 'life_expectancy_at_birth_total_years', 'literacy_rate_adult_total_people_ages_15_and_above', 'people_using_safely_managed_drinking_water_servic

### Hedonic Price Model

### Explanation:


### File Paths and Separator: Define the input and output file names and the separator (\t for tab) at the top for easy modification.


### Load ESG Data: Reads the previously created ESG_indicators_wide_by_year.csv using the specified separator. Includes error handling.


### Load House Price Data: Reads the OECD_HOUSE_PRICES.csv. It first tries the specified separator. If it fails to find the 'Country' column (which might happen if the separator was wrong and it read the header as one column), it tries again with a comma separator as a fallback. Includes error handling.


### Clean 'Country' Columns: Uses .str.strip() on the 'Country' column in both DataFrames to remove any accidental leading or trailing spaces, which can prevent a successful merge.


### Rename House Price Columns: Identifies columns in df_house that are purely digits (likely years) and renames them to house_price_index_YEAR using a dictionary comprehension and the .rename() method. This makes the merged table clearer.


### Convert House Price Columns: Converts the newly renamed house price columns to numeric types, handling potential errors.

Merge: Performs the pd.merge() operation:

df_esg: The left DataFrame (all its rows will be kept).

df_house: The right DataFrame.

on='Country': The column to match rows on.

how='left': Specifies the type of merge.

Save Output: Saves the final df_merged DataFrame to MERGED_ESG_House_Prices.csv using the specified separator (\t) and representing missing values (NaN, which occur for countries not in the house price data) as empty strings (''). index=False prevents pandas from writing its default row index.

In [2]:
import pandas as pd
import numpy as np

# --- Configuration ---
esg_wide_file = '/content/ESG_indicators_wide_by_year.csv'  # Written by the reshaping cell
house_price_file = '/content/OECD_HOUSE_PRICES.csv'
merged_output_file = 'MERGED_ESG_House_Prices.csv'
separator = '\t'  # Used for the first read of BOTH inputs and for the output

# --- Load ESG Data ---
print(f"Loading ESG data from: {esg_wide_file}")
try:
    # The wide ESG file is expected to be tab-separated.
    df_esg = pd.read_csv(esg_wide_file, sep=separator)
except FileNotFoundError:
    print(f"Error: ESG file not found at '{esg_wide_file}'. Please ensure it exists and the path is correct.")
    exit()
except Exception as e:
    print(f"Error reading ESG file '{esg_wide_file}': {e}")
    exit()
else:
    print("ESG data loaded successfully.")
    print("ESG data shape:", df_esg.shape)

# Clean ESG Country column: padded names would not match on merge.
esg_has_country = 'Country' in df_esg.columns
if esg_has_country:
    df_esg['Country'] = df_esg['Country'].str.strip()
else:
    print(f"Error: 'Country' column not found in {esg_wide_file}")
    exit()

# --- Load House Price Data ---
# Read with the configured separator first. If that does not produce a
# 'Country' column (a wrong separator yields one wide column rather than an
# error), read again with a comma.
print(f"\nLoading House Price data from: {house_price_file}")
try:
    df_house = pd.read_csv(house_price_file, sep=separator)
    print("House Price data loaded successfully.")
    print("House Price data shape:", df_house.shape)
    # Strip header padding BEFORE looking for 'Country'; otherwise a header
    # such as 'Country ' is rejected even though the separator was right.
    df_house.columns = df_house.columns.str.strip()

    if 'Country' in df_house.columns:
        df_house['Country'] = df_house['Country'].str.strip()
        print("Cleaned 'Country' column in house price data.")
    else:
        print(f"Error: 'Country' column not found in {house_price_file}")
        print("Attempting to read house price file with comma separator...")
        df_house = pd.read_csv(house_price_file, sep=',')
        # Same normalisation for the fallback read, again before the check.
        df_house.columns = df_house.columns.str.strip()
        if 'Country' in df_house.columns:
            print("Read successful with comma separator.")
            df_house['Country'] = df_house['Country'].str.strip()
        else:
            print("Error: Could not find 'Country' column even with comma separator.")
            exit()

except FileNotFoundError:
    print(f"Error: House Price file not found at '{house_price_file}'.")
    exit()
except Exception as e:
    print(f"Error reading House Price file '{house_price_file}': {e}")
    exit()


# --- Prepare House Price Data for Merge ---
# Give the bare year columns a descriptive prefix so they cannot be confused
# with ESG columns after the merge.
year_columns_house = [col for col in df_house.columns if col.isdigit()]  # All-digit headers are taken to be years
rename_dict = {year: f"house_price_index_{year}" for year in year_columns_house}
df_house.rename(columns=rename_dict, inplace=True)
# Guard the example lookup: with no year columns the dict is empty and
# indexing [0] would raise IndexError.
example_renamed = next(iter(rename_dict.values()), 'None')
print(f"Renamed {len(rename_dict)} year columns in house price data (e.g., {example_renamed}).")

# Convert renamed year columns to numeric; unparseable cells become NaN.
for new_col_name in rename_dict.values():
    if new_col_name in df_house.columns:
        df_house[new_col_name] = pd.to_numeric(df_house[new_col_name], errors='coerce')


# --- Merge DataFrames ---
# Left join: every ESG row is kept; countries without house price data get
# NaN in the house price columns.
print("\nMerging datasets on 'Country' column (how='left')...")
try:
    df_merged = df_esg.merge(df_house, on='Country', how='left')
except KeyError:
    print("Error: Merge failed. Ensure 'Country' column exists and is named identically in both files after cleaning.")
    exit()
except Exception as e:
    print(f"An error occurred during merge: {e}")
    exit()
else:
    print("Merge successful.")
    print("Merged data shape:", df_merged.shape)

# --- Optional: Display Sample of Merged Data ---
print("\nHead of Merged DataFrame (showing Country and some house price columns):")
house_cols_201x = [name for name in df_merged.columns if name.startswith('house_price_index_201')]
cols_to_display = ['Country', *house_cols_201x]
# Cap the number of printed columns for readability.
max_cols_print = 15
if len(cols_to_display) > max_cols_print:
    print(f"(Displaying first {max_cols_print} relevant columns)")
    cols_to_display = cols_to_display[:max_cols_print]

try:
    preview = df_merged[cols_to_display].head()
    print(preview)
except KeyError:
    print("Could not display the specific house price columns, showing generic head.")
    print(df_merged.head())


# --- Save Merged Data ---
# Written with the configured separator; missing values become empty fields.
try:
    df_merged.to_csv(merged_output_file, sep=separator, na_rep='', index=False)
except Exception as e:
    print(f"\nError saving the merged file '{merged_output_file}': {e}")
else:
    print(f"\nSuccessfully merged data and saved to '{merged_output_file}'")

Loading ESG data from: /content/ESG_indicators_wide_by_year.csv
ESG data loaded successfully.
ESG data shape: (47, 545)

Loading House Price data from: /content/OECD_HOUSE_PRICES.csv
House Price data loaded successfully.
House Price data shape: (47, 1)
Error: 'Country' column not found in /content/OECD_HOUSE_PRICES.csv
Attempting to read house price file with comma separator...
Read successful with comma separator.
Renamed 16 year columns in house price data (e.g., house_price_index_2009).

Merging datasets on 'Country' column (how='left')...
Merge successful.
Merged data shape: (47, 580)

Head of Merged DataFrame (showing Country and some house price columns):
     Country  house_price_index_2010  house_price_index_2011  house_price_index_2012  house_price_index_2013  house_price_index_2014  house_price_index_2015  house_price_index_2016  house_price_index_2017  house_price_index_2018  house_price_index_2019
0  Australia                    89.0                    85.2                 

### save merged dataset as .xlsx
### pip install openpyxl

In [4]:
import pandas as pd
import numpy as np
# We don't need the 'io' module if we are reading directly from files

# --- Configuration ---
esg_wide_file = 'ESG_indicators_wide_by_year.csv'  # Written by the reshaping cell
house_price_file = '/content/OECD_HOUSE_PRICES.csv'
merged_output_file = 'MERGED_ESG_House_Prices.xlsx'  # Excel output
separator = '\t'  # Delimiter of the INPUT ESG file

# --- Load ESG Data ---
print(f"Loading ESG data from: {esg_wide_file}")
try:
    # The wide ESG file is expected to be tab-separated.
    df_esg = pd.read_csv(esg_wide_file, sep=separator)
except FileNotFoundError:
    print(f"Error: ESG file not found at '{esg_wide_file}'. Please ensure it exists and the path is correct.")
    exit()
except Exception as e:
    print(f"Error reading ESG file '{esg_wide_file}': {e}")
    exit()
else:
    print("ESG data loaded successfully.")
    print("ESG data shape:", df_esg.shape)

# Clean ESG Country column so padded names still match on merge.
country_present = 'Country' in df_esg.columns
if country_present:
    df_esg['Country'] = df_esg['Country'].str.strip()
else:
    print(f"Error: 'Country' column not found in {esg_wide_file}")
    exit()

# --- Load House Price Data ---
print(f"\nLoading House Price data from: {house_price_file}")
try:
    # Accept a comma or a tab as delimiter, with optional spaces around it.
    # - Raw string: avoids invalid-escape warnings for the regex.
    # - No '|' inside the class: within [...] it is a literal pipe, not
    #   alternation, so it would have made '|' a delimiter as well.
    # - ' *' rather than '\s*': '\s' also matches tabs, so two adjacent tabs
    #   (an empty field) would be swallowed as a single delimiter.
    df_house = pd.read_csv(house_price_file, sep=r' *[,\t] *', engine='python')
    print("House Price data loaded successfully.")
    print("House Price data shape:", df_house.shape)
    print("House Price columns found:", df_house.columns.tolist())

    # Clean House Price Country column
    df_house.columns = df_house.columns.str.strip()  # Strip header padding before the name check
    print("House Price columns after stripping:", df_house.columns.tolist())
    if 'Country' in df_house.columns:
        df_house['Country'] = df_house['Country'].str.strip()
        print("Cleaned 'Country' column in house price data.")
    else:
        print(f"Error: 'Country' column not found in {house_price_file}. Columns are: {df_house.columns.tolist()}")
        exit()

except FileNotFoundError:
    print(f"Error: House Price file not found at '{house_price_file}'.")
    exit()
except Exception as e:
    print(f"Error reading House Price file '{house_price_file}': {e}")
    exit()


# --- Prepare House Price Data for Merge ---
# Prefix the bare year headers so they stay distinguishable after the merge.
year_columns_house = [name for name in df_house.columns if name.isdigit()]  # All-digit headers are taken to be years
rename_dict = {}
for year in year_columns_house:
    rename_dict[year] = f"house_price_index_{year}"
df_house.rename(columns=rename_dict, inplace=True)
# Report the first renamed column, or 'None' when nothing was renamed.
first_renamed = next(iter(rename_dict.values()), 'None')
print(f"Renamed {len(rename_dict)} year columns in house price data (e.g., {first_renamed}).")

# Convert renamed year columns to numeric; unparseable cells become NaN.
for new_col_name in rename_dict.values():
    if new_col_name not in df_house.columns:
        continue
    df_house[new_col_name] = pd.to_numeric(df_house[new_col_name], errors='coerce')


# --- Merge DataFrames ---
# Left join on 'Country': all ESG rows survive, unmatched house price columns
# are NaN.
print("\nMerging datasets on 'Country' column (how='left')...")
try:
    df_merged = df_esg.merge(df_house, how='left', on='Country')
except KeyError:
    print("Error: Merge failed. Ensure 'Country' column exists and is named identically in both files after cleaning.")
    exit()
except Exception as e:
    print(f"An error occurred during merge: {e}")
    exit()
else:
    print("Merge successful.")
    print("Merged data shape:", df_merged.shape)

# --- Optional: Display Sample of Merged Data ---
print("\nHead of Merged DataFrame (showing Country and some house price columns):")
cols_to_display = ['Country']
cols_to_display.extend(
    name for name in df_merged.columns if name.startswith('house_price_index_201')
)
# Cap the number of printed columns for readability.
max_cols_print = 15
if len(cols_to_display) > max_cols_print:
    print(f"(Displaying first {max_cols_print} relevant columns)")
    cols_to_display = cols_to_display[:max_cols_print]

try:
    # Keep only the requested columns that are present in the merged frame.
    existing_cols_to_display = [name for name in cols_to_display if name in df_merged.columns]
    if existing_cols_to_display in ([], ['Country']):
        print("Selected columns for display not found/available. Showing generic head.")
        print(df_merged.head())
    else:
        print(df_merged[existing_cols_to_display].head())

except KeyError as e:
    print(f"\nWarning: Could not display selected columns due to error: {e}")
    print("Showing available columns' head instead:")
    print(df_merged.head())
except Exception as e:
    print(f"\nAn unexpected error occurred during display: {e}")
    print("Showing available columns' head instead:")
    print(df_merged.head())


# --- Save Merged Data to Excel ---
# Requires the openpyxl engine (pip install openpyxl).
print(f"\nAttempting to save merged data to Excel file: {merged_output_file}")
excel_options = {
    'sheet_name': 'Merged_Data',  # Name of the worksheet
    'na_rep': '',                 # Missing values are written as empty cells
    'index': False,               # Do not write the DataFrame index
    'engine': 'openpyxl',         # Writer engine
}
try:
    df_merged.to_excel(merged_output_file, **excel_options)
except ImportError:
    print("\nError saving to Excel: The 'openpyxl' library is required.")
    print("Please install it using: pip install openpyxl")
except Exception as e:
    print(f"\nError saving the Excel file '{merged_output_file}': {e}")
else:
    print(f"\nSuccessfully merged data and saved to '{merged_output_file}'")

Loading ESG data from: ESG_indicators_wide_by_year.csv
ESG data loaded successfully.
ESG data shape: (47, 545)

Loading House Price data from: /content/OECD_HOUSE_PRICES.csv
House Price data loaded successfully.
House Price data shape: (47, 36)
House Price columns found: ['Country', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024', 'Country.1', 'voice_and_accountability_estimate_2009', 'voice_and_accountability_estimate_2010', 'voice_and_accountability_estimate_2011', 'voice_and_accountability_estimate_2012', 'voice_and_accountability_estimate_2013', 'voice_and_accountability_estimate_2014', 'voice_and_accountability_estimate_2015', 'voice_and_accountability_estimate_2016', 'voice_and_accountability_estimate_2017', 'voice_and_accountability_estimate_2018', 'voice_and_accountability_estimate_2019', 'voice_and_accountability_estimate_2020', 'voice_and_accountability_estimate_2021', 'voice_and_accountability_est

### Explanation and Important Considerations:

### Loading: The code now correctly loads the Excel file using pd.read_excel.

### Preprocessing:

### Country Index: Setting 'Country' as the index is convenient for pandas operations.

### Variable Separation: Clearly defines dependent (house_price_index_*) and potential independent variables.

### Numeric Filtering: Ensures only numeric columns are initially considered for imputation and VIF.

### Imputation: Uses SimpleImputer with strategy='median'. This is a robust starting point. Consideration: For time-series data like this, more sophisticated methods (like forward/backward fill, interpolation, or model-based imputation like KNNImputer/IterativeImputer) might be better but add complexity. Median imputation across all features might smooth out temporal patterns within an indicator.

### VIF Calculation: Calculates VIF scores using statsmodels. It adds a constant as required for VIF calculation.

### Addressing Multicollinearity: The code now calculates VIF and identifies features above the threshold. Crucially, it currently doesn't automatically drop them (df_X_processed = df_X_imputed.copy()). Dropping requires careful consideration. High VIF between indicator_YEAR1 and indicator_YEAR2 is expected and not necessarily problematic if you model years separately. High VIF between different indicators within the same year is more concerning for standard OLS interpretation. Regularized regression (Ridge, Lasso) is often a better way to handle multicollinearity without losing variables.

### Hedonic Regression (Year-by-Year):

### This is a simplification to demonstrate the core idea. It loops through each house_price_index_YYYY column.

### For each house price year YYYY, it selects only the independent variables from the same year (columns ending in _YYYY). This assumes a contemporaneous relationship for this baseline model.

### It runs a standard OLS regression using statsmodels.

### It prints key summary statistics for each year's model.

### Limitations: This ignores the time-series/panel nature of the data. House prices in one year heavily depend on the previous year. ESG factors might have lagged effects. A more advanced analysis would use panel data models (Fixed Effects, Random Effects, Pooled OLS with clustering) or time-series models (VAR, VECM if applicable).

### Advanced Framework Outlines:

### The code provides textual descriptions and conceptual snippets for the other frameworks.

### AVM/Uncertainty: Highlights the need for predictive models (like RF, GBT) and specific uncertainty techniques (conformal, quantile). Shows a basic RF example but notes the need for proper feature engineering (lags) and uncertainty methods.

### DSS: Explains the concept of MCDA (AHP, scoring) and weighting factors.

### Positive Impact: Focuses on linking specific ESG metrics to financial outcomes.

### Spatial Models: Mentions the need for geographic data and lists relevant model types (SAR, SEM, GWR) and the pysal library.

### Saving Processed Data: Saves the imputed DataFrame (df_X_processed) combined with the original dependent variables (df_Y) to a new Excel file. This is useful for subsequent, more focused modeling steps.

In [5]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence noisy library warnings so the cell output stays readable.
# NOTE: the divide-by-zero messages that show up during the VIF step in the
# recorded output are NOT silenced by these two filters.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- Configuration ---
merged_file = 'MERGED_ESG_House_Prices.xlsx'
output_processed_file = 'PROCESSED_MERGED_ESG_House_Prices.xlsx'
vif_threshold = 10 # Common threshold for VIF, adjust as needed

# --- 1. Load Data ---
# On failure we stop by raising SystemExit instead of calling exit():
# exit() is the interactive helper and is not a dependable way to halt a
# notebook cell, whereas raising stops execution right here so the code below
# never runs against a missing `df_merged`. The exit status is 1 (error).
print(f"Loading merged data from: {merged_file}")
try:
    df_merged = pd.read_excel(merged_file, engine='openpyxl')
except FileNotFoundError:
    print(f"Error: Merged file not found at '{merged_file}'.")
    raise SystemExit(1)
except Exception as e:
    print(f"Error reading Excel file '{merged_file}': {e}")
    raise SystemExit(1)
else:
    print("Merged data loaded successfully.")
    print("Original data shape:", df_merged.shape)

# --- 2. Preprocessing ---

# Index rows by country when that column is available; otherwise keep the
# default index and carry on.
if 'Country' not in df_merged.columns:
    print("Warning: 'Country' column not found. Proceeding without setting index.")
else:
    df_merged.set_index('Country', inplace=True)
    print("Set 'Country' as index.")

# Partition the columns: house-price columns are the targets (Y), every other
# column is a candidate predictor (X).
dep_var_prefix = 'house_price_index_'
is_target_col = [name.startswith(dep_var_prefix) for name in df_merged.columns]
dependent_vars_cols = [name for name, flag in zip(df_merged.columns, is_target_col) if flag]
independent_vars_cols = [name for name, flag in zip(df_merged.columns, is_target_col) if not flag]

print(f"\nIdentified {len(dependent_vars_cols)} dependent variables (starting with '{dep_var_prefix}')")
print(f"Identified {len(independent_vars_cols)} potential independent variables.")

# Targets are copied untouched; predictors are restricted to numeric dtypes.
df_Y = df_merged.loc[:, dependent_vars_cols].copy()
df_X_numeric = df_merged.loc[:, independent_vars_cols].select_dtypes(include=np.number)

print(f"Filtered to {df_X_numeric.shape[1]} numeric independent variables.")

# --- Imputation ---
# Fill gaps in the numeric predictors with the per-column median.
print("\n--- Imputation ---")
missing_before = df_X_numeric.isnull().sum().sum()
print(f"Missing values in numeric features BEFORE imputation: {missing_before}")

if missing_before > 0:
    # Columns with no observed value have no median. SimpleImputer silently
    # discards such columns by default, which would make its output narrower
    # than `df_X_numeric.columns` and break the DataFrame reconstruction below,
    # so they are removed explicitly (and reported) BEFORE imputing.
    cols_all_nan = df_X_numeric.columns[df_X_numeric.isna().all()].tolist()
    if cols_all_nan:
        print(f"Dropping columns that are entirely NaN: {cols_all_nan}")
    df_X_to_impute = df_X_numeric.drop(columns=cols_all_nan)

    if df_X_to_impute.shape[1] > 0:
        # Using Median Imputation (more robust to outliers than mean)
        imputer = SimpleImputer(strategy='median')
        df_X_imputed_np = imputer.fit_transform(df_X_to_impute)
        # Convert back to DataFrame, keeping the surviving column names and index
        df_X_imputed = pd.DataFrame(
            df_X_imputed_np,
            columns=df_X_to_impute.columns,
            index=df_X_to_impute.index,
        )
    else:
        # Nothing left to impute (every numeric column was empty).
        df_X_imputed = df_X_to_impute.copy()

    missing_after = df_X_imputed.isnull().sum().sum()
    print(f"Missing values in numeric features AFTER imputation: {missing_after}")
    if missing_after > 0:
        print("Warning: Imputation did not fill all missing values. Check for columns with all NaNs.")
else:
    print("No missing values found in numeric features.")
    df_X_imputed = df_X_numeric.copy() # Use original if no imputation needed

# --- Multicollinearity Check ---
print("\n--- Multicollinearity Check (VIF) ---")

# VIF is diagnostic only in this script: every feature is kept whatever the
# outcome, so the processed frame is defined once, up front, and stays valid
# even if the VIF computation below fails.
df_X_processed = df_X_imputed.copy()

# VIF requires adding a constant term
X_vif = sm.add_constant(df_X_imputed)

n_obs, n_cols = X_vif.shape
if n_cols > n_obs:
    # With at least as many other regressors as rows, each auxiliary
    # regression can fit perfectly (R^2 = 1), which makes the VIF infinite.
    print(f"Warning: {n_cols} columns (incl. constant) but only {n_obs} observations; "
          "VIF values will be infinite/uninformative.")

# Calculate VIF - can be slow for many features
try:
    vif_data = pd.DataFrame()
    vif_data["feature"] = X_vif.columns
    print("Calculating VIF (this might take a while for many features)...")
    design_matrix = X_vif.values
    # R^2 == 1 leads to 1 / (1 - R^2) = inf; the result is still reported as
    # inf, only the accompanying divide-by-zero warnings are muted.
    with np.errstate(divide='ignore', invalid='ignore'):
        vif_data["VIF"] = [variance_inflation_factor(design_matrix, i)
                           for i in range(design_matrix.shape[1])]
    vif_data = vif_data[vif_data['feature'] != 'const'] # Remove constant
    vif_data = vif_data.sort_values(by='VIF', ascending=False)
    print("\nTop 15 VIF Scores:")
    print(vif_data.head(15))

    # Identify high VIF features
    high_vif_features = vif_data[vif_data['VIF'] > vif_threshold]['feature'].tolist()
    if high_vif_features:
        print(f"\nFeatures with VIF > {vif_threshold}: {len(high_vif_features)}")
        # print(high_vif_features) # Uncomment to see all high VIF features

        # --- Address Multicollinearity (Example: Dropping) ---
        # WARNING: Automatically dropping features based solely on VIF is often too simplistic.
        # Domain knowledge should guide this. Other methods include PCA or regularization.
        # To actually drop them, replace the processed frame like so:
        #   df_X_processed = df_X_imputed.drop(columns=high_vif_features)
        print(f"\nExample: Potential features to drop based on VIF > {vif_threshold}: {len(high_vif_features)}")
        print("Proceeding WITH all features for now. Consider addressing high VIF if model results are unstable.")
    else:
        print(f"\nNo features found with VIF > {vif_threshold}. Multicollinearity seems acceptable.")

except Exception as e:
    print(f"\nError calculating VIF: {e}. Skipping VIF analysis.")

# --- Save Processed Data (Optional but Recommended) ---
# Write targets (Y) side by side with the processed predictors (X); the index
# is turned back into an ordinary column for the export.
try:
    df_to_save = pd.concat((df_Y, df_X_processed), axis=1)
    flat_frame = df_to_save.reset_index()
    flat_frame.to_excel(output_processed_file, index=False, engine='openpyxl', na_rep='')
except Exception as e:
    print(f"\nError saving processed data: {e}")
else:
    print(f"\nSaved imputed (and potentially VIF-reduced) data to '{output_processed_file}'")


# --- 3. Modeling ---

print("\n--- Hedonic Price Model Regression (Year-by-Year Example) ---")
# We'll run a separate regression for each house price year
# using ESG/other features from the SAME year as predictors.

# Maps year (int) -> statsmodels summary object of that year's OLS fit.
results_summary = {}

for dep_var in dependent_vars_cols:
    # Parse the year suffix BEFORE entering the main try block: the handlers
    # below mention `year`, which would otherwise be unbound (first iteration)
    # or left over from the previous iteration when parsing fails.
    year_str = dep_var.rsplit('_', 1)[-1]
    try:
        year = int(year_str)
    except ValueError:
        print(f"\n  Skipping {dep_var}: could not parse a year from suffix '{year_str}'.")
        continue

    print(f"\n--- Running Regression for {dep_var} (Year {year}) ---")

    try:
        # Select target variable for this year, dropping rows where it's missing
        y = df_Y[dep_var].dropna()

        # Select corresponding independent variables for the SAME year
        # Filter df_X_processed for columns ending with the current year
        x_cols_this_year = [col for col in df_X_processed.columns if col.endswith(f'_{year}')]

        if not x_cols_this_year:
            print(f"  Skipping {year}: No independent variables found for this year.")
            continue

        # Align X with the non-missing rows of Y for this year
        X_this_year = df_X_processed.loc[y.index, x_cols_this_year] # Align rows using y's index

        # Add constant for the intercept term
        X_this_year_const = sm.add_constant(X_this_year)

        # Check for sufficient data: at least one residual degree of freedom is
        # required (rows > columns, where columns already include the intercept).
        n_rows, n_params = X_this_year_const.shape
        if n_rows < n_params + 1:
            print(f"  Skipping {year}: Insufficient data points ({n_rows}) for the number of predictors ({n_params}).")
            continue
        if n_rows < 5: # Arbitrary minimum sample size
            print(f"  Skipping {year}: Sample size too small ({n_rows}).")
            continue

        # Fit OLS model
        model = sm.OLS(y, X_this_year_const)
        results = model.fit()

        print(f"  Regression Summary for {year}:")
        # Print key results instead of the full summary for brevity in a loop
        print(f"    R-squared: {results.rsquared:.4f}")
        print(f"    Adj. R-squared: {results.rsquared_adj:.4f}")
        print(f"    F-statistic p-value: {results.f_pvalue:.4g}")
        # Store results if needed
        results_summary[year] = results.summary()
        # print(results.summary()) # Uncomment for full summary per year

    except ValueError as ve:
        print(f"  Skipping {year} due to ValueError (likely data issue): {ve}")
    except Exception as e:
        print(f"  An error occurred during regression for {year}: {e}")

# Textual outline of follow-up frameworks. Each section is emitted as one
# newline-joined block; the commented snippets are conceptual pseudo-code only.
print("\n--- Outline for Advanced Frameworks ---")

# ** A. Uncertainty Quantification in AVMs **
print("\n".join((
    "\n1. AVM with Uncertainty Quantification:",
    "   - Approach: Train advanced models (RandomForest, GradientBoosting, Neural Nets) to predict house prices.",
    "   - Features: Use lagged ESG features (e.g., ESG_2014 to predict HPI_2015), potentially macroeconomic data.",
    "   - Uncertainty: Apply methods like:",
    "     - Conformal Prediction (using libraries like `nonconformist`)",
    "     - Quantile Regression (e.g., `statsmodels.regression.quantile_regression.QuantReg` or LightGBM/XGBoost with quantile loss)",
    "     - Bootstrapping prediction intervals.",
    "     - Ensemble methods (like Random Forest inherently provide variance estimates across trees).",
)))
# Example Snippet (Conceptual - requires feature engineering for lags):
# pseudo-code
# year_to_predict = 2022
# y_target = df_merged[f'house_price_index_{year_to_predict}']
# x_features = df_processed[[col for col in df_processed.columns if col.endswith(f'_{year_to_predict-1}')]] # Example: Use lagged features
# x_features = x_features.loc[y_target.dropna().index].dropna() # Align and drop NaNs in features
# y_target = y_target.loc[x_features.index]
# X_train, X_test, y_train, y_test = train_test_split(x_features, y_target, test_size=0.2, random_state=42)
# rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
# rf.fit(X_train, y_train)
# predictions = rf.predict(X_test)
# mse = mean_squared_error(y_test, predictions)
# print(f'   Example RF MSE for {year_to_predict}: {mse:.4f}')
# print('   Next steps: Implement conformal prediction or quantile regression for intervals.')

# ** B. Model-Driven DSS **
print("\n".join((
    "\n2. Model-Driven Decision Support System (DSS):",
    "   - Approach: Combine model outputs (Hedonic/AVM predictions, uncertainty) with ESG scores/ratings.",
    "   - Methods: Use Multi-Criteria Decision Analysis (MCDA):",
    "     - Analytic Hierarchy Process (AHP): Define criteria (price appreciation potential, ESG score, risk level), establish weights (expert input/survey), score alternatives (countries/regions).",
    "     - Scoring Models: Develop a composite score based on weighted financial and ESG factors.",
    "   - Goal: Rank investment opportunities (countries/regions) based on both financial and ESG performance/risk.",
)))
# Example Snippet (Conceptual):
# pseudo-code
# criteria = ['Predicted Price Growth', 'ESG Governance Score', 'Climate Risk Score', 'Prediction Uncertainty']
# weights = {'Predicted Price Growth': 0.4, 'ESG Governance Score': 0.2, 'Climate Risk Score': 0.2, 'Prediction Uncertainty': -0.2} # Example weights
# # Normalize model outputs and ESG scores for each country
# # Calculate weighted score for each country
# # Rank countries based on the final DSS score

# ** C. Positive Impact Framework **
print("\n".join((
    "\n3. Positive Impact Framework:",
    "   - Approach: Assess investments based on alignment with specific ESG goals (e.g., energy efficiency targets, social housing provision).",
    "   - Metrics: Use specific ESG columns (e.g., 'renewable_energy_consumption_...', 'government_expenditure_on_education_...') alongside financial data.",
    "   - Analysis: Correlate house price changes with improvements in specific ESG metrics over time. Identify investments that yield both financial return and positive ESG impact.",
)))
# Example Snippet (Conceptual):
# pseudo-code
# # Calculate % change in house price vs % change in 'renewable_energy_consumption_...' for countries
# # Visualize relationship, potentially using panel data regression if data structure allows.

# ** D. Spatially Enhanced Hedonic Regression **
print("\n".join((
    "\n4. Spatially Enhanced Hedonic Regression:",
    "   - Requirement: Needs additional *spatial data* (e.g., latitude/longitude for country centroids, regional identifiers, neighborhood data if available).",
    "   - Approach: Incorporate spatial lags or spatial error terms into the regression models.",
    "   - Models:",
    "     - Spatial Lag Model (SAR): Assumes price is influenced by prices in neighboring locations.",
    "     - Spatial Error Model (SEM): Assumes unobserved spatial factors are correlated across locations.",
    "     - Geographically Weighted Regression (GWR): Allows coefficients to vary across space (shows localized ESG impacts).",
    "   - Libraries: `pysal` (Python Spatial Analysis Library).",
)))
# Example Snippet (Conceptual - requires spatial data `w`):
# pseudo-code
# from pysal.model import spreg
# # Assume 'w' is a spatial weights matrix created from geographic data
# # Assume y and X are prepared for a specific year
# # model_sar = spreg.ML_Lag(y, X, w=w, name_y=dep_var, name_x=X.columns.tolist())
# # print(model_sar.summary)


print("\n--- Analysis Complete ---")

Loading merged data from: MERGED_ESG_House_Prices.xlsx
Merged data loaded successfully.
Original data shape: (47, 574)
Set 'Country' as index.

Identified 16 dependent variables (starting with 'house_price_index_')
Identified 557 potential independent variables.
Filtered to 557 numeric independent variables.

--- Imputation ---
Missing values in numeric features BEFORE imputation: 1852
Missing values in numeric features AFTER imputation: 0

--- Multicollinearity Check (VIF) ---
Calculating VIF (this might take a while for many features)...


  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)



Top 15 VIF Scores:
                                               feature  VIF
556             voice_and_accountability_estimate_2022  inf
4               voice_and_accountability_estimate_2012  inf
538  school_enrollment_primary_and_secondary_gross_...  inf
537  school_enrollment_primary_and_secondary_gross_...  inf
536  school_enrollment_primary_and_secondary_gross_...  inf
535  school_enrollment_primary_and_secondary_gross_...  inf
534  school_enrollment_primary_and_secondary_gross_...  inf
533  school_enrollment_primary_and_secondary_gross_...  inf
532  school_enrollment_primary_and_secondary_gross_...  inf
531  school_enrollment_primary_and_secondary_gross_...  inf
530  school_enrollment_primary_and_secondary_gross_...  inf
529  school_enrollment_primary_and_secondary_gross_...  inf
528  school_enrollment_primary_and_secondary_gross_...  inf
527  school_enrollment_primary_and_secondary_gross_...  inf
526  school_enrollment_primary_and_secondary_gross_...  inf

Features with VIF >

  return 1 - self.ssr/self.centered_tss


### Panel Data Regression and AVM

In [12]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from linearmodels.panel import PanelOLS, RandomEffects, PooledOLS
from linearmodels.panel import compare # For Hausman test
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time # To time potentially long operations

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
# Ignore specific linearmodels warnings if they appear frequently
warnings.filterwarnings('ignore', module='linearmodels.*')

# --- Configuration ---
# INPUT FILES
original_esg_file = '/content/ESG_indicators.csv' # Original LONG format ESG data
house_price_file = '/content/OECD_HOUSE_PRICES.csv' # WIDE format house price data
# OUTPUT FILES
panel_data_file = 'PANEL_ESG_House_Prices.xlsx'
merged_file_check = 'MERGED_CHECK_ESG_House_Prices.xlsx' # Intermediate check file

vif_threshold = 10 # Common threshold for VIF

# --- 1. Load & Prepare Original ESG Data ---
# Produces `df_esg_long`: one row per (Country, Year) with numeric indicators.
# Failures stop the cell via SystemExit(1) rather than exit(), which is the
# interactive helper and not a dependable way to halt a notebook cell.
print(f"Loading original ESG data from: {original_esg_file}")
try:
    # Flexible separator (comma, pipe or tab, with optional surrounding
    # whitespace). Raw string so that `\s` is a regex class instead of an
    # invalid Python string escape.
    df_esg_long = pd.read_csv(original_esg_file, sep=r'\s*[,|\t]\s*', engine='python')
    df_esg_long.columns = df_esg_long.columns.str.strip() # Clean column names
    print("Original ESG data loaded. Shape:", df_esg_long.shape)
    print("ESG columns:", df_esg_long.columns.tolist())

    # Ensure essential columns exist and clean Country
    if 'Country' not in df_esg_long.columns or 'Year' not in df_esg_long.columns:
        raise ValueError("Missing 'Country' or 'Year' column in ESG data.")
    df_esg_long['Country'] = df_esg_long['Country'].str.strip()

    # Convert Year to numeric integer. The coerced values are stored first, so
    # the integer cast works on numbers (not on the raw strings), and the
    # filtered frame is an explicit copy rather than a slice of the original.
    df_esg_long['Year'] = pd.to_numeric(df_esg_long['Year'], errors='coerce')
    df_esg_long = df_esg_long.dropna(subset=['Year']).copy()
    df_esg_long['Year'] = df_esg_long['Year'].astype(int)

    # Convert indicator columns to numeric (unparseable entries become NaN)
    indicator_cols = [col for col in df_esg_long.columns if col not in ['Country', 'Year']]
    for col in indicator_cols:
        df_esg_long[col] = pd.to_numeric(df_esg_long[col], errors='coerce')

except FileNotFoundError:
    print(f"Error: Original ESG file not found at '{original_esg_file}'.")
    raise SystemExit(1)
except Exception as e:
    print(f"Error reading original ESG file '{original_esg_file}': {e}")
    raise SystemExit(1)

# --- 2. Load & Reshape House Price Data ---
# Produces `df_house_long`: one row per (Country, Year) with `house_price_index`.
# Failures stop the cell via SystemExit(1) rather than exit(), which is the
# interactive helper and not a dependable way to halt a notebook cell.
print(f"\nLoading House Price data from: {house_price_file}")
try:
    # Raw string so that `\s` is a regex class, not an invalid string escape.
    df_house_wide = pd.read_csv(house_price_file, sep=r'\s*[,|\t]\s*', engine='python')
    df_house_wide.columns = df_house_wide.columns.str.strip()
    print("House Price data loaded. Shape:", df_house_wide.shape)

    if 'Country' not in df_house_wide.columns:
        raise ValueError("Missing 'Country' column in House Price data.")
    df_house_wide['Country'] = df_house_wide['Country'].str.strip()

    # Identify year columns (headers consisting only of digits)
    year_cols = [col for col in df_house_wide.columns if col.isdigit()]
    print(f"Found year columns in house price data: {year_cols}")
    if not year_cols:
        # Fail here with a clear message instead of producing an empty frame
        # that only surfaces later as an empty merge.
        raise ValueError("No year columns (all-digit headers) found in House Price data.")

    # Melt from wide to long
    df_house_long = pd.melt(df_house_wide,
                            id_vars=['Country'],
                            value_vars=year_cols,
                            var_name='Year',
                            value_name='house_price_index')

    # Convert Year and house_price_index to numeric, drop rows where either
    # conversion failed, and only then cast Year to int (casting NaN to int
    # would raise).
    df_house_long['Year'] = pd.to_numeric(df_house_long['Year'], errors='coerce')
    df_house_long['house_price_index'] = pd.to_numeric(df_house_long['house_price_index'], errors='coerce')
    df_house_long = df_house_long.dropna(subset=['Year', 'house_price_index']).copy()
    df_house_long['Year'] = df_house_long['Year'].astype(int)
    print("House price data reshaped to long format. Shape:", df_house_long.shape)

except FileNotFoundError:
    print(f"Error: House Price file not found at '{house_price_file}'.")
    raise SystemExit(1)
except Exception as e:
    print(f"Error reading or reshaping House Price file '{house_price_file}': {e}")
    raise SystemExit(1)

# --- 3. Merge Datasets into Panel Format ---
# Inner-join ESG indicators and house prices on (Country, Year); only
# country-years present in BOTH sources survive.
# Failures stop the cell via SystemExit(1) rather than exit(), which is the
# interactive helper and not a dependable way to halt a notebook cell.
print("\nMerging ESG and House Price data into panel format...")
try:
    df_panel = pd.merge(df_esg_long, df_house_long, on=['Country', 'Year'], how='inner')
except Exception as e:
    print(f"Error merging dataframes: {e}")
    raise SystemExit(1)

print("Merge successful. Panel data shape:", df_panel.shape)
if df_panel.empty:
    print("Error: Merge resulted in an empty DataFrame. Check Country/Year matching.")
    raise SystemExit(1)

# Save intermediate merged file for checking if needed
# df_panel.to_excel(merged_file_check, index=False, engine='openpyxl')
# print(f"Saved intermediate merged data to {merged_file_check}")

# --- 4. Preprocessing for Panel Models ---
print("\n--- Preprocessing Panel Data ---")

# Set MultiIndex (entity = Country, time = Year), sorted so that rows within a
# country are in chronological order.
df_panel = df_panel.set_index(['Country', 'Year']).sort_index()

# Identify dependent variable
dep_var = 'house_price_index'

# --- Imputation (Grouped Median) ---
missing_before = df_panel.isnull().sum().sum()
print(f"Missing values BEFORE imputation: {missing_before}")

# The dependent variable is never imputed: rows without it are dropped.
df_panel = df_panel.dropna(subset=[dep_var])

# Features with no observed value at all have no median. They are removed up
# front (and reported) so the imputed frame always keeps the same columns as
# its input; a median imputer would otherwise discard them silently and the
# column labels would no longer line up.
all_nan_mask = df_panel.isna().all()
cols_all_nan = [col for col in df_panel.columns[all_nan_mask] if col != dep_var]
if cols_all_nan:
    print(f"Columns entirely NaN: {cols_all_nan}")
    df_panel = df_panel.drop(columns=cols_all_nan)

indep_vars = [col for col in df_panel.columns if col != dep_var]

if missing_before > 0:
    print("Applying median imputation within each country group...")
    # Important: Group by country FIRST, then impute
    features_imputed = (
        df_panel[indep_vars]
        .groupby(level='Country')
        .transform(lambda s: s.fillna(s.median()))
    )
    # NaNs remain when a country has no observation at all for a feature
    missing_after_group = features_imputed.isnull().sum().sum()
    if missing_after_group > 0:
        print(f"Missing values after GROUPED imputation: {missing_after_group}. Applying global median imputation for remaining NaNs.")
        # Global median of the group-imputed data fills whatever is left
        features_imputed = features_imputed.fillna(features_imputed.median())

    # Reassemble in the original column order
    df_panel_imputed = df_panel.copy()
    df_panel_imputed[indep_vars] = features_imputed

    missing_after = df_panel_imputed.isnull().sum().sum()
    print(f"Missing values AFTER imputation: {missing_after}")
    if missing_after > 0:
        print("Warning: NaNs remain after imputation. Dropping rows with NaN in dependent var or all NaNs in features.")
        df_panel_imputed = df_panel_imputed.dropna(subset=[dep_var])
else:
    print("No missing values found.")
    df_panel_imputed = df_panel.copy()

# --- Feature Engineering (Lags) ---
print("\nCreating lagged variables (for AVM / dynamic models)...")
# Lag the dependent variable and every independent variable by one period
# within each country. All lag columns are built in one go and concatenated
# once instead of being inserted one by one.
lag_source_cols = [dep_var] + [col for col in indep_vars if col in df_panel_imputed.columns]
lagged = (
    df_panel_imputed.groupby(level='Country')[lag_source_cols]
    .shift(1)
    .add_suffix('_lag1')
)

# shift(1) returns the PREVIOUS ROW of the country, which is only the previous
# YEAR when the years are consecutive. Where a country has a gap, the lag is
# blanked so that "_lag1" always means t-1 (those rows are dropped below).
years = pd.Series(
    df_panel_imputed.index.get_level_values('Year').to_numpy(),
    index=df_panel_imputed.index,
)
year_gap = years - years.groupby(level='Country').shift(1)
lagged.loc[(year_gap != 1).to_numpy(), :] = np.nan

df_panel_imputed = pd.concat([df_panel_imputed, lagged], axis=1)

# Drop rows with NaNs introduced by lagging
df_panel_final = df_panel_imputed.dropna()
print(f"Panel data shape after creating lags and dropping NaNs: {df_panel_final.shape}")

# Define independent vars for panel models (CONTEMPORANEOUS effects)
X_panel_vars = [col for col in indep_vars if col in df_panel_final.columns] # Use original non-lagged vars
if not X_panel_vars:
    print("Error: No independent variables left after processing.")
    # Stop the cell here; exit() is the interactive helper and is not a
    # dependable way to halt a notebook cell.
    raise SystemExit(1)

X_panel = df_panel_final[X_panel_vars]
Y_panel = df_panel_final[dep_var]

# Add constant for Pooled OLS / VIF check
X_panel_const = sm.add_constant(X_panel, has_constant='add')

# --- Multicollinearity Check (on contemporaneous data) ---
# Diagnostic only: VIFs are reported, no feature is removed.
print("\n--- Multicollinearity Check (VIF on contemporaneous data) ---")
try:
    vif_data = pd.DataFrame({"feature": X_panel_const.columns})
    print("Calculating VIF...")
    start_time = time.time()
    design_matrix = X_panel_const.values
    vif_scores = []
    for col_position in range(X_panel_const.shape[1]):
        vif_scores.append(variance_inflation_factor(design_matrix, col_position))
    vif_data["VIF"] = vif_scores
    end_time = time.time()
    print(f"VIF calculation took {end_time - start_time:.2f} seconds.")

    # The intercept's VIF is not of interest; rank the rest from worst to best.
    not_intercept = vif_data['feature'] != 'const'
    vif_data = vif_data[not_intercept].sort_values(by='VIF', ascending=False)
    print("\nTop 15 VIF Scores:")
    print(vif_data.head(15))

    above_threshold = vif_data['VIF'] > vif_threshold
    high_vif_features = vif_data.loc[above_threshold, 'feature'].tolist()
    if not high_vif_features:
        print(f"\nNo features found with VIF > {vif_threshold}.")
    else:
        print(f"\nNote: {len(high_vif_features)} features with VIF > {vif_threshold} found.")
        print("High VIF is common in panel data with many related indicators per year.")
        print("Consider regularization (Lasso/Ridge), PCA, or domain expertise for feature selection if needed.")
        print("Proceeding with all features for demonstration.")
        # Example: df_X_processed = X_panel.drop(columns=high_vif_features)

except Exception as e:
    print(f"\nError calculating VIF: {e}. Skipping VIF analysis.")

# --- Save Processed Panel Data (Highly Recommended) ---
# The (Country, Year) index is flattened into regular columns for the export.
try:
    panel_export = df_panel_final.reset_index()
    panel_export.to_excel(panel_data_file, index=False, engine='openpyxl', na_rep='')
except Exception as e:
    print(f"\nError saving processed panel data: {e}")
else:
    print(f"\nSaved fully processed panel data (with lags) to '{panel_data_file}'")

# --- 5. Panel Data Modeling ---
# Fits Pooled OLS, entity Fixed Effects and Random Effects on the
# contemporaneous regressors. Each result name (`pooled_res`, `fe_res`,
# `re_res`) is set to None when its model cannot be estimated.
print("\n--- Panel Data Model Estimation ---")

# Define dependent and CONTEMPORANEOUS independent variables for linearmodels.
# `exog` carries an explicit intercept; it is used by Pooled OLS and Random
# Effects. Fixed Effects uses `X_panel` because the country effects already
# absorb the intercept.
exog = sm.add_constant(X_panel)
endog = Y_panel

# a) Pooled OLS (with Clustered Standard Errors)
print("\n--- a) Pooled OLS ---")
try:
    model_pooled = PooledOLS(endog, exog)
    # Cluster by Country (entity) - common practice
    pooled_res = model_pooled.fit(cov_type='clustered', cluster_entity=True)
    print(pooled_res)
except Exception as e:
    print(f"Error fitting Pooled OLS: {e}")
    pooled_res = None

# b) Fixed Effects (Entity - Country)
print("\n--- b) Fixed Effects (Country) ---")
try:
    model_fe = PanelOLS(endog, X_panel, entity_effects=True)
    fe_res = model_fe.fit(cov_type='clustered', cluster_entity=True)
    print(fe_res)
except np.linalg.LinAlgError as lae:
    print(f"Linear algebra error fitting FE (likely perfect multicollinearity within groups): {lae}")
    fe_res = None
except Exception as e:
    print(f"Error fitting Fixed Effects: {e}")
    fe_res = None

# c) Random Effects
print("\n--- c) Random Effects ---")
try:
    # Random Effects needs an explicit intercept: without it the fit is forced
    # through the origin, which is not appropriate for an index-level target.
    model_re = RandomEffects(endog, exog)
    re_res = model_re.fit(cov_type='clustered', cluster_entity=True)
    print(re_res)
except np.linalg.LinAlgError as lae:
    print(f"Linear algebra error fitting RE (likely perfect multicollinearity within groups): {lae}")
    re_res = None
except Exception as e:
    print(f"Error fitting Random Effects: {e}")
    re_res = None

# d) Hausman Test (Compare FE and RE)
# `compare()` only prints the two fits side by side; it does not report a
# Hausman statistic. The statistic is therefore computed explicitly:
#   H = (b_FE - b_RE)' [Var(b_FE) - Var(b_RE)]^+ (b_FE - b_RE)  ~  chi2(dof)
# over the coefficients both models share (slopes; the RE intercept has no FE
# counterpart).
print("\n--- d) Hausman Test (FE vs RE) ---")
if fe_res is not None and re_res is not None:
    try:
        # Local import: scipy is only needed for the chi-square tail probability.
        from scipy import stats

        # Refit with the default (unadjusted) covariance: the Hausman variance
        # difference assumes RE is efficient, which clustered errors do not give.
        fe_res_unclustered = PanelOLS(endog, X_panel, entity_effects=True).fit()
        re_res_unclustered = RandomEffects(endog, exog).fit()

        # Side-by-side table of both fits
        comparison = compare({"FE": fe_res_unclustered, "RE": re_res_unclustered})
        print(comparison)

        common_coefs = [name for name in fe_res_unclustered.params.index
                        if name in re_res_unclustered.params.index]
        coef_diff = (fe_res_unclustered.params[common_coefs]
                     - re_res_unclustered.params[common_coefs]).to_numpy()
        cov_diff = (fe_res_unclustered.cov.loc[common_coefs, common_coefs]
                    - re_res_unclustered.cov.loc[common_coefs, common_coefs]).to_numpy()

        # Pseudo-inverse + rank keep the test defined when the variance
        # difference is singular.
        hausman_dof = int(np.linalg.matrix_rank(cov_diff))
        if hausman_dof == 0:
            print("Hausman test not computable: no usable coefficient differences.")
        else:
            hausman_stat = float(coef_diff @ np.linalg.pinv(cov_diff) @ coef_diff)
            hausman_pvalue = float(stats.chi2.sf(hausman_stat, hausman_dof))
            print(f"\nHausman statistic: {hausman_stat:.4f} (dof = {hausman_dof}), p-value: {hausman_pvalue:.4g}")
            if hausman_stat < 0:
                print("  Warning: negative statistic (variance difference not positive definite in this sample); result is inconclusive.")
            print("\nInterpret Hausman Test:")
            print("  - If the p-value is small (e.g., < 0.05), reject the RE null hypothesis.")
            print("    This suggests FE model is preferred due to correlation between effects and regressors.")
            print("  - If P-value is large, RE might be more efficient (if assumptions hold).")

    except Exception as e:
        print(f"Error performing Hausman test/comparison: {e}")
else:
    print("Skipping Hausman test as one or both models (FE/RE) failed to estimate.")

# --- 6. AVM Example (Random Forest with Lags & Basic Uncertainty) ---
# Predicts the current house price index from last period's features and
# illustrates a rough uncertainty band from the spread of the individual trees.
print("\n--- AVM Example (Random Forest with Lags) ---")

# Define features (lagged) and target
avm_target_col = 'house_price_index'
# Use lagged ESG and lagged house price as features
avm_feature_cols = [col for col in df_panel_final.columns if col.endswith('_lag1')]

if not avm_feature_cols:
    print("Error: No lagged feature columns found for AVM.")
else:
    X_avm = df_panel_final[avm_feature_cols]
    Y_avm = df_panel_final[avm_target_col]

    print(f"AVM using {len(avm_feature_cols)} lagged features to predict {avm_target_col}.")
    print(f"AVM dataset shape: Features {X_avm.shape}, Target {Y_avm.shape}")

    # Simple Train/Test Split (Consider time-series split for real application:
    # a random split mixes years, so the test score is optimistic for forecasting)
    X_train, X_test, y_train, y_test = train_test_split(X_avm, Y_avm, test_size=0.25, random_state=42)
    print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

    # Initialize and Train Random Forest
    rf_model = RandomForestRegressor(n_estimators=100, # Number of trees
                                     random_state=42,
                                     n_jobs=-1,       # Use all available CPU cores
                                     max_features=0.5, # Consider using a subset of features per split
                                     min_samples_leaf=5 # Prevent overfitting on small country groups
                                     )
    print("Training Random Forest model...")
    start_time = time.time()
    rf_model.fit(X_train, y_train)
    end_time = time.time()
    print(f"RF training took {end_time - start_time:.2f} seconds.")

    # Make Predictions
    y_pred_rf = rf_model.predict(X_test)

    # Evaluate
    rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
    r2_rf = r2_score(y_test, y_pred_rf)
    print("\nRandom Forest Performance (on test set):")
    print(f"  RMSE: {rmse_rf:.4f}")
    print(f"  R2 Score: {r2_rf:.4f}")

    # Feature Importances
    importances = rf_model.feature_importances_
    feature_importance_df = pd.DataFrame({'feature': X_avm.columns, 'importance': importances})
    feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)
    print("\nTop 15 Feature Importances (Random Forest):")
    print(feature_importance_df.head(15))

    # --- Basic Uncertainty Illustration (using tree predictions) ---
    print("\nBasic Uncertainty Illustration (Standard Deviation of Tree Predictions):")
    # Get predictions from each tree in the forest for the test set.
    # The individual trees are fitted on plain arrays, so they are queried with
    # a plain array too; passing the DataFrame makes every tree emit a
    # feature-name mismatch warning.
    X_test_values = X_test.to_numpy()
    tree_predictions = np.array([tree.predict(X_test_values) for tree in rf_model.estimators_])
    # Calculate the standard deviation across tree predictions for each test sample
    prediction_std_dev = np.std(tree_predictions, axis=0)
    # Combine with mean prediction
    uncertainty_df = pd.DataFrame({
        'Actual': y_test,
        'Prediction': y_pred_rf,
        'Pred_StdDev': prediction_std_dev,
        'Lower_Approx (2std)': y_pred_rf - 2 * prediction_std_dev,
        'Upper_Approx (2std)': y_pred_rf + 2 * prediction_std_dev
    })
    print(uncertainty_df.head())
    print("NOTE: This std dev across trees is a basic heuristic for uncertainty.")
    print("      Proper methods like Conformal Prediction or Quantile Regression are recommended.")


# --- 7. Outlines for Other Frameworks ---
# Placeholder descriptions only; each section is emitted as one
# newline-joined block of text.
print("\n--- Further Framework Outlines ---")

# ** DSS (Placeholder)**
print("\n".join((
    "\n2. Model-Driven Decision Support System (DSS):",
    "   - Combine panel/AVM results (predicted growth, uncertainty) with ESG scores.",
    "   - Use MCDA (AHP, Scoring) with defined weights for financial vs. ESG criteria.",
    "   - Rank countries/regions for investment suitability.",
)))

# ** Positive Impact (Placeholder)**
print("\n".join((
    "\n3. Positive Impact Framework:",
    "   - Select specific ESG metrics aligned with impact goals (e.g., renewables, education).",
    "   - Analyze correlation/causality between changes in these ESG metrics and house price performance over time (using panel data).",
    "   - Identify investments offering dual returns (financial + positive ESG).",
)))

# ** Spatial Models (Placeholder)**
print("\n".join((
    "\n4. Spatially Enhanced Hedonic Regression:",
    "   - Requires geographic data (lat/lon, regions).",
    "   - Use `pysal` library.",
    "   - Models: SAR, SEM (account for spatial dependence), GWR (allow ESG impacts to vary geographically).",
)))

print("\n--- Analysis Script Finished ---")

Loading original ESG data from: /content/ESG_indicators.csv
Original ESG data loaded. Shape: (707, 36)
ESG columns: ['Country', 'Year', 'coastal_protection', 'control_corruption_estimate', 'economic_and_social_rights_performance_score', 'electricity_production_from_coal_sources_total', 'energy_imports_net_energy_use', 'energy_intensity_level_primary_energy_mj_2017_ppp_gdp', 'energy_use_kg_oil_equivalent_per_capita', 'fertility_rate_total_births_per_woman', 'food_production_index_2014_2016_100', 'fossil_fuel_energy_consumption_total', 'gdp_growth_annual', 'gini_index', 'government_expenditure_on_education_total_government_expenditure', 'hospital_beds_per_1_000_people', 'income_share_held_by_lowest_20', 'individuals_using_the_internet_population', 'land_surface_temperature', 'level_water_stress_freshwater_withdrawal_as_a_proportion_available_freshwater_resources', 'life_expectancy_at_birth_total_years', 'literacy_rate_adult_total_people_ages_15_and_above', 'people_using_safely_managed_dr

In [16]:
!pip install mapie
!pip install linearmodels

Collecting mapie
  Downloading MAPIE-0.9.2-py3-none-any.whl.metadata (12 kB)
Collecting scikit-learn<1.6.0 (from mapie)
  Downloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading MAPIE-0.9.2-py3-none-any.whl (178 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m178.1/178.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn, mapie
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
Successfully installed mapie-0.9.2 scikit-learn-1.5.2


### Improved Model

In [18]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
# from sklearn.preprocessing import StandardScaler # StandardScaler removed for now, can be added if needed
from linearmodels.panel import PanelOLS, RandomEffects, PooledOLS
from linearmodels.panel import compare # For Hausman test
from mapie.regression import MapieRegressor # For Conformal Prediction
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time # To time potentially long operations

# Silence warning categories that would otherwise flood the cell output.
for _noisy_category in (FutureWarning, UserWarning):
    warnings.filterwarnings('ignore', category=_noisy_category)
# Anything emitted from inside linearmodels is suppressed as well.
warnings.filterwarnings('ignore', module='linearmodels.*')
# Convergence chatter that can show up while the VIF loop runs.
warnings.filterwarnings("ignore", message="Maximum number of iterations has been exceeded.")


# --- Configuration ---
# Input / output locations. Only `panel_data_file` is read by the visible part
# of this cell; the other paths are presumably used by the earlier preparation
# steps (verify before removing).
original_esg_file = '/content/ESG_indicators.csv'
house_price_file = '/content/OECD_HOUSE_PRICES.csv'
panel_data_file = 'PANEL_ESG_House_Prices_Processed.xlsx'
merged_file_check = 'MERGED_CHECK_ESG_House_Prices.xlsx'

# Regressors whose variance inflation factor exceeds this value are dropped.
vif_threshold = 10.0
# Miscoverage level for the conformal intervals (target coverage = 1 - alpha_uq).
alpha_uq = 0.1

# --- [LOAD DATA, RESHAPE, MERGE, IMPUTE, LAG - Code from previous step assumed here] ---
# --- Assume df_panel_final exists and is the fully processed panel data with lags ---
# --- (Code for loading/prep/lagging is omitted for brevity, but should be run first) ---

# --- Load the processed panel data if starting from scratch ---
print(f"Loading processed panel data from: {panel_data_file}")
try:
    # Keep the try body to the single call that can fail on I/O.
    df_panel_final = pd.read_excel(panel_data_file, engine='openpyxl')
except FileNotFoundError:
    print(f"Error: Processed panel file not found at '{panel_data_file}'. Please run the previous steps first.")
    # Re-raise instead of calling exit(): inside a notebook exit() does not
    # stop the running cell, so the statements below would still execute
    # against a missing `df_panel_final` (and the kernel would be shut down).
    raise
except Exception as e:
    print(f"Error reading processed Excel file '{panel_data_file}': {e}")
    raise
else:
    # The panel estimators below need a (Country, Year) MultiIndex.
    if not isinstance(df_panel_final.index, pd.MultiIndex):
        if 'Country' in df_panel_final.columns and 'Year' in df_panel_final.columns:
            df_panel_final = df_panel_final.set_index(['Country', 'Year']).sort_index()
            print("Set MultiIndex from loaded file.")
        else:
            print("Warning: Could not set MultiIndex from loaded file. Check columns.")
    print("Processed panel data loaded. Shape:", df_panel_final.shape)

# --- Define variables again after loading ---
dep_var = 'house_price_index'
# Contemporaneous regressors: every column except the target and the `_lag1` copies.
indep_vars = [
    col for col in df_panel_final.columns
    if not (col == dep_var or col.endswith('_lag1'))
]
X_panel_contemp_vars = indep_vars  # For VIF reduction (same list object)

# --- Multicollinearity Handling (Re-run or confirm based on loaded data) ---
print("\n--- Multicollinearity Check & Handling (for Panel OLS/FE/RE) ---")
# A constant is added so each VIF is computed against a model with an intercept.
X_vif_check = sm.add_constant(df_panel_final[X_panel_contemp_vars], has_constant='add')
vif_iterations = 0
max_vif_iterations = X_vif_check.shape[1]  # cannot drop more columns than exist
high_vif_exists = True
dropped_features_vif = []
print("Calculating and iteratively reducing VIF...")
start_time_vif = time.time()
# Backward elimination: recompute all VIFs, drop the single worst offender,
# repeat until everything is under the threshold or the budget runs out.
while high_vif_exists and vif_iterations < max_vif_iterations:
    vif_iterations += 1
    vif_data = pd.DataFrame({"feature": X_vif_check.columns})
    design_matrix = X_vif_check.values
    try:
        vif_data["VIF"] = [
            variance_inflation_factor(design_matrix, col_pos)
            for col_pos in range(design_matrix.shape[1])
        ]
    except Exception as e:
        print(f"VIF calculation failed (iteration {vif_iterations}): {e}. Stopping VIF reduction.")
        break
    # The intercept's own VIF is not a candidate for removal.
    vif_data = vif_data.loc[vif_data['feature'] != 'const']
    max_vif = vif_data['VIF'].max()
    if not (max_vif > vif_threshold):
        print(f" Iteration {vif_iterations}: Max VIF ({max_vif:.2f}) is below threshold {vif_threshold}. Stopping.")
        high_vif_exists = False
        continue
    feature_to_drop = vif_data.sort_values('VIF', ascending=False)['feature'].iloc[0]
    print(f" Iteration {vif_iterations}: Dropping '{feature_to_drop}' (VIF: {max_vif:.2f})")
    X_vif_check = X_vif_check.drop(columns=feature_to_drop)
    dropped_features_vif.append(feature_to_drop)
    # Stop once only the constant is left.
    high_vif_exists = X_vif_check.shape[1] > 1
end_time_vif = time.time()
print(f"VIF reduction process took {end_time_vif - start_time_vif:.2f} seconds.")

# Surviving regressors (constant excluded) and the matching model matrices.
X_panel_processed_vars = [name for name in X_vif_check.columns if name != 'const']
X_panel_processed = df_panel_final[X_panel_processed_vars]
Y_panel = df_panel_final[dep_var]
print(f"Number of features remaining after VIF reduction: {len(X_panel_processed_vars)}")
print(f"Features dropped due to high VIF: {dropped_features_vif or 'None'}")

# --- 5. Panel Data Modeling (Using VIF-Reduced Features) ---
print("\n--- Panel Data Model Estimation (using VIF-reduced features) ---")
# linearmodels never adds an intercept on its own. Pooled OLS and Random
# Effects therefore use `exog_processed` (with constant); Fixed Effects uses
# the bare regressors because the entity effects absorb the constant.
exog_processed = sm.add_constant(X_panel_processed, has_constant='add')
endog = Y_panel

# Reset the result handles so a failed fit cannot be masked by stale results
# left in the namespace by an earlier run of this cell.
pooled_res = None
fe_res = None
re_res = None

# a) Pooled OLS
print("\n--- a) Pooled OLS (VIF-Reduced) ---")
try:
    model_pooled = PooledOLS(endog, exog_processed)
    pooled_res = model_pooled.fit(cov_type='clustered', cluster_entity=True)
    print(pooled_res)
except Exception as e:
    print(f"Error: {e}")

# b) Fixed Effects
print("\n--- b) Fixed Effects (Country, VIF-Reduced) ---")
try:
    model_fe = PanelOLS(endog, X_panel_processed, entity_effects=True)
    fe_res = model_fe.fit(cov_type='clustered', cluster_entity=True)
    print(fe_res)
except Exception as e:
    print(f"Error: {e}")

# c) Random Effects
print("\n--- c) Random Effects (VIF-Reduced) ---")
try:
    # Fitted WITH the constant: omitting it forces a zero intercept, which
    # distorts the slope estimates for an index-valued dependent variable.
    model_re = RandomEffects(endog, exog_processed)
    re_res = model_re.fit(cov_type='clustered', cluster_entity=True)
    print(re_res)
except Exception as e:
    print(f"Error: {e}")

# d) Hausman Test
print("\n--- d) Hausman Test (FE vs RE, VIF-Reduced) ---")
fe_model_fitted = fe_res is not None
re_model_fitted = re_res is not None
if fe_model_fitted and re_model_fitted:
    try:
        # Local import: scipy is only needed for the chi-square tail probability.
        from scipy import stats

        # The classical Hausman statistic is built from the default
        # (unadjusted) covariance estimates, hence the unclustered refits.
        fe_res_unclustered = PanelOLS(endog, X_panel_processed, entity_effects=True).fit()
        re_res_unclustered = RandomEffects(endog, exog_processed).fit()

        # Side-by-side coefficient table (this alone is NOT a Hausman test).
        comparison = compare({"FE": fe_res_unclustered, "RE": re_res_unclustered})
        print(comparison)

        # H = d' [V_FE - V_RE]^-1 d  with d = b_FE - b_RE, over the slope
        # coefficients both models share (the RE intercept is excluded).
        shared_coefs = [
            name for name in fe_res_unclustered.params.index
            if name != 'const' and name in re_res_unclustered.params.index
        ]
        coef_gap = (
            fe_res_unclustered.params[shared_coefs] - re_res_unclustered.params[shared_coefs]
        ).to_numpy()
        cov_gap = (
            fe_res_unclustered.cov.loc[shared_coefs, shared_coefs]
            - re_res_unclustered.cov.loc[shared_coefs, shared_coefs]
        ).to_numpy()
        # pinv guards against a singular covariance difference. In finite
        # samples the difference may not be positive definite, in which case
        # the statistic can come out negative and the p-value is reported as 1.
        hausman_stat = float(coef_gap @ np.linalg.pinv(cov_gap) @ coef_gap)
        hausman_dof = len(shared_coefs)
        hausman_pvalue = float(stats.chi2.sf(hausman_stat, hausman_dof))
        print(f"\nHausman statistic: {hausman_stat:.4f} (df={hausman_dof}), p-value: {hausman_pvalue:.4f}")
        print("H0: Random Effects is consistent; a small p-value favours Fixed Effects.")
    except Exception as e:
        print(f"Error performing Hausman test/comparison: {e}")
else:
    print("Skipping Hausman test as one or both models (FE/RE) failed to estimate.")


# --- 6. AVM (Random Forest with Lags & Conformal Prediction UQ) ---
print("\n--- AVM (Random Forest) with Conformal Prediction UQ ---")

# Define features (lagged - use ORIGINAL set before VIF drop) and target
avm_target_col = 'house_price_index'
avm_feature_cols = [col for col in df_panel_final.columns if col.endswith('_lag1')]

if not avm_feature_cols:
    print("Error: No lagged feature columns found for AVM.")
else:
    X_avm = df_panel_final[avm_feature_cols]
    Y_avm = df_panel_final[avm_target_col]
    print(f"AVM using {len(avm_feature_cols)} lagged features to predict {avm_target_col}.")

    # Split data: 60% train, 20% calibration, 20% test
    # NOTE(review): the split is random across country-years, so rows from the
    # same country (and adjacent years) land on both sides of the split; a
    # time- or country-grouped split would give a more honest estimate.
    X_train_cal, X_test, y_train_cal, y_test = train_test_split(
        X_avm, Y_avm, test_size=0.20, random_state=42
    )
    X_train, X_cal, y_train, y_cal = train_test_split(
        X_train_cal, y_train_cal, test_size=0.25, random_state=42 # 0.25 * 0.8 = 0.2
    )
    print(f"Train shape: {X_train.shape}, Calibration shape: {X_cal.shape}, Test shape: {X_test.shape}")

    # Initialize Base Model (Random Forest)
    rf_base_model = RandomForestRegressor(n_estimators=100,
                                          random_state=42,
                                          n_jobs=-1,
                                          max_features=0.5,
                                          min_samples_leaf=5)

    # Split conformal prediction with an explicit calibration set:
    #   1. fit the forest on the training split only,
    #   2. hand the already-fitted model to MAPIE with cv="prefit",
    #   3. let MAPIE compute conformity scores on the calibration split.
    # `MapieRegressor.fit` has no X_calib / y_calib parameters (those belong to
    # `MapieQuantileRegressor`), so passing them there does not calibrate on
    # the held-out split.
    print("Fitting base RF model and calibrating with MAPIE...")
    start_time_mapie = time.time()
    rf_base_model.fit(X_train, y_train)
    mapie_reg = MapieRegressor(rf_base_model, cv="prefit")
    mapie_reg.fit(X_cal, y_cal)
    end_time_mapie = time.time()
    print(f"MAPIE fitting/calibration took {end_time_mapie - start_time_mapie:.2f} seconds.")

    # Predict point estimates and prediction intervals on the test set.
    # y_pis is indexed as [sample, lower(0)/upper(1), alpha index].
    y_pred_mapie, y_pis = mapie_reg.predict(X_test, alpha=alpha_uq)
    lower_bound = y_pis[:, 0, 0]
    upper_bound = y_pis[:, 1, 0]

    # Evaluate point predictions
    rmse_mapie = np.sqrt(mean_squared_error(y_test, y_pred_mapie))
    r2_mapie = r2_score(y_test, y_pred_mapie)
    print(f"\nRandom Forest Point Prediction Performance (Test Set):")
    print(f"  RMSE: {rmse_mapie:.4f}")
    print(f"  R2 Score: {r2_mapie:.4f}")

    # Evaluate prediction intervals: empirical coverage and mean width.
    coverage = np.mean((y_test >= lower_bound) & (y_test <= upper_bound))
    avg_width = np.mean(upper_bound - lower_bound)
    print(f"\nConformal Prediction Interval Performance (alpha={alpha_uq:.2f}):")
    print(f"  Target Coverage: {1 - alpha_uq:.1%}")
    print(f"  Actual Coverage (Test Set): {coverage:.1%}")
    print(f"  Average Interval Width: {avg_width:.4f}")

    # Display some predictions with intervals
    results_df = pd.DataFrame({
        'Actual': y_test,
        'Predicted': y_pred_mapie,
        f'Lower_{1-alpha_uq:.0%}': lower_bound,
        f'Upper_{1-alpha_uq:.0%}': upper_bound,
        'Interval_Width': upper_bound - lower_bound
    })
    # Sort by index to make it easier to compare if needed
    results_df = results_df.sort_index()
    print("\nSample Predictions with Conformal Intervals:")
    print(results_df.head())

    # Feature importances come straight from the forest fitted above on the
    # training split; no second fit is needed.
    importances = rf_base_model.feature_importances_
    feature_importance_df = pd.DataFrame({'feature': X_avm.columns, 'importance': importances})
    feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)
    print("\nTop 15 Feature Importances (Base Random Forest for AVM):")
    print(feature_importance_df.head(15))


# --- 7. Outlines for Other Frameworks ---
# (Keep the existing outlines for DSS, Positive Impact, Spatial Models)
print("\n--- Further Framework Outlines ---")
# ... (outlines remain the same) ...

print("\n--- Analysis Script Finished ---")

Loading processed panel data from: PANEL_ESG_House_Prices_Processed.xlsx
Set MultiIndex from loaded file.
Processed panel data loaded. Shape: (650, 70)

--- Multicollinearity Check & Handling (for Panel OLS/FE/RE) ---
Calculating and iteratively reducing VIF...
 Iteration 1: Dropping 'life_expectancy_at_birth_total_years' (VIF: 32.55)
 Iteration 2: Dropping 'rule_law_estimate' (VIF: 30.99)
 Iteration 3: Max VIF (8.57) is below threshold 10.0. Stopping.
VIF reduction process took 0.26 seconds.
Number of features remaining after VIF reduction: 32
Features dropped due to high VIF: ['life_expectancy_at_birth_total_years', 'rule_law_estimate']

--- Panel Data Model Estimation (using VIF-reduced features) ---

--- a) Pooled OLS (VIF-Reduced) ---
                          PooledOLS Estimation Summary                          
Dep. Variable:      house_price_index   R-squared:                        0.5011
Estimator:                  PooledOLS   R-squared (Between):              0.4111
No. Obs