In [52]:
# Necessary Packages
# Finish updating this later
import numpy as np
import pandas as pd
from scipy.stats import norm
import os
# Absolute path of the working directory at execution time; passed to the TARE_IO
# loaders below as the root they resolve their input files from.
project_root = os.path.abspath(os.getcwd())

In [None]:
from importnb import Notebook
# The import runs inside importnb's Notebook() context
# (presumably because tare_model_IO_functions_v1_4_1 is an .ipynb rather than a .py module -- TODO confirm).
# A `with` block does not create a scope, so every name assigned here is a module-level global;
# later cells (e.g. calculate_marginal_damages, calculate_annual_fuelCost) read several of them directly.
with Notebook():
    import tare_model_IO_functions_v1_4_1 as TARE_IO
    # Gather data
    # Heating-degree-day projection factors, looked up by census division and year.
    hdd_factor_lookup = TARE_IO.load_hdd_factors(project_root)
    # Three electricity emission lookups, used for different policy scenarios / years.
    emis_preIRA_cambium21_lookup, emis_IRA_2024_cambium22_lookup, emis_IRA_2025_2050_cambium23_lookup = TARE_IO.load_cambium_lookup(project_root)
    # Per-kWh CO2e emission factors for the three fossil fuels.
    emis_factor_co2e_naturalGas_ton_perkWh, emis_factor_co2e_propane_ton_perkWh, emis_factor_co2e_fuelOil_ton_perkWh = TARE_IO.load_emis_factors()
    # CPI ratios must be loaded before the loaders below, which take them as arguments.
    cpi_ratio_2023_2023, cpi_ratio_2023_2022, cpi_ratio_2023_2021, cpi_ratio_2023_2020, cpi_ratio_2023_2019, cpi_ratio_2023_2018, cpi_ratio_2023_2013, cpi_ratio_2023_2010, cpi_ratio_2023_2008 = TARE_IO.load_cpi_data(project_root)
    # Social cost of carbon; the 2020 CPI ratio is passed in (presumably to inflate USD2020 to USD2023 -- verify in TARE_IO).
    epa_scc_usd2023_per_ton = TARE_IO.load_scc(cpi_ratio_2023_2020)
    # Fuel price lookups for the two policy scenarios.
    preIRA_fuel_price_lookup, iraRef_fuel_price_lookup = TARE_IO.load_fuel_price_lookups(project_root, cpi_ratio_2023_2018, cpi_ratio_2023_2019, cpi_ratio_2023_2020, cpi_ratio_2023_2021, cpi_ratio_2023_2022)
    # Equipment cost inputs.
    rsMeans_national_avg = TARE_IO.load_rsMeans_national_avg(cpi_ratio_2023_2019)
    dict_heating_equipment_cost = TARE_IO.load_dict_heating_equipment_cost(project_root)
    # Median income tables at PUMA, county, and state level.
    df_puma_medianIncome = TARE_IO.load_df_puma_medianIncome(project_root, cpi_ratio_2023_2022)
    df_county_medianIncome = TARE_IO.load_df_county_medianIncome(project_root, cpi_ratio_2023_2022)
    df_state_medianIncome = TARE_IO.load_df_state_medianIncome(project_root, cpi_ratio_2023_2022)

In [53]:
# Prompt offering the choice between a nationwide analysis (N) and a single-state filter (Y).
menu_prompt = """
Would you like to filter for a specific state's data? Please enter one of the following:
N. I'd like to analyze all of the United States.
Y. I'd like to filter data for a specific state.
"""

# Header text shown before the list of cities available in the selected state.
city_prompt = """
To accurately characterize load profile, it is recommended to select subsets of data with >= 1000 models (~240,000 representative dwelling units).

The following cities (number of models also shown) are available for this state:
"""

# Prompt offering the choice between the whole selected state (N) and a single-city filter (Y).
city_menu_prompt = """
Would you like to filter a subset of city-level data? Please enter one of the following:
N. I'd like to analyze all of my selected state.
Y. I'd like to filter by city in the state.
"""

def get_menu_choice(prompt, choices):
    """Prompt repeatedly until the user enters one of ``choices``.

    Parameters:
    - prompt: Text passed to ``input()``.
    - choices: Container of accepted (upper-case) answers, e.g. ``['N', 'Y']``.

    Returns:
    - The accepted answer, upper-cased and with surrounding whitespace removed.
    """
    while True:
        # Strip so that an answer typed with stray spaces ("y ") is still recognised.
        choice = input(prompt).strip().upper()
        # An empty string is "in" every str, so it must be rejected explicitly
        # in case ``choices`` is passed as a plain string such as "NY".
        if choice and choice in choices:
            return choice
        print("Invalid option. Please try again.")

def get_state_choice(df_copy):
    """Prompt repeatedly until the user enters a state present in ``df_copy['in.state']``.

    Parameters:
    - df_copy: DataFrame with an 'in.state' column of two-letter state abbreviations.

    Returns:
    - The matching abbreviation, upper-cased and with surrounding whitespace removed.
    """
    while True:
        # Strip so that " pa " matches 'PA' instead of being rejected as invalid.
        input_state = input("Which state would you like to analyze data for? Please enter the two-letter abbreviation: ").strip().upper()
        if df_copy['in.state'].eq(input_state).any():
            return input_state
        print("Invalid state abbreviation. Please try again.")

def get_city_choice(df_copy, input_state):
    """Prompt repeatedly until the user enters a city that exists for ``input_state``.

    The 'in.city' column stores values as "<STATE>, <City>", so the typed city name
    is combined with ``input_state`` before it is compared.

    Parameters:
    - df_copy: DataFrame with an 'in.city' column.
    - input_state: Two-letter state abbreviation used as the prefix.

    Returns:
    - The city name as typed (case preserved), with surrounding whitespace removed.
    """
    while True:
        # Strip so that a trailing space does not make an otherwise valid city fail.
        input_cityFilter = input("Please enter the city name ONLY (e.g., Pittsburgh): ").strip()
        city_filter = df_copy['in.city'].eq(f"{input_state}, {input_cityFilter}")
        if city_filter.any():
            return input_cityFilter
        print("Invalid city name. Please try again.")

# Baseline

In [54]:
import pandas as pd
import numpy as np
import re

def standardize_fuel_name(fuel_desc):
    """Map a raw fuel / appliance description onto a standard fuel label.

    Returns one of 'Electricity', 'Natural Gas', 'Propane', 'Fuel Oil' or 'Other'.
    Missing values, non-string values, and strings containing none of the
    keywords all map to 'Other'. Matching is case-sensitive.
    """
    # Missing values first, then anything that is not text.
    if pd.isna(fuel_desc) or not isinstance(fuel_desc, str):
        return 'Other'

    # Keywords are tested in this order; the first one contained in the text wins.
    keyword_to_fuel = (
        ('Electric', 'Electricity'),
        ('Gas', 'Natural Gas'),
        ('Propane', 'Propane'),
        ('Oil', 'Fuel Oil'),
    )
    for keyword, standard_name in keyword_to_fuel:
        if keyword in fuel_desc:
            return standard_name

    # Text without any recognised keyword.
    return 'Other'

def preprocess_fuel_data(df, column_name):
    """Standardize the fuel names in one column of ``df`` and return ``df``.

    The column is overwritten on the DataFrame that was passed in (no copy is made),
    and the column dtype is printed before and after the replacement.
    """
    print(f"Processing column: {column_name}")
    print(f"Initial data types: {df[column_name].dtype}")

    # Compute the standardized labels first, then write them back through .loc,
    # which avoids the SettingWithCopyWarning that plain column assignment can raise.
    standardized_labels = df[column_name].apply(standardize_fuel_name)
    df.loc[:, column_name] = standardized_labels

    print(f"Data types after processing: {df[column_name].dtype}")
    return df

def apply_fuel_filter(df, category, enable):
    """Keep only rows whose baseline fuel for ``category`` is one of the four modelled fuels.

    Parameters:
    - df: The DataFrame to filter; must contain the column ``base_{category}_fuel``.
    - category: End-use category (e.g., 'heating', 'waterHeating').
    - enable: String flag; the filter is applied only when it equals 'Yes'.

    Returns:
    - When enabled, an independent copy holding the matching rows.
    - Otherwise ``df`` itself, unchanged.
    """
    if enable != 'Yes':
        return df

    fuel_list = ['Natural Gas', 'Electricity', 'Propane', 'Fuel Oil']
    # .copy() detaches the result from ``df``: callers add columns to the filtered
    # frame, which would raise SettingWithCopyWarning on a boolean-mask slice.
    df_filtered = df[df[f'base_{category}_fuel'].isin(fuel_list)].copy()
    print(f"Filtered for the following fuels: {fuel_list}")
    return df_filtered

def apply_technology_filter(df, category, enable):
    """
    Applies technology filters to the dataframe based on the category and whether filtering is enabled.

    Parameters:
    - df: The DataFrame to filter. Needs a 'heating_type' column for category 'heating'
      and a 'waterHeating_type' column for category 'waterHeating'.
    - category: The category of consumption (e.g., 'heating', 'waterHeating').
    - enable: String flag ('Yes' or 'No') indicating whether to apply the filter.

    Returns:
    - For 'heating' / 'waterHeating' with enable == 'Yes': an independent copy holding
      only the rows whose technology is in the supported list.
    - In every other case: ``df`` itself, unchanged.
    """
    if enable != 'Yes':
        return df

    if category == 'heating':
        tech_list = [
            'Electricity ASHP', 'Electricity Baseboard', 'Electricity Electric Boiler', 'Electricity Electric Furnace',
            'Fuel Oil Fuel Boiler', 'Fuel Oil Fuel Furnace', 'Natural Gas Fuel Boiler', 'Natural Gas Fuel Furnace',
            'Propane Fuel Boiler', 'Propane Fuel Furnace'
        ]
        # .copy() so that callers can add columns to the result without a
        # SettingWithCopyWarning (a boolean-mask slice is flagged as a possible view).
        df_filtered = df[df['heating_type'].isin(tech_list)].copy()
        print(f"Filtered for the following Heating technologies: {tech_list}")
        return df_filtered

    if category == 'waterHeating':
        tech_list = [
            'Electric Heat Pump, 80 gal', 'Electric Premium', 'Electric Standard',
            'Fuel Oil Premium', 'Fuel Oil Standard', 'Natural Gas Premium', 'Natural Gas Standard',
            'Propane Premium', 'Propane Standard'
        ]
        df_filtered = df[df['waterHeating_type'].isin(tech_list)].copy()
        print(f"Filtered for the following Water Heating technologies: {tech_list}")
        return df_filtered

    # Categories without a technology list are passed through untouched.
    return df

def debug_filters(df, filter_name):
    """Print how many rows of ``df`` are left after the step named ``filter_name``."""
    if not df.empty:
        print(f"{len(df)} rows remain after applying {filter_name}")
        return
    # Nothing survived the step.
    print(f"No rows left after applying {filter_name}")

# Function to extract city name
def extract_city_name(row):
    """Strip a leading "<two capital letters>, " prefix from a city string.

    ``row`` is the cell value itself (a string), not a DataFrame row. Values that
    do not start with such a prefix are returned unchanged.
    """
    state_prefixed = re.match(r'^[A-Z]{2}, (.+)$', row)
    if state_prefixed is None:
        return row
    return state_prefixed.group(1)
        
def df_enduse_refactored(df_baseline, fuel_filter='Yes', tech_filter='Yes'):
    """Build the end-use DataFrame (renamed columns plus total baseline consumption).

    Parameters:
    - df_baseline: Raw baseline DataFrame with the 'in.*' / 'out.*' columns listed below.
      NOTE: it is modified in place -- 'in.clothes_dryer' and 'in.cooking_range' are
      overwritten with standardized fuel names and two 'base_*_fuel' columns are added.
    - fuel_filter: 'Yes' to keep only rows with a modelled fuel for each category.
    - tech_filter: 'Yes' to keep only supported heating / water-heating technologies.

    Returns:
    - The filtered end-use DataFrame, or ``df_baseline`` itself when it is empty.
    """
    # Nothing to do for an empty input.
    if df_baseline.empty:
        print("Warning: Input DataFrame is empty")
        return df_baseline

    # Standardize the appliance fuel descriptions before they are copied over.
    for appliance_column in ('in.clothes_dryer', 'in.cooking_range'):
        df_baseline = preprocess_fuel_data(df_baseline, appliance_column)

    # Mirror the standardized names into 'base_*' columns on the baseline frame.
    df_baseline['base_clothesDrying_fuel'] = df_baseline['in.clothes_dryer']
    df_baseline['base_cooking_fuel'] = df_baseline['in.cooking_range']

    # End-use column name -> source column in df_baseline (order defines the output column order).
    source_columns = {
        'bldg_id': 'bldg_id',
        'square_footage': 'in.sqft',
        'census_region': 'in.census_region',
        'census_division': 'in.census_division',
        'census_division_recs': 'in.census_division_recs',
        'building_america_climate_zone': 'in.building_america_climate_zone',
        'reeds_balancing_area': 'in.reeds_balancing_area',
        'state': 'in.state',
        'city': 'in.city',
        'county': 'in.county',
        'puma': 'in.puma',
        'county_and_puma': 'in.county_and_puma',
        'weather_file_city': 'in.weather_file_city',
        'Longitude': 'in.weather_file_longitude',
        'Latitude': 'in.weather_file_latitude',
        'building_type': 'in.geometry_building_type_recs',
        'income': 'in.income',
        'federal_poverty_level': 'in.federal_poverty_level',
        'occupancy': 'in.occupants',
        'tenure': 'in.tenure',
        'vacancy_status': 'in.vacancy_status',
        'base_heating_fuel': 'in.heating_fuel',
        'heating_type': 'in.hvac_heating_type_and_fuel',
        'hvac_cooling_type': 'in.hvac_cooling_type',
        'vintage': 'in.vintage',
        'base_heating_efficiency': 'in.hvac_heating_efficiency',
        'base_electricity_heating_consumption': 'out.electricity.heating.energy_consumption.kwh',
        'base_fuelOil_heating_consumption': 'out.fuel_oil.heating.energy_consumption.kwh',
        'base_naturalGas_heating_consumption': 'out.natural_gas.heating.energy_consumption.kwh',
        'base_propane_heating_consumption': 'out.propane.heating.energy_consumption.kwh',
        'base_waterHeating_fuel': 'in.water_heater_fuel',
        'waterHeating_type': 'in.water_heater_efficiency',
        'base_electricity_waterHeating_consumption': 'out.electricity.hot_water.energy_consumption.kwh',
        'base_fuelOil_waterHeating_consumption': 'out.fuel_oil.hot_water.energy_consumption.kwh',
        'base_naturalGas_waterHeating_consumption': 'out.natural_gas.hot_water.energy_consumption.kwh',
        'base_propane_waterHeating_consumption': 'out.propane.hot_water.energy_consumption.kwh',
        'base_clothesDrying_fuel': 'in.clothes_dryer',
        'base_electricity_clothesDrying_consumption': 'out.electricity.clothes_dryer.energy_consumption.kwh',
        'base_naturalGas_clothesDrying_consumption': 'out.natural_gas.clothes_dryer.energy_consumption.kwh',
        'base_propane_clothesDrying_consumption': 'out.propane.clothes_dryer.energy_consumption.kwh',
        'base_cooking_fuel': 'in.cooking_range',
        'base_electricity_cooking_consumption': 'out.electricity.range_oven.energy_consumption.kwh',
        'base_naturalGas_cooking_consumption': 'out.natural_gas.range_oven.energy_consumption.kwh',
        'base_propane_cooking_consumption': 'out.propane.range_oven.energy_consumption.kwh',
    }
    enduse_data = {target: df_baseline[source] for target, source in source_columns.items()}
    # The city column is the only one that is transformed: drop the "<STATE>, " prefix.
    enduse_data['city'] = enduse_data['city'].apply(extract_city_name)
    df_enduse = pd.DataFrame(enduse_data)

    # Heating and water heating track four fuels and get a technology filter;
    # clothes drying and cooking track three fuels and only get the fuel filter.
    space_and_water = ('heating', 'waterHeating')
    for category in ['heating', 'waterHeating', 'clothesDrying', 'cooking']:
        has_tech_filter = category in space_and_water
        if has_tech_filter:
            fuel_types = ['electricity', 'fuelOil', 'naturalGas', 'propane']
        else:
            fuel_types = ['electricity', 'naturalGas', 'propane']

        # Total consumption across fuels; a total of exactly 0 is stored as NaN.
        total_consumption = 0
        for fuel in fuel_types:
            fuel_consumption = df_enduse.get(f'base_{fuel}_{category}_consumption', pd.Series([], dtype=float))
            total_consumption = total_consumption + fuel_consumption.fillna(0)
        df_enduse[f'baseline_{category}_consumption'] = total_consumption.replace(0, np.nan)
        debug_filters(df_enduse, f"total {category} consumption calculation")

        df_enduse = apply_fuel_filter(df_enduse, category, fuel_filter)
        debug_filters(df_enduse, f"{category} fuel filter")

        if has_tech_filter:
            df_enduse = apply_technology_filter(df_enduse, category, tech_filter)
            debug_filters(df_enduse, f"{category} technology filter")

    return df_enduse

In [46]:
def project_future_consumption(df, hdd_factor_lookup, menu_mp):
    """
    Project annual energy consumption over each equipment lifetime, starting in 2024.

    Parameters:
    df (pd.DataFrame): Input data. Needs 'census_division' and, per category, either
        'baseline_{category}_consumption' (menu_mp == 0) or 'mp{menu_mp}_{category}_consumption'
        plus the already-projected 'baseline_{year}_{category}_consumption' columns (retrofit).
    hdd_factor_lookup (dict): {census division: {year: factor}}; must contain a 'National'
        entry covering every projected year, which is used when a division or year is missing.
    menu_mp (int): Measure package. 0 projects the baseline equipment; any other value
        projects the upgraded equipment and the reduction relative to the baseline.

    Returns:
    pd.DataFrame: A copy of ``df`` with the projected columns appended. Columns that
        already exist under the same name are replaced.

    Raises:
    KeyError: If 'census_division' is missing from ``df``.
    """
    # Equipment lifetime specifications in years
    lifetime_by_category = {
        'heating': 15,
        # 'waterHeating': 12,
        # 'clothesDrying': 13,
        # 'cooking': 15
    }
    # Only these categories are scaled by the HDD factor.
    weather_sensitive = ('heating', 'waterHeating')

    # Work on a copy so the caller's frame is left untouched.
    result = df.copy()
    if 'census_division' not in result.columns:
        raise KeyError("'census_division' column is missing from the DataFrame")

    is_baseline = menu_mp == 0
    prefix = 'baseline' if is_baseline else f'mp{menu_mp}'

    projected = {}
    for category, lifetime in lifetime_by_category.items():
        if is_baseline:
            print(f"Projecting Future Energy Consumption (Baseline Equipment): {category}")
        else:
            print(f"Projecting Future Energy Consumption (Upgraded Equipment): {category}")

        for year_label in range(2024, 2024 + lifetime):
            if category in weather_sensitive:
                # Division-specific factor, falling back to the national factor for that year.
                hdd_factor = result['census_division'].map(
                    lambda division: hdd_factor_lookup.get(division, {}).get(
                        year_label, hdd_factor_lookup['National'][year_label]
                    )
                )
                annual_consumption = result[f'{prefix}_{category}_consumption'] * hdd_factor
            else:
                annual_consumption = result[f'{prefix}_{category}_consumption']

            consumption_col = f'{prefix}_{year_label}_{category}_consumption'
            projected[consumption_col] = annual_consumption.round(2)

            if not is_baseline:
                # Reduction = projected baseline consumption minus projected retrofit consumption.
                baseline_projection = result[f'baseline_{year_label}_{category}_consumption']
                projected[f'mp{menu_mp}_{year_label}_{category}_reduction_consumption'] = (
                    baseline_projection.sub(projected[consumption_col], axis=0, fill_value=0).round(2)
                )

    df_projected = pd.DataFrame(projected, index=result.index)

    # Replace, rather than duplicate, columns left over from an earlier run.
    stale_columns = df_projected.columns.intersection(result.columns)
    if len(stale_columns) > 0:
        result = result.drop(columns=stale_columns)

    return result.join(df_projected, how='left')

In [49]:
# LAST UPDATED SEPTEMBER 8, 2024 @ 5:26 PM
# Constants and mappings
# Transmission and distribution loss fraction applied to electricity consumption.
td_losses = 0.06
# Factor that scales consumed kWh up before the grid emission factor is applied
# (used in calculate_damages_grid_scenario).
td_losses_multiplier = 1 / (1 - td_losses)

# Equipment lifetime in years per end-use category; only heating is currently enabled.
equipment_specs = {
    'heating': 15#, 'waterHeating': 12, 'clothesDrying': 13, 'cooking': 15
}

def calculate_marginal_damages(df, menu_mp, policy_scenario, df_summary):
    """
    Add annual and lifetime CO2e emissions and climate damages to a copy of ``df``.

    Parameters:
    - df (DataFrame): Consumption data; must contain 'census_division' plus the columns
      read by calculate_damages_grid_scenario.
    - menu_mp (int): Identifies measure package (retrofit being conducted); 0 is the baseline.
    - policy_scenario (str): 'No Inflation Reduction Act' or 'AEO2023 Reference Case'.
      Only consulted when menu_mp != 0; it selects the column prefix and the electricity
      emission lookup.
    - df_summary (DataFrame): Summary table that calculate_damages_grid_scenario updates
      IN PLACE with the lifetime (and, for retrofits, avoided) columns. It is not returned.

    Returns:
    - DataFrame: A copy of ``df`` with the new emissions / damages columns joined on.
      Columns that already exist under the same name are replaced.

    Raises:
    - ValueError: For a retrofit (menu_mp != 0) with an unrecognised ``policy_scenario``.

    Reads the module-level globals hdd_factor_lookup, equipment_specs, td_losses_multiplier,
    emis_preIRA_cambium21_lookup and emis_IRA_2025_2050_cambium23_lookup.

    UPDATES:
    - Aug/Sep 2024: Focus on CO2 (EPA SCC of $190USD-2020), Cambium Dataset for Emissions, Removed CEDM and EASIUR, Emissions Lookup/Projections instead of Damages
    """

    df_copy = df.copy()

    # Determine scenario-specific settings
    if menu_mp == 0:
        scenario_prefix = "baseline_"
        cambium_scenario = 'MidCase'
        emis_electricity_lookup = emis_preIRA_cambium21_lookup
    elif policy_scenario == 'No Inflation Reduction Act':
        scenario_prefix = f"preIRA_mp{menu_mp}_"
        cambium_scenario = 'MidCase'
        emis_electricity_lookup = emis_preIRA_cambium21_lookup
    elif policy_scenario == 'AEO2023 Reference Case':
        scenario_prefix = f"iraRef_mp{menu_mp}_"
        cambium_scenario = 'MidCase'
        emis_electricity_lookup = emis_IRA_2025_2050_cambium23_lookup
    else:
        raise ValueError("Invalid Policy Scenario! Please choose from 'No Inflation Reduction Act' or 'AEO2023 Reference Case'.")

    national_hdd = hdd_factor_lookup['National']

    def _hdd_factor(division, year_label):
        """HDD factor for one census division and year, with national and neutral fallbacks."""
        regional_hdd = hdd_factor_lookup.get(division, national_hdd)
        if year_label in regional_hdd:
            return regional_hdd[year_label]
        # A year missing from a regional table falls back to the national factor, matching
        # project_future_consumption; 1.0 (no adjustment) is only the last resort.
        return national_hdd.get(year_label, 1.0)

    # Precompute hdd_factors for each region and year once.
    # Years run from 2024 through the last year of the longest equipment lifetime.
    last_year = 2023 + max(equipment_specs.values())
    hdd_factors_per_year = {
        year_label: df_copy['census_division'].map(
            lambda division, year_label=year_label: _hdd_factor(division, year_label)
        )
        for year_label in range(2024, last_year + 1)
    }

    # Calculate the new columns based on grid scenarios.
    df_new_columns = calculate_damages_grid_scenario(
        df_copy, df_summary, menu_mp, td_losses_multiplier, emis_electricity_lookup,
        policy_scenario, cambium_scenario, scenario_prefix, hdd_factors_per_year
    )

    # Drop overlapping columns and merge new data
    overlapping_columns = df_new_columns.columns.intersection(df_copy.columns)
    if not overlapping_columns.empty:
        df_copy = df_copy.drop(columns=overlapping_columns)

    return df_copy.join(df_new_columns, how='left')


def _grid_emission_factors(df, emis_lookup, cambium_scenario, year_label):
    """Per-row electricity emission factor for ``year_label``; 0 where the lookup has no entry."""
    factors = [
        emis_lookup.get((cambium_scenario, state, balancing_area), {}).get(year_label, np.nan)
        for state, balancing_area in zip(df['state'], df['reeds_balancing_area'])
    ]
    return pd.Series(factors, index=df.index).fillna(0)


def calculate_damages_grid_scenario(df_copy, df_summary, menu_mp, td_losses_multiplier, emis_electricity_lookup, policy_scenario, cambium_scenario, scenario_prefix, hdd_factors_per_year):
    """
    Calculate annual and lifetime CO2e emissions and climate damages for one scenario.

    Parameters:
        df_copy (DataFrame): Consumption data with 'state' and 'reeds_balancing_area'.
            Baseline (menu_mp == 0) reads 'base_{fuel}_{category}_consumption'; retrofit reads
            'mp{menu_mp}_{year}_{category}_consumption' and 'baseline_{category}_lifetime_*'.
        df_summary (DataFrame): Summary table; lifetime (and avoided) columns are written
            to it IN PLACE. It is not returned.
        menu_mp (int): The menu number for the measure package; 0 is the baseline.
        td_losses_multiplier (float): Multiplier applied to electricity consumption.
        emis_electricity_lookup (dict): {(cambium_scenario, state, balancing area): {year: factor}}.
            For retrofits under any scenario other than 'No Inflation Reduction Act' it is
            replaced per year by the module-level IRA lookups.
        policy_scenario (str): Policy scenario name.
        cambium_scenario (str): First element of the lookup key (e.g. 'MidCase').
        scenario_prefix (str): Prefix of the lifetime / avoided column names.
        hdd_factors_per_year (dict): {year: Series of HDD factors}; used for the baseline only.

    Returns:
        DataFrame: The newly calculated columns, indexed like ``df_copy``.

    Reads the module-level globals equipment_specs, epa_scc_usd2023_per_ton, the three
    emis_factor_co2e_*_ton_perkWh factors and the two emis_IRA_* lookups.
    """

    new_columns_data = {}
    is_baseline = menu_mp == 0
    # Annual baseline columns always carry the literal 'baseline_' prefix,
    # independent of the scenario_prefix that was passed in.
    annual_prefix = 'baseline_' if is_baseline else scenario_prefix

    for category, lifetime in equipment_specs.items():
        print(f"Calculating marginal emissions and marginal damages for {category}")
        lifetime_emissions = 0
        lifetime_damages = 0

        for year_label in range(2024, 2024 + lifetime):
            emis_col = f'{annual_prefix}{year_label}_{category}_tons_co2e'
            damage_col = f'{annual_prefix}{year_label}_{category}_damages_climate'

            if is_baseline:
                # Baseline consumption is stored per fuel and is NOT yet HDD-adjusted,
                # so the HDD factor is applied here (heating and water heating only).
                if category in ['heating', 'waterHeating']:
                    hdd_factor = hdd_factors_per_year[year_label]
                else:
                    hdd_factor = 1.0

                # ELECTRICITY
                emis_electricity = (
                    df_copy[f'base_electricity_{category}_consumption'] *
                    hdd_factor *
                    td_losses_multiplier *
                    _grid_emission_factors(df_copy, emis_electricity_lookup, cambium_scenario, year_label)
                )

                # FOSSIL FUELS: each fuel has its own emission factor, which is why
                # consumption by fuel type (not the end-use total) is used here.
                emis_naturalGas = df_copy[f'base_naturalGas_{category}_consumption'] * hdd_factor * emis_factor_co2e_naturalGas_ton_perkWh
                emis_propane = df_copy[f'base_propane_{category}_consumption'] * hdd_factor * emis_factor_co2e_propane_ton_perkWh
                fossilFuel_emissions = emis_naturalGas.fillna(0) + emis_propane.fillna(0)

                # Cooking and clothes drying have no fuel-oil consumption column.
                if not ('cooking' in category or 'clothesDrying' in category):
                    emis_fuelOil = df_copy[f'base_fuelOil_{category}_consumption'] * hdd_factor * emis_factor_co2e_fuelOil_ton_perkWh
                    fossilFuel_emissions = fossilFuel_emissions + emis_fuelOil.fillna(0)

                total_emissions = fossilFuel_emissions + emis_electricity

            else:
                # Under any scenario other than 'No Inflation Reduction Act', 2024 uses the
                # Cambium 2022 lookup and every later year the Cambium 2023 lookup.
                year_lookup = emis_electricity_lookup
                if policy_scenario != 'No Inflation Reduction Act':
                    if year_label == 2024:
                        year_lookup = emis_IRA_2024_cambium22_lookup
                    else:
                        year_lookup = emis_IRA_2025_2050_cambium23_lookup

                # POST-RETROFIT (MP) consumption already includes the HDD projection adjustment.
                # Only grid electricity is counted in this branch.
                # NOTE(review): presumably the measure packages fully electrify the end use -- confirm.
                total_emissions = (
                    df_copy[f'mp{menu_mp}_{year_label}_{category}_consumption'] *
                    td_losses_multiplier *
                    _grid_emission_factors(df_copy, year_lookup, cambium_scenario, year_label)
                )

            total_damages = total_emissions * epa_scc_usd2023_per_ton

            new_columns_data[emis_col] = np.round(total_emissions, 2)
            new_columns_data[damage_col] = np.round(total_damages, 2)

            # Accumulate the unrounded emissions and damages
            lifetime_emissions += total_emissions
            lifetime_damages += total_damages

        # Lifetime Emissions and Damages (current scenario equipment)
        lifetime_emissions_col = f'{scenario_prefix}{category}_lifetime_tons_co2e'
        lifetime_damages_col = f'{scenario_prefix}{category}_lifetime_damages_climate'
        rounded_lifetime_emissions = np.round(lifetime_emissions, 2)
        rounded_lifetime_damages = np.round(lifetime_damages, 2)

        new_columns_data[lifetime_emissions_col] = rounded_lifetime_emissions
        new_columns_data[lifetime_damages_col] = rounded_lifetime_damages

        # Summary Dataframe with main model results
        df_summary[lifetime_emissions_col] = rounded_lifetime_emissions
        df_summary[lifetime_damages_col] = rounded_lifetime_damages

        if not is_baseline:
            # Avoided Emissions and Damages (reductions from the baseline lifetime totals)
            avoided_emissions_col = f'{scenario_prefix}{category}_avoided_tons_co2e'
            avoided_damages_col = f'{scenario_prefix}{category}_avoided_damages_climate'
            avoided_emissions = np.round(df_copy[f'baseline_{category}_lifetime_tons_co2e'] - rounded_lifetime_emissions, 2)
            avoided_damages = np.round(df_copy[f'baseline_{category}_lifetime_damages_climate'] - rounded_lifetime_damages, 2)

            new_columns_data[avoided_emissions_col] = avoided_emissions
            new_columns_data[avoided_damages_col] = avoided_damages

            df_summary[avoided_emissions_col] = avoided_emissions
            df_summary[avoided_damages_col] = avoided_damages

    # df_summary was updated in place above; only the new columns are returned.
    return pd.DataFrame(new_columns_data, index=df_copy.index)

In [50]:
import pandas as pd

# Define function to create a fuel price lookup dictionary without policy_scenario from row
def create_fuel_price_lookup(df, policy_scenario):
    """Build a nested fuel price lookup from a wide price table.

    ``df`` needs the columns 'location_map', 'fuel_type' and one
    '{year}_fuelPrice_perkWh' column for every year from 2022 through 2050.

    Returns a dict shaped as
    {location: {fuel_type: {policy_scenario: {year: price}}}},
    where ``policy_scenario`` is the argument, not a value read from the row.
    """
    price_columns = {year: f"{year}_fuelPrice_perkWh" for year in range(2022, 2051)}

    lookup_dict = {}
    for _, record in df.iterrows():
        # Walk (and create on demand) the location -> fuel -> scenario levels.
        yearly_prices = (
            lookup_dict
            .setdefault(record['location_map'], {})
            .setdefault(record['fuel_type'], {})
            .setdefault(policy_scenario, {})
        )
        for year, column_name in price_columns.items():
            yearly_prices[year] = record[column_name]

    return lookup_dict

# Define function to project future prices with fallback to 'National'
def project_future_prices(row, factor_dict, policy_scenario):
    """Project the 2022 fuel price of one row through 2050 using yearly factors.

    Parameters:
    - row: Mapping / Series with 'census_division', 'fuel_type' and '2022_fuelPrice_perkWh'.
    - factor_dict: {(location, fuel, policy_scenario): {year: factor}}. When the row's
      census division has no (or an empty) entry, the 'National' entry is used instead.
    - policy_scenario: Scenario name used as the third element of the lookup key.

    Returns:
    - pd.Series indexed by '{year}_fuelPrice_perkWh' for every year (2022-2050) that has
      a factor; years without a factor are skipped. An empty Series is returned when
      neither the division nor 'National' has factors.

    Progress is printed for every step, including one line per projected year.
    """
    loc = row['census_division']
    fuel = row['fuel_type']
    price_2022 = row['2022_fuelPrice_perkWh']

    print(f"\nProcessing location: {loc}, fuel: {fuel}, policy_scenario: {policy_scenario}")
    print(f"Initial price for 2022: {price_2022}")

    # First, try to fetch the projection factors for the specific region
    factors_region = loc
    projection_factors = factor_dict.get((loc, fuel, policy_scenario))

    # If no factors are found for the specific region, default to 'National'
    if not projection_factors:
        print(f"No projection factors found for {loc}, {fuel}, {policy_scenario}. Defaulting to 'National'.")
        # Remember which table is actually in use so the log lines below name it correctly.
        factors_region = 'National'
        projection_factors = factor_dict.get(('National', fuel, policy_scenario))

    if not projection_factors:
        print(f"No projection factors found for 'National', {fuel}, {policy_scenario} either. Cannot project future prices.")
        # Explicit dtype avoids the empty-Series default-dtype warning on older pandas.
        return pd.Series(dtype=object)

    print(f"Using projection factors for {factors_region}, {fuel}, {policy_scenario}: {projection_factors}")

    future_prices = {}
    for year in range(2022, 2051):
        if year in projection_factors:
            factor = projection_factors[year]
            future_price = price_2022 * factor
            future_prices[f'{year}_fuelPrice_perkWh'] = future_price
            print(f"Year: {year}, Factor: {factor}, Future Price: {future_price}")
        else:
            print(f"Missing factor for year {year} in {factors_region}, {fuel}, {policy_scenario}. Skipping this year.")

    return pd.Series(future_prices)

In [51]:
# LAST UPDATED SEPTEMBER 5, 2024 @ 9:37 PM
def calculate_annual_fuelCost(df, menu_mp, policy_scenario, drop_fuel_cost_columns):
    """
    Calculate annual fuel costs for the baseline (menu_mp == 0) or for a retrofit
    measure package (menu_mp != 0), one column per year of equipment lifetime.

    Parameters:
    df (pd.DataFrame): Must contain 'state', 'census_division' and the yearly
        consumption columns. For the baseline it also needs 'base_{category}_fuel';
        for a retrofit it needs the 'baseline_{year}_{category}_fuelCost' columns
        produced by a previous baseline call.
    menu_mp (int): Measure package identifier; 0 selects the baseline calculation.
    policy_scenario (str): Name of EIA AEO policy_scenario used to project fuel prices
        ('No Inflation Reduction Act' or 'AEO2023 Reference Case').
    drop_fuel_cost_columns (bool): Only consulted when menu_mp != 0. When True, every
        pre-existing column whose name contains '_fuelCost' but not '_savings_fuelCost'
        is removed from the copy before the new columns are joined.

    Returns:
    pd.DataFrame: Copy of df with the new fuel cost (and, for retrofits, savings)
        columns. Existing columns with the same names are replaced.

    Raises:
    ValueError: menu_mp != 0 and policy_scenario is not one of the two accepted names.
    KeyError: menu_mp != 0 and the baseline fuel cost columns are not in df.
    """
    df_copy = df.copy()

    # Determine the scenario prefix and fuel price lookup based on menu_mp and policy_scenario.
    # The lookups are module-level tables loaded elsewhere in the notebook.
    if menu_mp == 0:
        scenario_prefix = "baseline_"
        fuel_price_lookup = preIRA_fuel_price_lookup
    elif policy_scenario == 'No Inflation Reduction Act':
        scenario_prefix = f"preIRA_mp{menu_mp}_"
        fuel_price_lookup = preIRA_fuel_price_lookup
    elif policy_scenario == 'AEO2023 Reference Case':
        scenario_prefix = f"iraRef_mp{menu_mp}_"
        fuel_price_lookup = iraRef_fuel_price_lookup
    else:
        raise ValueError("Invalid Policy policy_scenario! Please choose from 'No Inflation Reduction Act' or 'AEO2023 Reference Case'.")

    # Fuel type mapping and equipment lifetime specifications
    fuel_mapping = {'Electricity': 'electricity', 'Natural Gas': 'naturalGas', 'Fuel Oil': 'fuelOil', 'Propane': 'propane'}
    equipment_specs = {'heating': 15}#, 'waterHeating': 12, 'clothesDrying': 13, 'cooking': 15}
    first_year = 2024

    # Fuels priced at state resolution; every other fuel is priced by census division.
    state_priced_fuels = ('electricity', 'naturalGas')

    def annual_price(location, fuel, year_label):
        # Any missing level of the nested lookup yields a price of 0.
        return (
            fuel_price_lookup.get(location, {})
            .get(fuel, {})
            .get(policy_scenario, {})
            .get(year_label, 0)
        )

    # Collect new columns in a dictionary and attach them in one join at the end
    new_columns = {}

    if menu_mp == 0:
        for category in equipment_specs:
            df_copy[f'fuel_type_{category}'] = df_copy[f'base_{category}_fuel'].map(fuel_mapping)

        for category, lifetime in equipment_specs.items():
            last_year = first_year + lifetime - 1
            print(f"Calculating BASELINE (no retrofit) fuel costs from {first_year} to {last_year} for {category}")
            fuel_column = f'fuel_type_{category}'
            for year_label in range(first_year, last_year + 1):
                consumption_column = f'baseline_{year_label}_{category}_consumption'

                new_columns[f'{scenario_prefix}{year_label}_{category}_fuelCost'] = df_copy.apply(
                    lambda row: round(
                        row[consumption_column] * annual_price(
                            row['state'] if row[fuel_column] in state_priced_fuels else row['census_division'],
                            row[fuel_column],
                            year_label,
                        ),
                        2,
                    ),
                    axis=1
                )

    else:
        # Savings are computed against the baseline costs, so fail early and clearly
        # when they are absent instead of raising a bare KeyError mid-loop.
        required_baseline = [
            f'baseline_{year_label}_{category}_fuelCost'
            for category, lifetime in equipment_specs.items()
            for year_label in range(first_year, first_year + lifetime)
        ]
        missing_baseline = [column for column in required_baseline if column not in df_copy.columns]
        if missing_baseline:
            raise KeyError(
                "Baseline fuel cost columns are missing; run calculate_annual_fuelCost with menu_mp=0 "
                f"first and keep its fuel cost columns: {missing_baseline}"
            )

        for category, lifetime in equipment_specs.items():
            last_year = first_year + lifetime - 1
            print(f"Calculating POST-RETROFIT (MP{menu_mp}) fuel costs from {first_year} to {last_year} for {category}")
            for year_label in range(first_year, last_year + 1):
                consumption_column = f'mp{menu_mp}_{year_label}_{category}_consumption'

                # Retrofit consumption is always priced with the state electricity price
                fuel_costs = df_copy.apply(
                    lambda row: round(
                        row[consumption_column] * annual_price(row['state'], 'electricity', year_label),
                        2,
                    ),
                    axis=1
                )

                new_columns[f'{scenario_prefix}{year_label}_{category}_fuelCost'] = fuel_costs
                new_columns[f'{scenario_prefix}{year_label}_{category}_savings_fuelCost'] = (
                    df_copy[f'baseline_{year_label}_{category}_fuelCost'] - fuel_costs
                )

        # Only drop once the savings above have been computed from the baseline columns.
        # NOTE(review): this removes columns already present in df_copy only; the retrofit
        # '_fuelCost' columns computed in this call are still joined back below, although the
        # message says retrofit costs are dropped too. Confirm which behavior is intended.
        if drop_fuel_cost_columns:
            print("Dropping Annual Fuel Costs for Baseline Scenario and Retrofit. Storing Fuel Savings for Private NPV Calculation.")
            fuel_cost_columns = [col for col in df_copy.columns if '_fuelCost' in col and '_savings_fuelCost' not in col]
            df_copy = df_copy.drop(columns=fuel_cost_columns)

    # Build the new columns on df_copy's index
    df_new_columns = pd.DataFrame(new_columns, index=df_copy.index)

    # Replace any existing columns that are being recomputed so the join cannot collide
    overlapping_columns = df_new_columns.columns.intersection(df_copy.columns)
    if not overlapping_columns.empty:
        df_copy = df_copy.drop(columns=overlapping_columns)

    return df_copy.join(df_new_columns, how='left')

In [None]:
def load_df_county_medianIncome(project_root, cpi_ratio_2023_2022):
    """
    Load the county-resolution median income table and add a USD2023 column.

    Parameters:
    project_root (str): Directory that contains the 'equity_data' folder.
    cpi_ratio_2023_2022 (float): Multiplier applied to 'median_income_USD2022'.

    Returns:
    pd.DataFrame: The six selected columns under their renamed labels plus
        'median_income_USD2023' (rounded to 2 decimals).
    """
    filename = "nhgis0005_ds261_2022_county.csv"
    file_path = os.path.join(project_root, os.path.join(r"equity_data", filename))

    for message in (
        f"Retrieved data for filename: {filename}",
        f"Located at filepath: {file_path}",
        "\n",
    ):
        print(message)

    # Source column -> output column; key order fixes the output column order
    column_labels = {
        "GISJOIN": "gis_joinID_county",
        "STUSAB": "state_abbrev",
        "COUNTYA": "county_code",
        "NAME_E": "name_estimate",
        "AP2PE001": "median_income_USD2022",
        "AP2PM001": "median_income_USD2022_marginOfError",
    }

    raw_table = pd.read_csv(file_path, encoding='ISO-8859-1').reset_index(drop=True)
    county_income = raw_table[list(column_labels)].rename(columns=column_labels)

    income_2023 = county_income['median_income_USD2022'] * cpi_ratio_2023_2022
    county_income['median_income_USD2023'] = round(income_2023, 2)
    return county_income

# Retrofit Packages

## Basic Retrofit

In [55]:
def df_enduse_compare(df_mp, input_mp, menu_mp, df_baseline):
    """
    Build a per-building comparison table for a measure package and merge it
    onto the baseline table.

    Parameters:
    df_mp (pd.DataFrame): Measure package results; provides the 'in.*',
        'upgrade.*' and 'out.*' columns read below plus 'bldg_id'.
    input_mp (str): Upgrade identifier. 'upgrade09' forces menu_mp to 9 and adds
        the basic enclosure columns; 'upgrade10' forces menu_mp to 10 and adds the
        basic and enhanced enclosure columns. Any other value keeps menu_mp as given.
    menu_mp (int): Measure package number used in the 'mp{menu_mp}_*' column names.
    df_baseline (pd.DataFrame): Baseline table containing 'bldg_id'.

    Returns:
    pd.DataFrame: Inner merge of df_baseline and the comparison columns on 'bldg_id'.
    """
    # Output column -> source column in df_mp (shared by every measure package)
    shared_columns = {
        'bldg_id': 'bldg_id',
        'hvac_has_ducts': 'in.hvac_has_ducts',
        'baseline_heating_type': 'in.hvac_heating_type_and_fuel',
        'hvac_heating_efficiency': 'in.hvac_heating_efficiency',
        'hvac_heating_type_and_fuel': 'in.hvac_heating_type_and_fuel',
        'size_heat_pump_backup_primary_k_btu_h': 'out.params.size_heat_pump_backup_primary_k_btu_h',
        'size_heating_system_primary_k_btu_h': 'out.params.size_heating_system_primary_k_btu_h',
        'size_heating_system_secondary_k_btu_h': 'out.params.size_heating_system_secondary_k_btu_h',
        'upgrade_hvac_heating_efficiency': 'upgrade.hvac_heating_efficiency',
        'water_heater_efficiency': 'in.water_heater_efficiency',
        'water_heater_fuel': 'in.water_heater_fuel',
        'water_heater_in_unit': 'in.water_heater_in_unit',
        'size_water_heater_gal': 'out.params.size_water_heater_gal',
        'upgrade_water_heater_efficiency': 'upgrade.water_heater_efficiency',
        'clothes_dryer_in_unit': 'in.clothes_dryer',
        'upgrade_clothes_dryer': 'upgrade.clothes_dryer',
    }
    compare_data = {target: df_mp[source] for target, source in shared_columns.items()}

    # NOTE(review): the cooking columns are read from the notebook-level df_euss_am_mp7,
    # not from df_mp, so they are aligned to df_mp by index. Confirm this is intentional.
    compare_data['cooking_range_in_unit'] = df_euss_am_mp7['in.cooking_range']
    compare_data['upgrade_cooking_range'] = df_euss_am_mp7['upgrade.cooking_range']
    df_compare = pd.DataFrame(compare_data)

    # Measure Package 1: Basic Enclosure Package
    basic_enclosure_columns = {
        # Attic floor insulation (upgrade.insulation_ceiling)
        'base_insulation_atticFloor': 'in.insulation_ceiling',
        'upgrade_insulation_atticFloor': 'upgrade.insulation_ceiling',
        'out_params_floor_area_attic_ft_2': 'out.params.floor_area_attic_ft_2',
        # Air leakage reduction (upgrade.infiltration_reduction == '30%')
        'upgrade_infiltration_reduction': 'upgrade.infiltration_reduction',
        # Duct sealing (upgrade.ducts == '10% Leakage, R-8')
        'base_ducts': 'in.ducts',
        'upgrade_duct_sealing': 'upgrade.ducts',
        'out_params_duct_unconditioned_surface_area_ft_2': 'out.params.duct_unconditioned_surface_area_ft_2',
        # Drill-and-fill wall insulation (upgrade.insulation_wall == 'Wood Stud, R-13')
        'base_insulation_wall': 'in.insulation_wall',
        'upgrade_insulation_wall': 'upgrade.insulation_wall',
        'out_params_wall_area_above_grade_exterior_ft_2': 'out.params.wall_area_above_grade_exterior_ft_2',
    }

    # Measure Package 2: Enhanced Enclosure Package
    enhanced_enclosure_columns = {
        # Foundation wall insulation and rim joist insulation
        'base_foundation_type': 'in.geometry_foundation_type',
        'base_insulation_foundation_wall': 'in.insulation_foundation_wall',
        'base_insulation_rim_joist': 'in.insulation_rim_joist',
        # Only upgrade column for foundation wall insulation, but we will assume
        # technical documentation and modeling consistent
        'upgrade_insulation_foundation_wall': 'upgrade.insulation_foundation_wall',
        'out_params_floor_area_foundation_ft_2': 'out.params.floor_area_foundation_ft_2',
        'out_params_rim_joist_area_above_grade_exterior_ft_2': 'out.params.rim_joist_area_above_grade_exterior_ft_2',
        # Seal Vented Crawl Space
        'upgrade_seal_crawlspace': 'upgrade.geometry_foundation_type',
        # Insulate finished attics and cathedral ceilings
        'base_insulation_roof': 'in.insulation_roof',
        'upgrade_insulation_roof': 'upgrade.insulation_roof',
        'out_params_roof_area_ft_2': 'out.params.roof_area_ft_2',
    }

    if input_mp == 'upgrade09':
        # MP9 = MP8 (Electrification, High Efficiency) + MP1 (Basic Enclosure)
        menu_mp = 9
        enclosure_columns = basic_enclosure_columns
    elif input_mp == 'upgrade10':
        # MP10 = MP8 (Electrification, High Efficiency) + MP2 (Enhanced Enclosure)
        menu_mp = 10
        enclosure_columns = {**basic_enclosure_columns, **enhanced_enclosure_columns}
    else:
        enclosure_columns = {}

    # Heating consumption first, then any enclosure columns, then the other end uses
    df_compare[f'mp{menu_mp}_heating_consumption'] = df_mp['out.electricity.heating.energy_consumption.kwh'].round(2)
    for target, source in enclosure_columns.items():
        df_compare[target] = df_mp[source]

    df_compare[f'mp{menu_mp}_waterHeating_consumption'] = df_mp['out.electricity.hot_water.energy_consumption.kwh'].round(2)
    df_compare[f'mp{menu_mp}_clothesDrying_consumption'] = df_mp['out.electricity.clothes_dryer.energy_consumption.kwh'].round(2)
    df_compare[f'mp{menu_mp}_cooking_consumption'] = df_euss_am_mp7['out.electricity.range_oven.energy_consumption.kwh'].round(2)

    # Merge dataframes on bldg id column so everything is lined up
    return pd.merge(df_baseline, df_compare, how='inner', on='bldg_id')

In [None]:
# import pandas as pd
# import numpy as np

# def summarize_stats_table(df, category, data_columns, column_name_mapping, number_formatting, include_zero=True):
#     """
#     Generate a formatted summary statistics table for specified columns in a DataFrame, grouped by 'base_fuel' and 'lowModerateIncome_designation'.

#     Parameters:
#     - df (DataFrame): The input DataFrame from which to compute statistics.
#     - data_columns (list of str): The columns to include in the summary statistics.
#     - column_name_mapping (dict): A dictionary to rename the columns in the summary statistics output.
#     - number_formatting (str): The format string to use for numeric values in the output.
#     - include_zero (bool, optional): Whether to include zero values in the statistics. Defaults to True.
#       If False, zeros are replaced with NaN, which are then ignored in the computations.

#     Returns:
#     - DataFrame: A DataFrame containing the summary statistics, with formatted numeric values
#       and renamed columns according to the input specifications, grouped by 'base_fuel' and 'lowModerateIncome_designation'.
#     """

#     # Ensure 'lowModerateIncome_designation' is treated as a categorical variable with a specific order
#     income_categories = ['Low-Income', 'Moderate-Income', 'Middle-to-Upper-Income']
#     df['lowModerateIncome_designation'] = pd.Categorical(df['lowModerateIncome_designation'], categories=income_categories, ordered=True)

#     # Filter out the 'Middle-to-Upper-Income' rows if needed (similar to your earlier function)
#     df_filtered = df[df['lowModerateIncome_designation'] != 'Middle-to-Upper-Income']

#     # Replace 0 values with NaN in the selected columns if include_zero is set to False
#     if not include_zero:
#         df_filtered[data_columns] = df_filtered[data_columns].replace(0, np.nan)

#     # Group by 'base_fuel' and 'lowModerateIncome_designation' and calculate summary statistics
#     summary_stats = df_filtered.groupby(by=[f'base_{category}_fuel', 'lowModerateIncome_designation'], observed=False)[data_columns].describe().unstack()

#     # # Apply formatting to each number in these statistics according to the given format
#     # summary_stats = summary_stats.applymap(lambda x: f"{x:{number_formatting}}" if pd.notnull(x) else "")

#     # Rename the columns in the summary statistics DataFrame according to the provided mapping
#     summary_stats.rename(columns=column_name_mapping, inplace=True)

#     return summary_stats

# # Example usage of the function:
# # Assume 'df' is a DataFrame with relevant data and columns:
# df_multiIndex_summary = summarize_stats_table(df_basic_summary_heating, category='heating', data_columns=['iraRef_heating_usd2023_per_mtCO2e'], 
#                                                column_name_mapping={'iraRef_heating_usd2023_per_mtCO2e': 'CO2 Abatement Cost (USD/mtCO2e)'},
#                                                number_formatting=".2f", include_zero=True)
# df_multiIndex_summary

In [56]:
def summarize_stats_table(df, data_columns, column_name_mapping, number_formatting, include_zero=True):
    """
    Build a table of formatted `describe()` statistics for the chosen columns.

    Parameters:
    - df (DataFrame): Input data; it is copied, never modified.
    - data_columns (list of str): Columns to summarize.
    - column_name_mapping (dict): Old name -> display name for the output columns.
    - number_formatting (str): Format spec applied to every statistic (e.g. ".2f").
    - include_zero (bool, optional): Defaults to True. When False, zeros in
      data_columns are replaced with NaN before the statistics are computed.

    Returns:
    - DataFrame: One row per `describe()` statistic, one column per data column
      (renamed via column_name_mapping), with every value rendered as a string.
    """
    working = df.copy()

    if not include_zero:
        working[data_columns] = working[data_columns].replace(0, np.nan)

    def as_text(value):
        return f"{value:{number_formatting}}"

    # describe() yields the statistics as rows; format them column by column
    statistics = working[data_columns].describe()
    formatted = statistics.apply(lambda column: column.map(as_text))

    return formatted.rename(columns=column_name_mapping)

In [57]:
# # LAST UPDATE AUGUST 21, 2024 at 10:54 PM
# def calculate_public_npv(df, df_damages, menu_mp, policy_scenario, interest_rate=0.02):
#     """
#     Calculate the public Net Present Value (NPV) for specific categories of damages,
#     considering different policy scenarios related to grid decarbonization.

#     Parameters:
#     - df (DataFrame): A pandas DataFrame containing the relevant data.
#     - menu_mp (str): Menu identifier used in column names.
#     - policy_scenario (str): Policy policy_scenario that determines electricity grid projections. 
#                              Accepted values: 'AEO2023 Reference Case', 'High Uptake of Inflation Reduction Act'.
#     - interest_rate (float): The discount rate used in the NPV calculation. Default is 2% for Social Discount Rate.

#     Returns:
#     - DataFrame: The input DataFrame with additional columns containing the calculated public NPVs for each category.
#     """
#     equipment_specs = {
#         'heating': 15,
#         'waterHeating': 12,
#         'clothesDrying': 13,
#         'cooking': 15
#     }
    
#     df_copy = df.copy()
#     df_damages_copy = df_damages.copy()

#     # Calculate the lifetime damages and corresponding NPV based on the policy policy_scenario
#     df_new_columns = calculate_lifetime_damages_grid_scenario(df_copy, df_damages_copy, menu_mp, equipment_specs, policy_scenario, interest_rate)

#     # Drop any overlapping columns from df_copy
#     overlapping_columns = df_new_columns.columns.intersection(df_copy.columns)
#     if not overlapping_columns.empty:
#         df_copy.drop(columns=overlapping_columns, inplace=True)

#     # Merge new columns into the original DataFrame
#     df_copy = df_copy.join(df_new_columns, how='left')

#     return df_copy

# def calculate_lifetime_damages_grid_scenario(df_copy, df_damages_copy, menu_mp, equipment_specs, policy_scenario, interest_rate):
#     """
#     Calculate the NPV of climate, health, and public damages over the equipment's lifetime
#     under different grid decarbonization scenarios.

#     Parameters:
#     - df_copy (DataFrame): A copy of the original DataFrame to store NPV calculations.
#     - menu_mp (str): Menu identifier used in column names.
#     - equipment_specs (dict): Dictionary containing lifetimes for each equipment category.
#     - policy_scenario (str): Specifies the grid policy_scenario ('No Inflation Reduction Act', 'AEO2023 Reference Case', 'High Uptake of Inflation Reduction Act').
#     - interest_rate (float): Discount rate for NPV calculation.

#     Returns:
#     - DataFrame: A DataFrame containing the calculated NPV values for each category.
#     """
#     # Determine the policy_scenario prefix based on the policy policy_scenario
#     if policy_scenario == 'No Inflation Reduction Act':
#         scenario_prefix = f"preIRA_mp{menu_mp}_"
#     elif policy_scenario == 'AEO2023 Reference Case':
#         scenario_prefix = f"iraRef_mp{menu_mp}_"
#     elif policy_scenario == 'High Uptake of Inflation Reduction Act':
#         scenario_prefix = f"iraHigh_mp{menu_mp}_"
#     else:
#         raise ValueError("Invalid Policy policy_scenario! Please choose from 'No Inflation Reduction Act', 'AEO2023 Reference Case', or 'High Uptake of Inflation Reduction Act'.")
    
#     # Create a DataFrame to hold the NPV calculations
#     npv_columns = {}
    
#     for category, lifetime in equipment_specs.items():
#         print(f"""\nCalculating Public NPV for {category}...
#               lifetime: {lifetime}, interest_rate: {interest_rate}, policy_scenario: {policy_scenario}""")
#         # Initialize NPV columns for each category
#         climate_npv_key = f'{scenario_prefix}{category}_climate_npv'
#         health_npv_key = f'{scenario_prefix}{category}_health_npv'
#         public_npv_key = f'{scenario_prefix}{category}_public_npv'
        
#         # Initialize NPV columns in the dictionary if they don't exist
#         npv_columns[climate_npv_key] = npv_columns.get(climate_npv_key, 0)
#         npv_columns[health_npv_key] = npv_columns.get(health_npv_key, 0)
#         npv_columns[public_npv_key] = npv_columns.get(public_npv_key, 0)
            
#         for year in range(1, lifetime + 1):
#             year_label = year + 2021
            
#             base_climate = df_damages_copy[f'baseline_{year_label}_{category}_damages_climate']
#             base_health = df_damages_copy[f'baseline_{year_label}_{category}_damages_health']
            
#             retrofit_climate = df_damages_copy[f'{scenario_prefix}{year_label}_{category}_damages_climate']
#             retrofit_health = df_damages_copy[f'{scenario_prefix}{year_label}_{category}_damages_health']
            
#             base_damages = base_climate + base_health
#             retrofit_damages = retrofit_climate + retrofit_health
            
#             # Apply the discount factor to each year's damages
#             discount_factor = 1 / ((1 + interest_rate) ** year)
                
#             npv_columns[climate_npv_key] += ((base_climate - retrofit_climate) * discount_factor).round(2)
#             npv_columns[health_npv_key] += ((base_health - retrofit_health) * discount_factor).round(2)
#             npv_columns[public_npv_key] += ((base_damages - retrofit_damages) * discount_factor).round(2)
    
#     # Convert the dictionary to a DataFrame and return it
#     npv_df = pd.DataFrame(npv_columns, index=df_copy.index)
#     return npv_df

In [None]:
# LAST UPDATED SEPTEMBER 5, 2024 @ 9:27 PM
def calculate_public_npv(df, df_damages, menu_mp, policy_scenario, interest_rate=0.02):
    """
    Attach the public NPV columns computed by
    calculate_lifetime_damages_grid_scenario to a copy of df.

    Parameters:
    - df (DataFrame): Table that receives the NPV columns (not modified).
    - df_damages (DataFrame): Table holding the yearly damages columns (not modified).
    - menu_mp: Measure package identifier used in column names.
    - policy_scenario (str): Forwarded to the helper, which accepts
      'No Inflation Reduction Act' or 'AEO2023 Reference Case'.
    - interest_rate (float): Discount rate for the NPV calculation. Default is 0.02.

    Returns:
    - DataFrame: Copy of df with the NPV columns; existing columns with the same
      names are replaced.
    """
    # Only heating is evaluated; lifetime in years
    equipment_specs = {
        'heating': 15,
        # 'waterHeating': 12,
        # 'clothesDrying': 13,
        # 'cooking': 15
    }

    result = df.copy()

    npv_columns = calculate_lifetime_damages_grid_scenario(
        result, df_damages.copy(), menu_mp, equipment_specs, policy_scenario, interest_rate
    )

    # Remove stale versions of the columns about to be joined
    stale_columns = npv_columns.columns.intersection(result.columns)
    if not stale_columns.empty:
        result = result.drop(columns=stale_columns)

    return result.join(npv_columns, how='left')

def calculate_lifetime_damages_grid_scenario(df_copy, df_damages_copy, menu_mp, equipment_specs, policy_scenario, interest_rate):
    """
    Sum the discounted yearly reduction in climate damages (baseline minus
    retrofit) over each equipment category's lifetime.

    Parameters:
    - df_copy (DataFrame): Only its index is used, as the index of the result.
    - df_damages_copy (DataFrame): Holds 'baseline_{year}_{category}_damages_climate'
      and '{prefix}{year}_{category}_damages_climate' columns.
    - menu_mp: Measure package identifier used in column names.
    - equipment_specs (dict): Category -> lifetime in years; years start at 2024.
    - policy_scenario (str): 'No Inflation Reduction Act' (prefix 'preIRA_mp{menu_mp}_')
      or 'AEO2023 Reference Case' (prefix 'iraRef_mp{menu_mp}_').
    - interest_rate (float): Discount rate for NPV calculation.

    Returns:
    - DataFrame: One '{prefix}{category}_public_npv' column per category.

    Raises:
    - ValueError: policy_scenario is not one of the two accepted names.
    """
    scenario_tags = (
        ('No Inflation Reduction Act', 'preIRA'),
        ('AEO2023 Reference Case', 'iraRef'),
    )
    scenario_tag = next((tag for name, tag in scenario_tags if policy_scenario == name), None)
    if scenario_tag is None:
        raise ValueError("Invalid Policy policy_scenario! Please choose from 'No Inflation Reduction Act' or 'AEO2023 Reference Case'.")
    scenario_prefix = f"{scenario_tag}_mp{menu_mp}_"

    npv_by_column = {}

    for category, lifetime in equipment_specs.items():
        print(f"""\nCalculating Public NPV for {category}...
              lifetime: {lifetime}, interest_rate: {interest_rate}, policy_scenario: {policy_scenario}""")

        running_npv = 0
        for year in range(1, lifetime + 1):
            calendar_year = 2023 + year

            baseline = df_damages_copy[f'baseline_{calendar_year}_{category}_damages_climate']
            retrofit = df_damages_copy[f'{scenario_prefix}{calendar_year}_{category}_damages_climate']

            # Each year's avoided damages are discounted, rounded to cents, then accumulated
            discount_factor = 1 / ((1 + interest_rate) ** year)
            running_npv = running_npv + ((baseline - retrofit) * discount_factor).round(2)

        npv_by_column[f'{scenario_prefix}{category}_public_npv'] = running_npv

    return pd.DataFrame(npv_by_column, index=df_copy.index)

In [15]:
# Use CCI to adjust for cost differences when compared to the national average
# Function to map city to its average cost
def map_average_cost(city):
    """
    Return the average cost for a city from the notebook-level average_cost_map.

    Any city that is not a key of the map (including 'Not in a census Place' and
    'In another census Place') receives the '+30 City Average' entry.
    """
    if city in average_cost_map:
        return average_cost_map[city]
    return average_cost_map.get('+30 City Average')

In [16]:
def obtain_heating_system_specs(df):
    """
    Derive heating-system size, type and efficiency columns on df (in place).

    Adds 'total_heating_load_kBtuh', 'baseline_AFUE', 'baseline_SEER' and
    'baseline_HSPF', and overwrites 'baseline_heating_type' with the text that
    follows the fuel prefix. Values that do not match a pattern become NaN.

    Parameters:
    df (pd.DataFrame): Must contain the three 'size_*_k_btu_h' columns and
        'baseline_heating_type'; 'hvac_heating_efficiency' is also read.

    Returns:
    pd.DataFrame: The same df object that was passed in.

    Raises:
    ValueError: One of the four checked columns is missing.
    """
    required_columns = (
        'size_heating_system_primary_k_btu_h',
        'size_heat_pump_backup_primary_k_btu_h',
        'size_heating_system_secondary_k_btu_h',
        'baseline_heating_type',
    )
    if any(name not in df.columns for name in required_columns):
        raise ValueError("DataFrame does not contain all necessary columns.")

    # Total heating load in kBtuh: primary + heat pump backup + secondary
    primary_size = df['size_heating_system_primary_k_btu_h']
    backup_size = df['size_heat_pump_backup_primary_k_btu_h']
    secondary_size = df['size_heating_system_secondary_k_btu_h']
    df['total_heating_load_kBtuh'] = primary_size + backup_size + secondary_size

    # Strip the leading fuel name so only the heating type remains
    fuel_prefix_pattern = r'^(?:\d+\s+)?(?:Natural Gas|Electricity|Propane|Fuel Oil|Fuel)\s+(?:Fuel\s+)?(?:Electric\s+)?(.+)$'
    df['baseline_heating_type'] = df['baseline_heating_type'].str.extract(fuel_prefix_pattern, expand=False)

    # Efficiency ratings of the existing, baseline equipment (Replacement Costs)
    efficiency_text = df['hvac_heating_efficiency']
    rating_patterns = {
        'baseline_AFUE': r'([\d.]+)%',
        'baseline_SEER': r'SEER ([\d.]+)',
        'baseline_HSPF': r'([\d.]+) HSPF',
    }
    for rating_column, pattern in rating_patterns.items():
        df[rating_column] = efficiency_text.str.extract(pattern, expand=False).astype(float)

    return df

In [17]:
def calculate_heating_installation_premium(df, menu_mp, rsMeans_national_avg, cpi_ratio_2023_2013):
    """
    Add the heat pump installation premium column to df (in place).

    The premium is a USD2013 amount scaled by cpi_ratio_2023_2013 and by the
    regional cost index (row 'rsMeans_CCI_avg' / rsMeans_national_avg):
      - cooling type other than 'None': no premium
      - no cooling, heating type contains 'Furnace' or 'Baseboard': 400
      - no cooling, heating type contains 'Boiler': 1500
      - anything else, including a missing heating type: no premium

    Parameters:
    df (pd.DataFrame): Must contain 'hvac_cooling_type', 'heating_type' and
        'rsMeans_CCI_avg'.
    menu_mp (int): Measure package number used in the output column name.
    rsMeans_national_avg (float): National average cost index (the divisor).
    cpi_ratio_2023_2013 (float): Multiplier applied to the USD2013 premium amounts.

    Returns:
    pd.DataFrame: The same df object with 'mp{menu_mp}_heating_installation_premium'
        added (rounded to 2 decimals). The column is created even for an empty df.

    Raises:
    ValueError: A required column is missing.
    """
    necessary_columns = ['hvac_cooling_type', 'heating_type', 'rsMeans_CCI_avg']
    if not all(column in df.columns for column in necessary_columns):
        raise ValueError("DataFrame does not contain all necessary columns.")

    premiums = []
    for cooling_type, heating_type, regional_cci in zip(
        df['hvac_cooling_type'], df['heating_type'], df['rsMeans_CCI_avg']
    ):
        # A missing heating type (NaN/None) cannot be substring-tested; treat it like
        # any other unrecognized type instead of raising TypeError.
        heating_label = heating_type if isinstance(heating_type, str) else ''

        # Installation cost for homes with existing AC
        # Deetjen: Replace SEER 15, 8.5 HSPF ASHP with SEER 15, 8.5 HSPF ASHP: NREL REMDB 50th Percentile Cost is $3300 USD-2013
        if cooling_type != 'None':
            premium_cost = 0

        # Installation cost for homes without central AC, but an existing furnace or baseboard
        # Deetjen: Install SEER 15, 8.5 HSPF ASHP: NREL REMDB 50th Percentile Cost is $3700 USD-2013
        elif 'Furnace' in heating_label or 'Baseboard' in heating_label:
            premium_cost = 400 * cpi_ratio_2023_2013

        # Installation cost for homes without central AC and an existing boiler as heating system
        # Deetjen: Install SEER 15, 8.5 HSPF ASHP: NREL REMDB High Cost is $4800 USD-2013
        elif 'Boiler' in heating_label:
            premium_cost = 1500 * cpi_ratio_2023_2013

        else:
            premium_cost = 0

        # CPI adjustment applied above; regional cost index adjustment applied here
        premiums.append(round(premium_cost * (regional_cci / rsMeans_national_avg), 2))

    # Assign once, row by row in order, so duplicate index labels cannot overwrite
    # each other and an empty frame still receives the column.
    df[f'mp{menu_mp}_heating_installation_premium'] = pd.Series(premiums, index=df.index, dtype=float)

    return df

In [None]:
def load_df_county_medianIncome(project_root, cpi_ratio_2023_2022):
    """
    Load county-resolution median income data and add a USD-2023 column.

    Reads 'equity_data/nhgis0005_ds261_2022_county.csv' under `project_root`, keeps the
    identifier, name, estimate and margin-of-error columns under descriptive names, and
    adds 'median_income_USD2023' = 'median_income_USD2022' * `cpi_ratio_2023_2022`,
    rounded to 2 decimal places.

    Parameters:
    project_root (str): Directory that contains the 'equity_data' folder.
    cpi_ratio_2023_2022 (float): Multiplier converting USD-2022 to USD-2023.

    Returns:
    pd.DataFrame: One row per row of the source file, with the renamed columns.
    """
    filename = "nhgis0005_ds261_2022_county.csv"
    file_path = os.path.join(project_root, r"equity_data", filename)

    for message in (
        f"Retrieved data for filename: {filename}",
        f"Located at filepath: {file_path}",
        "\n",
    ):
        print(message)

    # Source column -> descriptive name; the key order also fixes the output column order.
    column_renames = {
        "GISJOIN": "gis_joinID_county",
        "STUSAB": "state_abbrev",
        "COUNTYA": "county_code",
        "NAME_E": "name_estimate",
        "AP2PE001": "median_income_USD2022",
        "AP2PM001": "median_income_USD2022_marginOfError",
    }

    county_income = (
        pd.read_csv(file_path, encoding='ISO-8859-1')
        .reset_index(drop=True)
        .loc[:, list(column_renames)]
        .rename(columns=column_renames)
    )

    # Inflate the 2022 estimate to 2023 dollars.
    inflated_income = county_income['median_income_USD2022'] * cpi_ratio_2023_2022
    county_income['median_income_USD2023'] = inflated_income.round(2)
    return county_income

In [18]:
# UPDATED AUGUST 22, 2024 @ 9:40 PM (~ENSURE COLS UPDATE WHEN FUNCTION RE-RUN. DROP OLD OVERLAPPING COLS~)
# Replacement Cost Function and Helper Functions (Parameters, Formula)

# Helper function to get parameters based on end use
def get_end_use_replacement_parameters(df, end_use):
    """
    Return the lookup parameters used to price a like-for-like replacement of existing equipment.

    Parameters:
    df (pd.DataFrame): Must contain 'base_heating_fuel' and 'heating_type'.
    end_use (str): End-use key. Only 'heating' is currently enabled; the other
        end-uses are kept below as commented-out entries.

    Returns:
    dict: With keys
        'conditions' - list of boolean Series, one per replacement technology, in priority order;
        'tech_eff_pairs' - list of (technology, efficiency) tuples parallel to 'conditions';
        'cost_components' - names of the cost components that make up the replacement cost.

    Raises:
    ValueError: If `end_use` is not one of the enabled end-uses.
    """
    base_fuel = df['base_heating_fuel']
    uses_electricity = base_fuel == 'Electricity'

    # Order matters: the existing-ASHP condition precedes the generic electricity
    # condition so that callers picking the first match price heat pumps as heat pumps.
    heating_parameters = {
        'conditions': [
            base_fuel == 'Propane',
            base_fuel == 'Fuel Oil',
            base_fuel == 'Natural Gas',
            uses_electricity & (df['heating_type'] == 'Electricity ASHP'),
            uses_electricity,
        ],
        'tech_eff_pairs': [
            ('Propane Furnace', '94 AFUE'),
            ('Fuel Oil Furnace', '95 AFUE'),
            ('Natural Gas Furnace', '95 AFUE'),
            ('Electric ASHP', 'SEER 18, 9.3 HSPF'),
            ('Electric Furnace', '100 AFUE'),
        ],
        'cost_components': ['unitCost', 'otherCost', 'cost_per_kBtuh'],
    }

    parameters = {
        'heating': heating_parameters,
        # 'waterHeating': {
        #     'conditions': [
        #         (df['base_waterHeating_fuel'] == 'Fuel Oil'),
        #         (df['base_waterHeating_fuel'] == 'Natural Gas'),
        #         (df['base_waterHeating_fuel'] == 'Propane'),
        #         (df['water_heater_efficiency'].isin(['Electric Standard', 'Electric Premium'])),
        #         (df['water_heater_efficiency'] == 'Electric Heat Pump, 80 gal')
        #     ],
        #     'tech_eff_pairs': [
        #         ('Fuel Oil Water Heater', 0.68),
        #         ('Natural Gas Water Heater', 0.67),
        #         ('Propane Water Heater', 0.67),
        #         ('Electric Water Heater', 0.95),
        #         ('Electric Heat Pump Water Heater, 80 gal', 2.35)
        #     ],
        #     'cost_components': ['unitCost', 'cost_per_gallon']
        # },
        # 'clothesDrying': {
        #     'conditions': [
        #         (df['base_clothesDrying_fuel'] == 'Electricity'),
        #         (df['base_clothesDrying_fuel'] == 'Natural Gas'),
        #         (df['base_clothesDrying_fuel'] == 'Propane')
        #     ],
        #     'tech_eff_pairs': [
        #         ('Electric Clothes Dryer', 3.1),
        #         ('Natural Gas Clothes Dryer', 2.75),
        #         ('Propane Clothes Dryer', 2.75)
        #     ],
        #     'cost_components': ['unitCost']
        # },
        # 'cooking': {
        #     'conditions': [
        #         (df['base_cooking_fuel'] == 'Electricity'),
        #         (df['base_cooking_fuel'] == 'Natural Gas'),
        #         (df['base_cooking_fuel'] == 'Propane')
        #     ],
        #     'tech_eff_pairs': [
        #         ('Electric Range', 0.74),
        #         ('Natural Gas Range', 0.4),
        #         ('Propane Range', 0.4)
        #     ],
        #     'cost_components': ['unitCost']
        # }
    }

    if end_use in parameters:
        return parameters[end_use]
    raise ValueError(f"Invalid end_use specified: {end_use}")

# UPDATED AUGUST 22, 2024 @ 9:40 PM (~ENSURE COLS UPDATE WHEN FUNCTION RE-RUN. DROP OLD OVERLAPPING COLS~)
def calculate_replacement_cost_per_row(df_valid, sampled_costs_dict, rsMeans_national_avg, menu_mp, end_use):
    """
    Combine sampled cost components into a regionally adjusted replacement cost per row.

    Formulas (each scaled by rsMeans_CCI_avg / rsMeans_national_avg):
        heating:       unitCost + otherCost + total_heating_load_kBtuh * cost_per_kBtuh
        waterHeating:  unitCost + cost_per_gallon * size_water_heater_gal
        anything else: unitCost

    Parameters:
    df_valid (pd.DataFrame): Rows to price; must contain 'rsMeans_CCI_avg' plus the sizing
        column required by the chosen end use.
    sampled_costs_dict (dict): Sampled cost components keyed by component name, each
        aligned positionally with the rows of `df_valid`.
    rsMeans_national_avg (float): National average value for cost adjustment.
    menu_mp (int): Measure package number; used in the output column name.
    end_use (str): 'heating', 'waterHeating', or any other end-use name.

    Returns:
    tuple: (replacement costs, 'mp{menu_mp}_{end_use}_replacementCost').
    """
    cost_column_name = f'mp{menu_mp}_{end_use}_replacementCost'
    regional_factor = df_valid['rsMeans_CCI_avg'] / rsMeans_national_avg
    unit_cost = sampled_costs_dict['unitCost']

    if end_use == 'heating':
        # Capacity-dependent part scales with the dwelling's heating load.
        capacity_cost = df_valid['total_heating_load_kBtuh'] * sampled_costs_dict['cost_per_kBtuh']
        national_cost = unit_cost + sampled_costs_dict['otherCost'] + capacity_cost
    elif end_use == 'waterHeating':
        # Tank-size-dependent part scales with the water heater volume.
        tank_cost = sampled_costs_dict['cost_per_gallon'] * df_valid['size_water_heater_gal']
        national_cost = unit_cost + tank_cost
    else:
        national_cost = unit_cost

    return national_cost * regional_factor, cost_column_name

# UPDATED AUGUST 22, 2024 @ 9:40 PM (~ENSURE COLS UPDATE WHEN FUNCTION RE-RUN. DROP OLD OVERLAPPING COLS~)
def calculate_replacement_cost(df, cost_dict, rsMeans_national_avg, menu_mp, end_use):
    """
    General function to calculate replacement costs for various end-uses based on fuel types, costs, and efficiency.

    Each row is matched to the first applicable (technology, efficiency) pair returned by
    `get_end_use_replacement_parameters`. For every cost component, the progressive,
    reference and conservative values in `cost_dict` are treated as the 10th, 50th and
    90th percentiles of a normal distribution, from which one value is sampled per row.
    Rows that match no pair receive NaN in the output column.

    Sampling uses NumPy's global random state (`np.random.normal`), so results change
    between runs unless the caller seeds it (e.g. `np.random.seed(...)`) beforehand.

    Parameters:
    df (pd.DataFrame): DataFrame containing data for different scenarios. It is not modified.
    cost_dict (dict): Maps (technology, efficiency) tuples to a dict with keys
        '{component}_progressive', '{component}_reference' and '{component}_conservative'.
    rsMeans_national_avg (float): National average value for cost adjustment.
    menu_mp (int): Measure package number; must be 7, 8, 9 or 10.
    end_use (str): Type of end-use to calculate replacement cost for ('heating', 'waterHeating', 'clothesDrying', 'cooking').

    Returns:
    pd.DataFrame: New DataFrame with the replacement cost column added. Any existing
    column of the same name is replaced, so the function can be re-run safely.

    Raises:
    ValueError: If `menu_mp` is invalid, `end_use` is not supported, or `cost_dict`
        lacks cost data for a matched (technology, efficiency) pair.
    """

    # Validate menu_mp
    valid_menu_mps = [7, 8, 9, 10]
    if menu_mp not in valid_menu_mps:
        raise ValueError("Please enter a valid measure package number for menu_mp. Should be 7, 8, 9, or 10.")

    # Get conditions, technology-efficiency pairs, and cost components for the specified end_use
    params = get_end_use_replacement_parameters(df, end_use)
    conditions = params['conditions']
    tech_eff_pairs = params['tech_eff_pairs']
    cost_components = params['cost_components']

    # Resolve every row to the position of its first matching pair (-1 = no match).
    # Selecting integer positions instead of the labels themselves avoids mixing string
    # choices with a NaN default, which either fails dtype promotion or coerces NaN to
    # the string 'nan' depending on the NumPy version. It also keeps each efficiency in
    # its original Python type so it matches the `cost_dict` keys exactly.
    pair_positions = np.select(conditions, list(range(len(tech_eff_pairs))), default=-1)
    valid_indices = pair_positions != -1
    matched_pairs = [tech_eff_pairs[position] for position in pair_positions[valid_indices]]
    df_valid = df.loc[valid_indices].copy()

    # Width of the 10th-90th percentile interval of a standard normal, in standard deviations.
    percentile_spread = norm.ppf(0.90) - norm.ppf(0.10)
    cost_scenarios = ('progressive', 'reference', 'conservative')

    sampled_costs_dict = {}
    for cost_component in cost_components:
        scenario_costs = {
            scenario: np.array(
                [cost_dict.get(pair, {}).get(f'{cost_component}_{scenario}', np.nan) for pair in matched_pairs],
                dtype=float,
            )
            for scenario in cost_scenarios
        }
        progressive_costs = scenario_costs['progressive']
        reference_costs = scenario_costs['reference']
        conservative_costs = scenario_costs['conservative']

        # Handle missing cost data
        is_missing = np.isnan(progressive_costs) | np.isnan(reference_costs) | np.isnan(conservative_costs)
        if is_missing.any():
            missing_indices = np.where(is_missing)
            print(f"Missing data at indices: {missing_indices}")
            print(f"Tech with missing data: {[matched_pairs[i][0] for i in missing_indices[0]]}")
            print(f"Efficiencies with missing data: {[matched_pairs[i][1] for i in missing_indices[0]]}")

            raise ValueError(f"Missing cost data for some technology and efficiency combinations in cost_component {cost_component}")

        # Mean is the 50th percentile; the standard deviation follows from the 10th-90th percentile width.
        mean_costs = reference_costs
        std_costs = (conservative_costs - progressive_costs) / percentile_spread

        # Sample from the normal distribution for each row
        sampled_costs_dict[cost_component] = np.random.normal(loc=mean_costs, scale=std_costs)

    # Calculate the replacement cost for each row
    replacement_cost, cost_column_name = calculate_replacement_cost_per_row(df_valid, sampled_costs_dict, rsMeans_national_avg, menu_mp, end_use)

    # Add the calculated costs to a new DataFrame, rounded to 2 decimal places
    df_new_columns = pd.DataFrame({cost_column_name: np.round(replacement_cost, 2)}, index=df_valid.index)

    # Drop stale copies of the output column without mutating the caller's DataFrame.
    overlapping_columns = df_new_columns.columns.intersection(df.columns)
    if not overlapping_columns.empty:
        df = df.drop(columns=overlapping_columns)

    # Left join keeps every original row; rows without a matched technology get NaN.
    return df.join(df_new_columns, how='left')

In [19]:
# UPDATED AUGUST 22, 2024 @ 9:30 PM (~ENSURE COLS UPDATE WHEN FUNCTION RE-RUN. DROP OLD OVERLAPPING COLS~)

# Installation Cost Function and Helper Functions (Parameters, Formula)
# Helper function to get parameters based on end use
def get_end_use_installation_parameters(df, end_use, menu_mp):
    """
    Return the lookup parameters used to price the installation of upgraded equipment.

    For heating, the technology depends on whether the dwelling has ducts and on whether
    the measure package is 7 or any other number.

    Parameters:
    df (pd.DataFrame): Must contain 'hvac_has_ducts'.
    end_use (str): End-use key. Only 'heating' is currently enabled; the other
        end-uses are kept below as commented-out entries.
    menu_mp (int): Measure package number.

    Returns:
    dict: With keys
        'conditions' - list of boolean Series, one per upgrade technology;
        'tech_eff_pairs' - list of (technology, efficiency) tuples parallel to 'conditions';
        'cost_components' - names of the cost components that make up the installation cost.

    Raises:
    ValueError: If `end_use` is not one of the enabled end-uses.
    """
    has_ducts = df['hvac_has_ducts'] == 'Yes'
    lacks_ducts = df['hvac_has_ducts'] == 'No'
    is_mp7 = (menu_mp == 7)
    is_other_mp = (menu_mp != 7)

    heating_parameters = {
        'conditions': [
            has_ducts & is_mp7,
            lacks_ducts & is_mp7,
            has_ducts & is_other_mp,
            lacks_ducts & is_other_mp,
        ],
        'tech_eff_pairs': [
            ('Electric ASHP', 'SEER 18, 9.3 HSPF'),
            ('Electric MSHP', 'SEER 18, 9.6 HSPF'),
            ('Electric MSHP - Ducted', 'SEER 15.5, 10 HSPF'),
            ('Electric MSHP', 'SEER 29.3, 14 HSPF'),
        ],
        'cost_components': ['unitCost', 'otherCost', 'cost_per_kBtuh'],
    }

    parameters = {
        'heating': heating_parameters,
        # 'waterHeating': {
        #     'conditions': [
        #         (df['upgrade_water_heater_efficiency'] == 'Electric Heat Pump, 50 gal, 3.45 UEF'),
        #         (df['upgrade_water_heater_efficiency'] == 'Electric Heat Pump, 66 gal, 3.35 UEF'),
        #         (df['upgrade_water_heater_efficiency'] == 'Electric Heat Pump, 80 gal, 3.45 UEF')
        #     ],
        #     'tech_eff_pairs': [
        #         ('Electric Heat Pump Water Heater, 50 gal', 3.45),
        #         ('Electric Heat Pump Water Heater, 66 gal', 3.35),
        #         ('Electric Heat Pump Water Heater, 80 gal', 3.45),
        #     ],
        #     'cost_components': ['unitCost', 'cost_per_gallon']
        # },
        # 'clothesDrying': {
        #     'conditions': [
        #         df['upgrade_clothes_dryer'].str.contains('Electric, Premium, Heat Pump, Ventless', na=False),
        #         ~df['upgrade_clothes_dryer'].str.contains('Electric, Premium, Heat Pump, Ventless', na=False),
        #     ],
        #     'tech_eff_pairs': [
        #         ('Electric HP Clothes Dryer', 5.2),
        #         ('Electric Clothes Dryer', 3.1),
        #     ],
        #     'cost_components': ['unitCost']
        # },
        # 'cooking': {
        #     'conditions': [
        #         df['upgrade_cooking_range'].str.contains('Electric, Induction', na=False),
        #         ~df['upgrade_cooking_range'].str.contains('Electric, Induction', na=False),
        #     ],
        #     'tech_eff_pairs': [
        #         ('Electric Induction Range', 0.84),
        #         ('Electric Range, Modern', 0.74),
        #     ],
        #     'cost_components': ['unitCost']
        # }
    }

    if end_use in parameters:
        return parameters[end_use]
    raise ValueError(f"Invalid end_use specified: {end_use}")

# UPDATED AUGUST 22, 2024 @ 9:30 PM (~ENSURE COLS UPDATE WHEN FUNCTION RE-RUN. DROP OLD OVERLAPPING COLS~)
def calculate_installation_cost_per_row(df_valid, sampled_costs_dict, rsMeans_national_avg, menu_mp, end_use):
    """
    Combine sampled cost components into a regionally adjusted installation cost per row.

    Formulas (each scaled by rsMeans_CCI_avg / rsMeans_national_avg):
        heating:       unitCost + otherCost + total_heating_load_kBtuh * cost_per_kBtuh
        waterHeating:  unitCost + cost_per_gallon * size_water_heater_gal
        anything else: unitCost

    Parameters:
    df_valid (pd.DataFrame): Rows to price; must contain 'rsMeans_CCI_avg' plus the sizing
        column required by the chosen end use.
    sampled_costs_dict (dict): Sampled cost components keyed by component name, each
        aligned positionally with the rows of `df_valid`.
    rsMeans_national_avg (float): National average value for cost adjustment.
    menu_mp (int): Measure package number; used in the output column name.
    end_use (str): 'heating', 'waterHeating', or any other end-use name.

    Returns:
    tuple: (installation costs, 'mp{menu_mp}_{end_use}_installationCost').
    """
    cost_column_name = f'mp{menu_mp}_{end_use}_installationCost'
    regional_factor = df_valid['rsMeans_CCI_avg'] / rsMeans_national_avg
    unit_cost = sampled_costs_dict['unitCost']

    if end_use == 'heating':
        # Capacity-dependent part scales with the dwelling's heating load.
        capacity_cost = df_valid['total_heating_load_kBtuh'] * sampled_costs_dict['cost_per_kBtuh']
        national_cost = unit_cost + sampled_costs_dict['otherCost'] + capacity_cost
    elif end_use == 'waterHeating':
        # Tank-size-dependent part scales with the water heater volume.
        tank_cost = sampled_costs_dict['cost_per_gallon'] * df_valid['size_water_heater_gal']
        national_cost = unit_cost + tank_cost
    else:
        national_cost = unit_cost

    return national_cost * regional_factor, cost_column_name

# UPDATED AUGUST 22, 2024 @ 9:30 PM (~ENSURE COLS UPDATE WHEN FUNCTION RE-RUN. DROP OLD OVERLAPPING COLS~)
def calculate_installation_cost(df, cost_dict, rsMeans_national_avg, menu_mp, end_use):
    """
    General function to calculate installation costs for various end-uses based on fuel types, costs, and efficiency.

    Each row is matched to the first applicable (technology, efficiency) pair returned by
    `get_end_use_installation_parameters`. For every cost component, the progressive,
    reference and conservative values in `cost_dict` are treated as the 10th, 50th and
    90th percentiles of a normal distribution, from which one value is sampled per row.
    Rows that match no pair receive NaN in the output column.

    Sampling uses NumPy's global random state (`np.random.normal`), so results change
    between runs unless the caller seeds it (e.g. `np.random.seed(...)`) beforehand.

    Parameters:
    df (pd.DataFrame): DataFrame containing data for different scenarios. It is not modified.
    cost_dict (dict): Maps (technology, efficiency) tuples to a dict with keys
        '{component}_progressive', '{component}_reference' and '{component}_conservative'.
    rsMeans_national_avg (float): National average value for cost adjustment.
    menu_mp (int): Measure package number; must be 7, 8, 9 or 10.
    end_use (str): Type of end-use to calculate installation cost for ('heating', 'waterHeating', 'clothesDrying', 'cooking').

    Returns:
    pd.DataFrame: New DataFrame with the installation cost column added. Any existing
    column of the same name is replaced, so the function can be re-run safely.

    Raises:
    ValueError: If `menu_mp` is invalid, `end_use` is not supported, or `cost_dict`
        lacks cost data for a matched (technology, efficiency) pair.
    """

    # Validate menu_mp
    valid_menu_mps = [7, 8, 9, 10]
    if menu_mp not in valid_menu_mps:
        raise ValueError("Please enter a valid measure package number for menu_mp. Should be 7, 8, 9, or 10.")

    # Get conditions, technology-efficiency pairs, and cost components for the specified end_use
    params = get_end_use_installation_parameters(df, end_use, menu_mp)
    conditions = params['conditions']
    tech_eff_pairs = params['tech_eff_pairs']
    cost_components = params['cost_components']

    # Resolve every row to the position of its first matching pair (-1 = no match).
    # Selecting integer positions instead of the labels themselves avoids mixing string
    # choices with a NaN default, which either fails dtype promotion or coerces NaN to
    # the string 'nan' depending on the NumPy version. It also keeps each efficiency in
    # its original Python type so it matches the `cost_dict` keys exactly.
    pair_positions = np.select(conditions, list(range(len(tech_eff_pairs))), default=-1)
    valid_indices = pair_positions != -1
    matched_pairs = [tech_eff_pairs[position] for position in pair_positions[valid_indices]]
    df_valid = df.loc[valid_indices].copy()

    # Width of the 10th-90th percentile interval of a standard normal, in standard deviations.
    percentile_spread = norm.ppf(0.90) - norm.ppf(0.10)
    cost_scenarios = ('progressive', 'reference', 'conservative')

    sampled_costs_dict = {}
    for cost_component in cost_components:
        scenario_costs = {
            scenario: np.array(
                [cost_dict.get(pair, {}).get(f'{cost_component}_{scenario}', np.nan) for pair in matched_pairs],
                dtype=float,
            )
            for scenario in cost_scenarios
        }
        progressive_costs = scenario_costs['progressive']
        reference_costs = scenario_costs['reference']
        conservative_costs = scenario_costs['conservative']

        # Handle missing cost data
        is_missing = np.isnan(progressive_costs) | np.isnan(reference_costs) | np.isnan(conservative_costs)
        if is_missing.any():
            missing_indices = np.where(is_missing)
            print(f"Missing data at indices: {missing_indices}")
            print(f"Tech with missing data: {[matched_pairs[i][0] for i in missing_indices[0]]}")
            print(f"Efficiencies with missing data: {[matched_pairs[i][1] for i in missing_indices[0]]}")

            raise ValueError(f"Missing cost data for some technology and efficiency combinations in cost_component {cost_component}")

        # Mean is the 50th percentile; the standard deviation follows from the 10th-90th percentile width.
        mean_costs = reference_costs
        std_costs = (conservative_costs - progressive_costs) / percentile_spread

        # Sample from the normal distribution for each row
        sampled_costs_dict[cost_component] = np.random.normal(loc=mean_costs, scale=std_costs)

    # Calculate the installation cost for each row
    installation_cost, cost_column_name = calculate_installation_cost_per_row(df_valid, sampled_costs_dict, rsMeans_national_avg, menu_mp, end_use)

    # Add the calculated costs to a new DataFrame, rounded to 2 decimal places
    df_new_columns = pd.DataFrame({cost_column_name: np.round(installation_cost, 2)}, index=df_valid.index)

    # Drop stale copies of the output column without mutating the caller's DataFrame.
    overlapping_columns = df_new_columns.columns.intersection(df.columns)
    if not overlapping_columns.empty:
        df = df.drop(columns=overlapping_columns)

    # Left join keeps every original row; rows without a matched technology get NaN.
    return df.join(df_new_columns, how='left')

In [None]:
# LAST UPDATE AUGUST 21, 2024 @ 11:40 PM

# POTENTIALLY UPDATE CODE IN THE FUTURE TO ACCOUNT FOR CHANGES IN CAPITAL COSTS BASED ON SCENARIOS (BESIDES IRA REBATES)
# Note: CURRENT MODELING ASSUMES EQUIPMENT PRICES ARE THE SAME UNDER IRA REF AND IRA HIGH
# THIS MAY BE UPDATED IN THE FUTURE, SO WE STILL USE policy_scenario PREFIXES FOR TOTAL AND NET CAPITAL COSTS
# COSTS ARE DIFFERENT FOR PRE-IRA BECAUSE NO REBATES ARE APPLIED
def calculate_private_NPV(df, df_fuelCosts, interest_rate, input_mp, menu_mp, policy_scenario):
    """
    Calculate the private net present value (NPV) columns for every enabled equipment category.

    For each category, capital costs are obtained from `calculate_costs` and handed,
    together with the fuel cost data, to `calculate_and_update_npv`. The resulting
    columns are joined onto a copy of `df`, replacing any same-named columns from an
    earlier run. Neither input DataFrame is modified.

    Parameters:
        df (DataFrame): Input DataFrame with installation costs, replacement costs and potential rebates.
        df_fuelCosts (DataFrame): DataFrame with the annual fuel cost savings columns.
        interest_rate (float): Annual discount rate used for NPV calculation.
        input_mp (str): Upgrade identifier (e.g. 'upgrade09'), forwarded to `calculate_costs`.
        menu_mp: Measure package number used in column names.
        policy_scenario (str): Policy scenario, forwarded to both helpers
            (e.g. 'No Inflation Reduction Act' or 'AEO2023 Reference Case').

    Returns:
        DataFrame: Copy of `df` with the newly calculated columns joined on.
    """
    # POTENTIALLY UPDATE CODE IN THE FUTURE TO ACCOUNT FOR CHANGES IN CAPITAL COSTS BASED ON SCENARIOS (BESIDES IRA REBATES)
    # Note: CURRENT MODELING ASSUMES EQUIPMENT PRICES ARE THE SAME UNDER IRA REF AND IRA HIGH
    # THIS MAY BE UPDATED IN THE FUTURE, SO WE STILL USE policy_scenario PREFIXES FOR TOTAL AND NET CAPITAL COSTS
    # COSTS ARE DIFFERENT FOR PRE-IRA BECAUSE NO REBATES ARE APPLIED
    # Equipment lifetime (years of savings to discount) per category.
    equipment_specs = {
        'heating': 15,
        # 'waterHeating': 12,
        # 'clothesDrying': 13,
        # 'cooking': 15
    }

    # Work on copies so the caller's frames stay untouched.
    working_df = df.copy()
    fuel_costs = df_fuelCosts.copy()

    # Collector for the new columns. The return value of calculate_and_update_npv is
    # ignored, so it presumably fills this frame in place - NOTE(review): confirm.
    npv_columns = pd.DataFrame(index=working_df.index)

    for category in equipment_specs:
        lifetime = equipment_specs[category]
        total_capital_cost, net_capital_cost = calculate_costs(
            working_df, category, input_mp, menu_mp, policy_scenario
        )
        calculate_and_update_npv(
            npv_columns, fuel_costs, category, menu_mp, interest_rate, lifetime,
            total_capital_cost, net_capital_cost, policy_scenario
        )

    # Replace results from a previous run instead of producing duplicate column names.
    stale_columns = npv_columns.columns.intersection(working_df.columns)
    if len(stale_columns) > 0:
        working_df = working_df.drop(columns=stale_columns)

    return working_df.join(npv_columns, how='left')

def calculate_costs(df_copy, category, input_mp, menu_mp, policy_scenario):
    """
    Calculate total and net capital costs for one equipment category.

    Missing values in every cost and rebate column are treated as 0.

    * Up-front cost is the category's installation cost. For 'heating' it also includes
      the installation premium and, for `input_mp` 'upgrade09' / 'upgrade10', the
      mp9 / mp10 enclosure upgrade cost.
    * Under 'No Inflation Reduction Act' no rebates are applied. Under any other
      scenario the weatherization rebate is subtracted from the enclosure upgrade cost
      and the category's rebate amount is subtracted from the up-front cost.
    * Net capital cost is total capital cost minus the category's replacement cost.

    Parameters:
        df_copy (DataFrame): DataFrame containing cost data.
        category (str): Equipment category.
        input_mp (str): Upgrade identifier; selects which enclosure upgrade cost applies to heating.
        menu_mp: Measure package number used in column names.
        policy_scenario (str): Policy scenario; 'No Inflation Reduction Act' disables rebates.

    Returns:
        tuple: Total and net capital costs.
    """
    print(f"""\nCalculating costs for {category}...
          input_mp: {input_mp}, menu_mp: {menu_mp}, policy_scenario: {policy_scenario}""")

    # POTENTIALLY UPDATE CODE IN THE FUTURE TO ACCOUNT FOR CHANGES IN CAPITAL COSTS BASED ON SCENARIOS (BESIDES IRA REBATES)
    # Note: CURRENT MODELING ASSUMES EQUIPMENT PRICES ARE THE SAME UNDER IRA REF AND IRA HIGH
    # THIS MAY BE UPDATED IN THE FUTURE, SO WE STILL USE policy_scenario PREFIXES FOR TOTAL AND NET CAPITAL COSTS
    # COSTS ARE DIFFERENT FOR PRE-IRA BECAUSE NO REBATES ARE APPLIED
    rebates_apply = policy_scenario != 'No Inflation Reduction Act'
    column_prefix = f'mp{menu_mp}_{category}'

    def zero_filled(column_name):
        # Column with missing values treated as 0.
        return df_copy[column_name].fillna(0)

    upfront_cost = zero_filled(f'{column_prefix}_installationCost')

    if category == 'heating':
        if input_mp == 'upgrade09' or input_mp == 'upgrade10':
            enclosure_column = (
                'mp9_enclosure_upgradeCost' if input_mp == 'upgrade09' else 'mp10_enclosure_upgradeCost'
            )
            weatherization_cost = zero_filled(enclosure_column)
            if rebates_apply:
                weatherization_cost = weatherization_cost - zero_filled('weatherization_rebate_amount')
        else:
            weatherization_cost = 0.0

        upfront_cost = upfront_cost + weatherization_cost + zero_filled(f'{column_prefix}_installation_premium')

    if rebates_apply:
        total_capital_cost = upfront_cost - zero_filled(f'{column_prefix}_rebate_amount')
    else:
        total_capital_cost = upfront_cost

    net_capital_cost = total_capital_cost - zero_filled(f'{column_prefix}_replacementCost')

    return total_capital_cost, net_capital_cost

def calculate_and_update_npv(df_new_columns, df_fuelCosts_copy, category, menu_mp, interest_rate, lifetime, total_capital_cost, net_capital_cost, policy_scenario):
    """
    Calculate the private NPV of an upgrade and store it, together with the capital costs,
    in `df_new_columns` (modified in place; nothing is returned).

    Parameters:
        df_new_columns (DataFrame): DataFrame that receives the new capital-cost and NPV columns.
        df_fuelCosts_copy (DataFrame): DataFrame holding the annual fuel-cost savings columns, named
            '{scenario_prefix}{year}_{category}_savings_fuelCost'.
        category (str): Equipment category (used in the column names).
        menu_mp (str or int): Measure-package identifier interpolated into the column prefix.
        interest_rate (float): Discount rate for NPV calculation.
        lifetime (int): Expected lifetime of the equipment, in years.
        total_capital_cost (float or Series): Total capital cost of the equipment.
        net_capital_cost (float or Series): Net capital cost after considering replacements.
        policy_scenario (str): 'No Inflation Reduction Act' or 'AEO2023 Reference Case'.

    Raises:
        ValueError: If `policy_scenario` is not one of the two accepted values.
    """
    # Determine the column prefix for the selected policy scenario
    if policy_scenario == 'No Inflation Reduction Act':
        scenario_prefix = f"preIRA_mp{menu_mp}_"
    elif policy_scenario == 'AEO2023 Reference Case':
        scenario_prefix = f"iraRef_mp{menu_mp}_"
    else:
        raise ValueError("Invalid Policy Scenario! Please choose from 'No Inflation Reduction Act' or 'AEO2023 Reference Case'.")

    print(f"""\nCalculating Private NPV for {category}...
          lifetime: {lifetime}, interest_rate: {interest_rate}, policy_scenario: {policy_scenario}
          """)

    # Discount each year's fuel-cost savings back to the start of the analysis period.
    # Savings columns are labelled by calendar year; year 1 of the lifetime is 2024.
    discounted_savings = []
    for year in range(1, lifetime + 1):
        year_label = year + 2023  # Adjust the start year as necessary
        annual_savings = df_fuelCosts_copy[f'{scenario_prefix}{year_label}_{category}_savings_fuelCost'].fillna(0)
        discount_factor = (1 / ((1 + interest_rate) ** year))
        discounted_savings.append(annual_savings * discount_factor)

    # Sum up the discounted savings over the lifetime
    total_discounted_savings = sum(discounted_savings)

    # Less WTP: savings must pay back the total capital cost.
    # More WTP: savings only need to pay back the net capital cost.
    npv_lessWTP = round(total_discounted_savings - total_capital_cost, 2)
    npv_moreWTP = round(total_discounted_savings - net_capital_cost, 2)

    # POTENTIALLY UPDATE CODE IN THE FUTURE TO ACCOUNT FOR CHANGES IN CAPITAL COSTS BASED ON SCENARIOS (BESIDES IRA REBATES)
    # Note: CURRENT MODELING ASSUMES EQUIPMENT PRICES ARE THE SAME UNDER IRA REF AND IRA HIGH
    # THIS MAY BE UPDATED IN THE FUTURE, SO WE STILL USE policy_scenario PREFIXES FOR TOTAL AND NET CAPITAL COSTS
    # COSTS ARE DIFFERENT FOR PRE-IRA BECAUSE NO REBATES ARE APPLIED
    df_new_columns[f'{scenario_prefix}{category}_total_capitalCost'] = total_capital_cost
    df_new_columns[f'{scenario_prefix}{category}_net_capitalCost'] = net_capital_cost

    df_new_columns[f'{scenario_prefix}{category}_private_npv_lessWTP'] = npv_lessWTP
    df_new_columns[f'{scenario_prefix}{category}_private_npv_moreWTP'] = npv_moreWTP

In [21]:
# # UPDATED AUGUST 21, 2024 @ 11:40 PM
# def adoption_decision(df, policy_scenario):
#     """
#     Updates the provided DataFrame with new columns that reflect decisions about equipment adoption
#     and public impacts based on net present values (NPV). The function handles different scenarios
#     based on input flags for incentives and grid decarbonization.

#     Parameters:
#         df (pandas.DataFrame): The DataFrame containing home equipment data.
#         policy_scenario (str): Policy policy_scenario that determines electricity grid projections. 
#                                Accepted values: 'AEO2023 Reference Case'.

#     Returns:
#         pandas.DataFrame: The modified DataFrame with additional columns for decisions and impacts.

#     Notes:
#         - It adds columns for both individual and public economic evaluations.
#         - Adoption decisions and public impacts are dynamically calculated based on the input parameters.
#     """
#     df_copy = df.copy()
    
#     # Define the lifetimes of different equipment categories
#     upgrade_columns = {
#         'heating': 'upgrade_hvac_heating_efficiency',
#         'waterHeating': 'upgrade_water_heater_efficiency',
#         'clothesDrying': 'upgrade_clothes_dryer',
#         'cooking': 'upgrade_cooking_range'
#     }
    
#     df_new_columns = pd.DataFrame(index=df_copy.index)  # DataFrame to hold new or modified columns

#     # Determine the policy_scenario prefix based on the policy policy_scenario
#     if policy_scenario == 'No Inflation Reduction Act':
#         scenario_prefix = f"preIRA_mp{menu_mp}_"
#     elif policy_scenario == 'AEO2023 Reference Case':
#         scenario_prefix = f"iraRef_mp{menu_mp}_"
#     else:
#         raise ValueError("Invalid Policy Scenario! Please choose from 'No Inflation Reduction Act' or 'AEO2023 Reference Case'.")

#     # Iterate over each equipment category and its respective upgrade column
#     for category, upgrade_column in upgrade_columns.items():
#         # Column names for net NPV, private NPV, and public NPV
#         lessWTP_total_npv_col = f'{scenario_prefix}{category}_total_npv_lessWTP' # LESS WTP: BREAK EVEN ON TOTAL CAPITAL COSTS
#         moreWTP_total_npv_col = f'{scenario_prefix}{category}_total_npv_moreWTP' # MORE WTP: BREAK EVEN ON NET CAPITAL COSTS (BETTER THAN ALTERNATIVE)

#         lessWTP_private_npv_col = f'{scenario_prefix}{category}_private_npv_lessWTP' # LESS WTP: BREAK EVEN ON TOTAL CAPITAL COSTS
#         moreWTP_private_npv_col = f'{scenario_prefix}{category}_private_npv_moreWTP' # MORE WTP: BREAK EVEN ON NET CAPITAL COSTS (BETTER THAN ALTERNATIVE)

#         public_npv_col = f'{scenario_prefix}{category}_public_npv'

#         # Ensure columns are numeric if they exist and convert them
#         for col in [lessWTP_private_npv_col, moreWTP_private_npv_col, public_npv_col]:
#             if col in df.columns:
#                 df[col] = pd.to_numeric(df[col], errors='coerce')
#             else:
#                 print(f"Warning: {col} does not exist in the DataFrame.")

#         # Ensure the columns are present after conversion
#         if lessWTP_private_npv_col in df.columns and moreWTP_private_npv_col in df.columns and public_npv_col in df.columns:
#             # 
            
#             # Calculate net NPV by summing private and public NPVs
#             df_new_columns[lessWTP_total_npv_col] = df[lessWTP_private_npv_col] + df[public_npv_col] # LESS WTP: BREAK EVEN ON TOTAL CAPITAL COSTS
#             df_new_columns[moreWTP_total_npv_col] = df[moreWTP_private_npv_col] + df[public_npv_col] # MORE WTP: BREAK EVEN ON NET CAPITAL COSTS (BETTER THAN ALTERNATIVE)

#             # Initialize columns for adoption decisions and public impact
#             adoption_col_name = f'{scenario_prefix}{category}_adoption'
#             retrofit_col_name = f'{scenario_prefix}{category}_retrofit_publicImpact'
#             df_new_columns[adoption_col_name] = 'Tier 4: Averse'  # Default value for all rows
#             df_new_columns[retrofit_col_name] = 'No Retrofit'  # Default public impact

#             # Conditions for determining adoption decisions
#             conditions = [
#                 df[upgrade_column].isna(),
#                 df[lessWTP_private_npv_col] > 0,
#                 (df[lessWTP_private_npv_col] < 0) & (df[moreWTP_private_npv_col] > 0),
#                 (df[lessWTP_private_npv_col] < 0) & (df[moreWTP_private_npv_col] <= 0) & (df_new_columns[moreWTP_total_npv_col] > 0),
#             ]

#             choices = ['Existing Equipment', 'Tier 1: Feasible', 'Tier 2: Feasible vs. Alternative', 'Tier 3: Subsidy-Dependent Feasibility']
#             df_new_columns[adoption_col_name] = np.select(conditions, choices, default='Tier 4: Averse')

#             # Conditions and choices for public impacts
#             public_conditions = [
#                 df[public_npv_col] > 0,
#                 df[public_npv_col] < 0
#             ]
            
#             public_choices = ['Public Benefit', 'Public Detriment']
#             df_new_columns[retrofit_col_name] = np.select(public_conditions, public_choices, default='No Retrofit')
#         else:
#             print(f"Warning: One or more columns ({lessWTP_private_npv_col}, {moreWTP_private_npv_col}, {public_npv_col}) are missing or not numeric.")
    
#     # Identify overlapping columns between the new and existing DataFrame.
#     overlapping_columns = df_new_columns.columns.intersection(df_copy.columns)

#     # Drop overlapping columns from df_copy.
#     if not overlapping_columns.empty:
#         df_copy.drop(columns=overlapping_columns, inplace=True)

#     # Merge new columns into df_copy, ensuring no duplicates or overwrites occur.
#     df_copy = df_copy.join(df_new_columns, how='left')

#     # Return the updated DataFrame.
#     return df_copy

In [None]:
# UPDATED SEPTEMBER 14, 2024 @ 4:23 PM
def adoption_decision(df, policy_scenario, menu_mp):
    """
    Adds adoption-tier and public-impact columns to a copy of the provided DataFrame, based on
    the private and public net present values (NPV) already present in it.

    Parameters:
        df (pandas.DataFrame): The DataFrame containing home equipment data. It is not modified.
        policy_scenario (str): Policy scenario that selects the column prefix.
                               Accepted values: 'No Inflation Reduction Act', 'AEO2023 Reference Case'.
        menu_mp (str or int): Measure-package identifier interpolated into the column names.

    Returns:
        pandas.DataFrame: A copy of `df` with the NPV and rebate columns coerced to numeric and with
        additional columns for additional public benefit, total NPV, adoption tier and public impact.

    Raises:
        ValueError: If `policy_scenario` is not one of the accepted values.

    Notes:
        - A category is skipped (with a printed warning) when any column it needs is missing.
          Under the IRA scenario this includes the rebate column.
        - Existing columns with the same names as the new ones are replaced.
    """
    # Work on a copy so that the numeric coercion below never alters the caller's DataFrame
    df_copy = df.copy()

    # Equipment categories to evaluate, mapped to the column recording the upgrade
    # (a NaN in that column is classified as 'Existing Equipment')
    upgrade_columns = {
        'heating': 'upgrade_hvac_heating_efficiency',
        # 'waterHeating': 'upgrade_water_heater_efficiency',
        # 'clothesDrying': 'upgrade_clothes_dryer',
        # 'cooking': 'upgrade_cooking_range'
    }

    df_new_columns = pd.DataFrame(index=df_copy.index)  # DataFrame to hold new or modified columns

    # Determine the column prefix for the selected policy scenario
    if policy_scenario == 'No Inflation Reduction Act':
        scenario_prefix = f"preIRA_mp{menu_mp}_"
    elif policy_scenario == 'AEO2023 Reference Case':
        scenario_prefix = f"iraRef_mp{menu_mp}_"
    else:
        raise ValueError("Invalid Policy Scenario! Please choose from 'No Inflation Reduction Act' or 'AEO2023 Reference Case'.")

    # Rebates only enter the calculation outside the pre-IRA scenario
    rebates_apply = policy_scenario != 'No Inflation Reduction Act'

    # Iterate over each equipment category and its respective upgrade column
    for category, upgrade_column in upgrade_columns.items():
        # Column names for private NPV, public NPV, rebate and the derived totals
        lessWTP_private_npv_col = f'{scenario_prefix}{category}_private_npv_lessWTP' # LESS WTP: BREAK EVEN ON TOTAL CAPITAL COSTS
        moreWTP_private_npv_col = f'{scenario_prefix}{category}_private_npv_moreWTP' # MORE WTP: BREAK EVEN ON NET CAPITAL COSTS (BETTER THAN ALTERNATIVE)

        public_npv_col = f'{scenario_prefix}{category}_public_npv'
        rebate_col = f'mp{menu_mp}_{category}_rebate_amount'
        addition_public_benefit = f'{scenario_prefix}{category}_additional_public_benefit'

        lessWTP_total_npv_col = f'{scenario_prefix}{category}_total_npv_lessWTP' # LESS WTP: BREAK EVEN ON TOTAL CAPITAL COSTS
        moreWTP_total_npv_col = f'{scenario_prefix}{category}_total_npv_moreWTP' # MORE WTP: BREAK EVEN ON NET CAPITAL COSTS (BETTER THAN ALTERNATIVE)

        # Ensure columns are numeric if they exist and convert them (on the copy only)
        for col in [lessWTP_private_npv_col, moreWTP_private_npv_col, public_npv_col, rebate_col]:
            if col in df_copy.columns:
                df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')
            else:
                print(f"Warning: {col} does not exist in the DataFrame.")

        # The rebate column is only read when rebates apply, so only require it then
        required_cols = [lessWTP_private_npv_col, moreWTP_private_npv_col, public_npv_col]
        if rebates_apply:
            required_cols.append(rebate_col)

        if not all(col in df_copy.columns for col in required_cols):
            print(f"Warning: One or more columns ({', '.join(required_cols)}) are missing or not numeric.")
            continue

        if rebates_apply:
            # Additional public benefit is the public NPV left after the rebate, clipped at 0
            df_new_columns[addition_public_benefit] = (df_copy[public_npv_col] - df_copy[rebate_col]).clip(lower=0)
        else:
            # No IRA Rebate so no "Additional Public Benefit"
            df_new_columns[addition_public_benefit] = 0.0

        # Calculate Total NPV by summing private NPV and the additional public benefit
        df_new_columns[lessWTP_total_npv_col] = df_copy[lessWTP_private_npv_col] + df_new_columns[addition_public_benefit] # LESS WTP: BREAK EVEN ON TOTAL CAPITAL COSTS
        df_new_columns[moreWTP_total_npv_col] = df_copy[moreWTP_private_npv_col] + df_new_columns[addition_public_benefit] # MORE WTP: BREAK EVEN ON NET CAPITAL COSTS (BETTER THAN ALTERNATIVE)

        adoption_col_name = f'{scenario_prefix}{category}_adoption'
        retrofit_col_name = f'{scenario_prefix}{category}_retrofit_publicImpact'

        # Conditions for determining adoption decisions; np.select takes the first match per row.
        # NOTE(review): a lessWTP private NPV of exactly 0 matches none of the tiers and falls
        # through to 'Tier 4: Averse' - confirm that this boundary is intended.
        conditions = [
            df_copy[upgrade_column].isna(),
            df_copy[lessWTP_private_npv_col] > 0,
            (df_copy[lessWTP_private_npv_col] < 0) & (df_copy[moreWTP_private_npv_col] > 0),
            (df_copy[lessWTP_private_npv_col] < 0) & (df_copy[moreWTP_private_npv_col] <= 0) & (df_new_columns[moreWTP_total_npv_col] > 0) & (df_new_columns[addition_public_benefit] > 0), # Ensures only Tier 3 for IRA Scenario
        ]

        choices = ['Existing Equipment', 'Tier 1: Feasible', 'Tier 2: Feasible vs. Alternative', 'Tier 3: Subsidy-Dependent Feasibility']
        df_new_columns[adoption_col_name] = np.select(conditions, choices, default='Tier 4: Averse')

        # Conditions and choices for public impacts (zero or NaN public NPV -> 'No Retrofit')
        public_conditions = [
            df_copy[public_npv_col] > 0,
            df_copy[public_npv_col] < 0
        ]

        public_choices = ['Public Benefit', 'Public Detriment']
        df_new_columns[retrofit_col_name] = np.select(public_conditions, public_choices, default='No Retrofit')

    # Identify overlapping columns between the new and existing DataFrame.
    overlapping_columns = df_new_columns.columns.intersection(df_copy.columns)

    # Drop overlapping columns from df_copy.
    if not overlapping_columns.empty:
        df_copy.drop(columns=overlapping_columns, inplace=True)

    # Merge new columns into df_copy, ensuring no duplicates or overwrites occur.
    df_copy = df_copy.join(df_new_columns, how='left')

    # Return the updated DataFrame.
    return df_copy

In [23]:
# def check_adoption_consistency(df, category, upgrade_column):
#     df_copy = df.copy()
    
#     cols_to_display = ['bldg_id',
#                        f'base_{category}_fuel',
#                        f'{upgrade_column}',
#                        f'baseline_{category}_consumption',
#                        f'mp{menu_mp}_{category}_consumption',
#                        f'mp{menu_mp}_{category}_reduction_consumption',
#                        f'baseline_{category}_fuelCost',
#                        f'mp{menu_mp}_{category}_fuelCost',        
#                        f'mp{menu_mp}_{category}_savings_fuelCost',
#                        f'mp{menu_mp}_{category}_net_capitalCost',
#                        f'mp{menu_mp}_{category}_private_npv',
#                        f'baseline_{category}_damages_health',
#                        f'baseline_{category}_damages_climate',
#                        f'mp{menu_mp}_{category}_damages_health',
#                        f'mp{menu_mp}_{category}_damages_climate',
#                        f'mp{menu_mp}_{category}_reduction_damages_health',
#                        f'mp{menu_mp}_{category}_reduction_damages_climate',
#                        f'mp{menu_mp}_{category}_public_npv',
#                        f'mp{menu_mp}_{category}_retrofit_publicImpact',
#                        f'mp{menu_mp}_{category}_total_npv',
#                        f'mp{menu_mp}_{category}_adoption',  
#                        ]    
        
#     # Filter the dataframe to show only the columns relevant for the current cost_type
#     df_filtered = df_copy[cols_to_display]
    
#     return df_filtered

In [None]:
# UPDATED SEPTEMBER 6, 2024 @ 12:10 AM
import pandas as pd
import numpy as np
from scipy.stats import norm

def generate_household_medianIncome_2023(row):
    """
    Draw one household income (USD 2023) for a row from a normal distribution fitted to its income bin.

    The row's 'income_low', 'income_high' and 'income' values are scaled by the module-level
    `cpi_ratio_2023_2022`. The bin edges are treated as the 10th and 90th percentiles of a normal
    distribution centred on the scaled 'income', and the sampled value is limited to the bin.
    The draw uses NumPy's global random state, so results vary between runs unless it is seeded.
    """
    # Inflate the bin edges and the bin centre to USD 2023
    lower_bound, upper_bound, center = (
        row[field] * cpi_ratio_2023_2022
        for field in ('income_low', 'income_high', 'income')
    )

    # Width of the bin expressed in standard deviations (10th to 90th percentile)
    z_span = norm.ppf(0.90) - norm.ppf(0.10)
    spread = (upper_bound - lower_bound) / z_span

    sampled_income = np.random.normal(loc=center, scale=spread)

    # Keep the sampled income inside the bin
    capped_income = min(upper_bound, sampled_income)
    return max(lower_bound, capped_income)

def fill_na_with_hierarchy(df, df_puma, df_county, df_state):
    """
    Populate 'census_area_medianIncome' from the most specific geography available.

    The column is first (re)built from the Puma-level lookup; rows left as NaN are then
    filled from the county-level lookup, and any still missing from the state-level lookup.

    Parameters:
        df (DataFrame): The main DataFrame; must contain 'puma', 'county' and 'state'. Modified in place.
        df_puma (DataFrame): Median incomes keyed by 'gis_joinID_puma'.
        df_county (DataFrame): Median incomes keyed by 'gis_joinID_county'.
        df_state (DataFrame): Median incomes keyed by 'state_abbrev'.

    Returns:
        DataFrame: The same `df` object with 'census_area_medianIncome' set.
    """
    income_col = 'median_income_USD2023'
    target_col = 'census_area_medianIncome'

    # Most specific level: every row is looked up by Puma
    df[target_col] = df['puma'].map(df_puma.set_index('gis_joinID_puma')[income_col])

    # Coarser levels, tried in order, only for rows that are still missing
    fallback_levels = (
        ('county', df_county, 'gis_joinID_county'),
        ('state', df_state, 'state_abbrev'),
    )
    for key_col, lookup_df, lookup_key in fallback_levels:
        still_missing = df[target_col].isna()
        df.loc[still_missing, target_col] = df.loc[still_missing, key_col].map(
            lookup_df.set_index(lookup_key)[income_col]
        )

    return df

def calculate_percent_AMI(df_results_IRA):
    """
    Derive household income, percent of Area Median Income (AMI) and an income designation.

    The 'income' column (bin strings such as 'low-high', '<10000', '200000+', or floats) is replaced
    by the bin midpoint. A household income is then sampled per row, the area median income is looked
    up by Puma, county and state, and 'percent_AMI' and 'lowModerateIncome_designation' are added.

    Parameters:
        df_results_IRA (DataFrame): Input DataFrame containing income information. Modified in place.

    Returns:
        DataFrame: The DataFrame with the additional income columns and designation.

    Raises:
        ValueError: If an income value is neither a float, a known open-ended bin, nor 'low-high'.
    """
    # Open-ended bins are represented by a single value used as both bounds
    open_ended_bins = {
        '<10000': (9999.0, 9999.0),
        '200000+': (200000.0, 200000.0)
    }

    def split_income_range(income):
        # A float income is used as both the lower and upper bound
        if isinstance(income, float):
            return income, income
        if income in open_ended_bins:
            return open_ended_bins[income]
        try:
            low, high = map(float, income.split('-'))
        except Exception as e:
            raise ValueError(f"Unexpected income format: {income}") from e
        return low, high

    # Split every bin into its bounds and replace 'income' with the bin midpoint
    bounds = df_results_IRA['income'].apply(split_income_range)
    lower_bounds, upper_bounds = zip(*bounds)
    df_results_IRA['income_low'] = lower_bounds
    df_results_IRA['income_high'] = upper_bounds
    df_results_IRA['income'] = (df_results_IRA['income_low'] + df_results_IRA['income_high']) / 2

    # Sample a household income for each row from its bin
    df_results_IRA['household_income'] = df_results_IRA.apply(generate_household_medianIncome_2023, axis=1)

    # The bounds were only needed for the sampling step
    df_results_IRA.drop(columns=['income_low', 'income_high'], inplace=True)

    # Area median income: Puma first, then county, then state
    df_results_IRA = fill_na_with_hierarchy(
        df_results_IRA,
        df_puma=df_puma_medianIncome,
        df_county=df_county_medianIncome,
        df_state=df_state_medianIncome,
    )

    # Store both income columns as floats rounded to cents
    for income_col in ('household_income', 'census_area_medianIncome'):
        df_results_IRA[income_col] = df_results_IRA[income_col].astype(float).round(2)

    percent_of_ami = ((df_results_IRA['household_income'] / df_results_IRA['census_area_medianIncome']) * 100).round(2)
    df_results_IRA['percent_AMI'] = percent_of_ami

    # np.select keeps the first matching condition, so the second one only applies above 80%
    df_results_IRA['lowModerateIncome_designation'] = np.select(
        [percent_of_ami <= 80.0, percent_of_ami <= 150.0],
        ['Low-Income', 'Moderate-Income'],
        default='Middle-to-Upper-Income'
    )

    return df_results_IRA

In [None]:
# UPDATED AUGUST 20, 2024 @ 3:08 AM
# Mapping for categories and their corresponding conditions
rebate_mapping = {
    # category: (upgrade column, substrings that qualify the upgrade, maximum rebate)
    'heating': (
        'upgrade_hvac_heating_efficiency',
        ['ASHP', 'MSHP'],
        8000.00,
    ),
    'waterHeating': (
        'upgrade_water_heater_efficiency',
        ['Electric Heat Pump'],
        1750.00,
    ),
    'clothesDrying': (
        'upgrade_clothes_dryer',
        ['Electric, Premium, Heat Pump, Ventless'],
        840.00,
    ),
    'cooking': (
        'upgrade_cooking_range',
        ['Electric, '],
        840.00,
    ),
}

def get_max_rebate_amount(row, category):
    """
    Return the rebate caps for one row as (equipment cap, weatherization cap).

    The equipment cap is the category's maximum from `rebate_mapping` when the text of the row's
    upgrade column contains any of the qualifying substrings, and 0.00 otherwise (including for
    categories that are not in `rebate_mapping`). The weatherization cap is always 1600.00.
    """
    max_weatherization_rebate_amount = 1600.00

    entry = rebate_mapping.get(category)
    if entry is None:
        return 0.00, max_weatherization_rebate_amount

    upgrade_col, qualifying_terms, category_cap = entry
    qualifies = False
    for term in qualifying_terms:
        if term in str(row[upgrade_col]):
            qualifies = True
            break

    if qualifies:
        return category_cap, max_weatherization_rebate_amount
    return 0.00, max_weatherization_rebate_amount

def calculate_rebate(df_results_IRA, row, category, menu_mp, coverage_rate):
    """
    Write the rebate amounts for one row into `df_results_IRA`.

    The equipment rebate is the installation cost times `coverage_rate` (rounded to cents), limited
    to the category cap from `get_max_rebate_amount`. When the frame has an
    'mp{menu_mp}_enclosure_upgradeCost' column, a weatherization rebate is computed the same way
    from that cost and the weatherization cap. Nothing is returned.
    """
    equipment_cap, weatherization_cap = get_max_rebate_amount(row, category)
    mp_prefix = f'mp{menu_mp}'

    # Equipment rebate: covered share of the installation cost, limited to the cap
    covered_installation = round(row[f'{mp_prefix}_{category}_installationCost'] * coverage_rate, 2)
    df_results_IRA.at[row.name, f'{mp_prefix}_{category}_rebate_amount'] = min(covered_installation, equipment_cap)

    # Weatherization rebate only exists for packages that carry an enclosure upgrade cost
    enclosure_cost_col = f'{mp_prefix}_enclosure_upgradeCost'
    if enclosure_cost_col not in df_results_IRA.columns:
        return

    covered_enclosure = round(row[enclosure_cost_col] * coverage_rate, 2)
    df_results_IRA.at[row.name, 'weatherization_rebate_amount'] = min(covered_enclosure, weatherization_cap)

def calculate_rebateIRA(df_results_IRA, category, menu_mp):
    """
    Calculates rebate amounts for one end-use based on each row's income designation.

    Rows designated 'Low-Income' are passed to `calculate_rebate` with a coverage rate of 1.00 and
    'Moderate-Income' rows with 0.50. All other rows get an equipment rebate of 0.00 and, for
    measure packages 9 and 10, a weatherization rebate of 0.00.

    Parameters:
        df_results_IRA (DataFrame): Must contain 'lowModerateIncome_designation'. Modified in place.
        category (str): Equipment category used in the rebate column name.
        menu_mp (int or str): Measure-package identifier; 9/10 and '9'/'10' are treated alike.

    Returns:
        DataFrame: The same `df_results_IRA` object with the rebate columns written.
    """
    # Compare as text so that menu_mp behaves the same whether it is given as 9 or '9'
    has_weatherization = str(menu_mp) in ('9', '10')

    def apply_rebate(row):
        income_designation = row['lowModerateIncome_designation']
        if income_designation == 'Low-Income':
            calculate_rebate(df_results_IRA, row, category, menu_mp, 1.00)
        elif income_designation == 'Moderate-Income':
            calculate_rebate(df_results_IRA, row, category, menu_mp, 0.50)
        else:
            # Not eligible for rebates
            df_results_IRA.at[row.name, f'mp{menu_mp}_{category}_rebate_amount'] = 0.00
            if has_weatherization:
                df_results_IRA.at[row.name, 'weatherization_rebate_amount'] = 0.00

    # The per-row results are written into df_results_IRA; the apply() return value is unused
    df_results_IRA.apply(apply_rebate, axis=1)
    return df_results_IRA

In [26]:
# def check_ira_adoption_consistency(df, category, upgrade_column):
#     df_copy = df.copy()
    
#     cols_to_display = ['bldg_id',
#                        f'base_{category}_fuel',
#                        f'{upgrade_column}',
#                        f'baseline_{category}_consumption',
#                        f'mp{menu_mp}_{category}_consumption',
#                        f'mp{menu_mp}_{category}_reduction_consumption',
#                        f'baseline_{category}_fuelCost',
#                        f'mp{menu_mp}_{category}_fuelCost',        
#                        f'mp{menu_mp}_{category}_savings_fuelCost',
#                        f'mp{menu_mp}_{category}_net_capitalCost',
#                        f'mp{menu_mp}_{category}_private_npv',
#                        f'baseline_{category}_damages_health',
#                        f'baseline_{category}_damages_climate',
#                        f'mp{menu_mp}_{category}_damages_health',
#                        f'mp{menu_mp}_{category}_damages_climate',
#                        f'mp{menu_mp}_{category}_reduction_damages_health',
#                        f'mp{menu_mp}_{category}_reduction_damages_climate',
#                        f'mp{menu_mp}_{category}_public_npv',
#                        f'mp{menu_mp}_{category}_retrofit_publicImpact',
#                        f'mp{menu_mp}_{category}_total_npv',
#                        f'mp{menu_mp}_{category}_adoption',
#                        f'ira_mp{menu_mp}_{category}_net_capitalCost',
#                        f'ira_mp{menu_mp}_{category}_private_npv',
#                        f'ira_mp{menu_mp}_{category}_total_npv',
#                        f'ira_mp{menu_mp}_{category}_adoption',
#                        ]    

#     # Filter the dataframe to show only the relevant columns
#     df_filtered = df_copy[cols_to_display]
    
#     return df_filtered

## Moderate Retrofit (MP9): MP8 + Basic Enclosure

## Advanced Retrofit (MP10): MP8 + Enhanced Enclosure
**Notes**
- Some variable names and some of the calculation syntax are inconsistent.
- The calculations still produce the same results, because the order of operations is unaffected.
- These will be updated for consistency to avoid user confusion.

In [27]:
# UPDATED AUGUST 22, 2024 @ 7:00 PM
import numpy as np
import pandas as pd
from scipy.stats import norm

# Helper function to get conditions and tech-efficiency pairs for enclosure retrofit
def get_enclosure_parameters(df, retrofit_col):
    """
    Build the row-selection conditions and matching (technology, efficiency) labels for one
    enclosure retrofit.

    Parameters:
        df (pd.DataFrame): DataFrame holding the 'upgrade_*' and 'base_*' columns the retrofit reads.
        retrofit_col (str): Name of the retrofit cost column that selects the retrofit.

    Returns:
        dict: {'conditions': list of boolean Series, 'tech_eff_pairs': list of (tech, eff) tuples},
        with both lists in the same order.

    Raises:
        ValueError: If `retrofit_col` is not one of the supported retrofits.
    """
    # Retrofits identified by one value of one upgrade column:
    # retrofit_col -> (upgrade column, required value, (tech, eff))
    single_value_retrofits = {
        'infiltration_reduction_upgradeCost': (
            'upgrade_infiltration_reduction', '30%',
            ('Air Leakage Reduction: 30% Reduction', 'Varies'),
        ),
        'insulation_wall_upgradeCost': (
            'upgrade_insulation_wall', 'Wood Stud, R-13',
            ('Drill-and-fill Wall Insulation: Wood Stud, R-13', 'Wood Stud, Uninsulated'),
        ),
        'insulation_foundation_wall_upgradeCost': (
            'upgrade_insulation_foundation_wall', 'Wall R-10, Interior',
            ('Foundation Wall Insulation: Wall R-10, Interior', 'Uninsulated'),
        ),
        'seal_crawlspace_upgradeCost': (
            'upgrade_seal_crawlspace', 'Unvented Crawlspace',
            ('Seal Vented Crawlspace: Unvented Crawlspace', 'Vented Crawlspace'),
        ),
        'insulation_roof_upgradeCost': (
            'upgrade_insulation_roof', 'Finished, R-30',
            ('Insulate Finished Attics and Cathedral Ceilings: Finished, R-30', 'R-30'),
        ),
    }

    if retrofit_col in single_value_retrofits:
        upgrade_col, required_value, label_pair = single_value_retrofits[retrofit_col]
        return {
            'conditions': [(df[upgrade_col] == required_value)],
            'tech_eff_pairs': [label_pair],
        }

    if retrofit_col == 'insulation_atticFloor_upgradeCost':
        # (upgraded R-value, baseline R-value) combinations, in lookup order
        attic_transitions = [
            ('R-30', 'R-13'), ('R-30', 'R-7'), ('R-30', 'Uninsulated'),
            ('R-49', 'R-30'), ('R-49', 'R-19'), ('R-49', 'R-13'), ('R-49', 'R-7'), ('R-49', 'Uninsulated'),
            ('R-60', 'R-38'), ('R-60', 'R-30'), ('R-60', 'R-19'), ('R-60', 'R-13'), ('R-60', 'R-7'), ('R-60', 'Uninsulated'),
        ]
        return {
            'conditions': [
                (df['upgrade_insulation_atticFloor'] == upgraded) & (df['base_insulation_atticFloor'] == baseline)
                for upgraded, baseline in attic_transitions
            ],
            'tech_eff_pairs': [
                (f'Attic Floor Insulation: {upgraded}', baseline)
                for upgraded, baseline in attic_transitions
            ],
        }

    if retrofit_col == 'duct_sealing_upgradeCost':
        # Baseline duct leakage levels, matched as substrings of 'base_ducts'
        baseline_leakage_levels = ['10% Leakage', '20% Leakage', '30% Leakage']
        return {
            'conditions': [
                (df['upgrade_duct_sealing'] == '10% Leakage, R-8') & (df['base_ducts'].str.contains(level))
                for level in baseline_leakage_levels
            ],
            'tech_eff_pairs': [
                ('Duct Sealing: 10% Leakage, R-8', level)
                for level in baseline_leakage_levels
            ],
        }

    if retrofit_col == 'insulation_rim_joist_upgradeCost':
        # Selected from baseline columns only: uninsulated foundation wall on these foundation types
        eligible_foundations = ['Unvented Crawlspace', 'Vented Crawlspace', 'Heated Basement']
        return {
            'conditions': [
                (df['base_insulation_foundation_wall'] == 'Uninsulated') & (df['base_foundation_type'].isin(eligible_foundations))
            ],
            'tech_eff_pairs': [
                ('Rim Joist Insulation: Wall R-10, Exterior', 'Uninsulated')
            ],
        }

    raise ValueError(f"Invalid retrofit_col specified: {retrofit_col}")

# UPDATED AUGUST 22, 2024 @ 7:00 PM
def calculate_enclosure_retrofit_upgradeCosts(df, cost_dict, retrofit_col, params_col, rsMeans_national_avg):
    """
    Sample a location-adjusted enclosure retrofit cost for every row the retrofit applies to.

    The applicable rows and their (technology, efficiency) labels come from
    get_enclosure_parameters(). For each applicable row a unit cost is drawn from a
    normal distribution whose mean is the 'reference' cost and whose standard deviation
    is derived from the 'progressive' and 'conservative' costs (treated as the 10th and
    90th percentiles). The draw is multiplied by the row's `params_col` value and by the
    ratio of the row's 'rsMeans_CCI_avg' to `rsMeans_national_avg`.

    Parameters:
    df (pd.DataFrame): Input data; it is copied, never modified. Must contain `params_col`,
        'rsMeans_CCI_avg' and whatever columns get_enclosure_parameters() reads for `retrofit_col`.
    cost_dict (dict): Maps (technology, efficiency) tuples to dicts holding the keys
        'normalized_cost_progressive', 'normalized_cost_reference' and 'normalized_cost_conservative'.
    retrofit_col (str): Name of the output cost column; also selects the retrofit rules.
        - Rows the retrofit does not apply to end up with NaN in this column.
    params_col (str): Column whose value scales the sampled unit cost.
    rsMeans_national_avg (float): National average used to normalise 'rsMeans_CCI_avg'.

    Returns:
    pd.DataFrame: Copy of `df` in which `retrofit_col` holds the sampled cost rounded to
        2 decimals (a pre-existing `retrofit_col` is replaced).

    Raises:
    ValueError: If an applicable (technology, efficiency) pair lacks any of the three cost values.
    """
    working_df = df.copy()

    # Row masks and the (technology, efficiency) labels that go with each mask
    enclosure_params = get_enclosure_parameters(working_df, retrofit_col)
    row_conditions = enclosure_params['conditions']
    label_pairs = enclosure_params['tech_eff_pairs']

    # Label every row; rows matching no condition are tagged 'unknown'
    tech_labels = [labels[0] for labels in label_pairs]
    eff_labels = [labels[1] for labels in label_pairs]
    tech = np.select(row_conditions, tech_labels, default='unknown')
    eff = np.select(row_conditions, eff_labels, default='unknown')

    # Keep only the rows the retrofit applies to
    applicable = tech != 'unknown'
    tech, eff = tech[applicable], eff[applicable]
    applicable_df = working_df.loc[applicable].copy()

    component = 'normalized_cost'

    def _scenario_costs(scenario):
        # One cost per applicable row; NaN when the pair or the scenario key is absent
        field = f'{component}_{scenario}'
        return np.array([cost_dict.get((t, e), {}).get(field, np.nan) for t, e in zip(tech, eff)])

    progressive_costs = _scenario_costs('progressive')
    reference_costs = _scenario_costs('reference')
    conservative_costs = _scenario_costs('conservative')

    # Report and abort if any scenario cost could not be found
    missing_mask = np.isnan(progressive_costs) | np.isnan(reference_costs) | np.isnan(conservative_costs)
    if missing_mask.any():
        missing_indices = np.where(missing_mask)
        print(f"Missing data at indices: {missing_indices}")
        print(f"Tech with missing data: {tech[missing_indices]}")
        print(f"Efficiencies with missing data: {eff[missing_indices]}")

        raise ValueError(f"Missing cost data for some technology and efficiency combinations in cost_component {component}")

    # progressive / reference / conservative are read as the 10th / 50th / 90th percentiles of a normal distribution
    percentile_span = norm.ppf(0.90) - norm.ppf(0.10)
    std_costs = (conservative_costs - progressive_costs) / percentile_span
    sampled_unit_costs = np.random.normal(loc=reference_costs, scale=std_costs)

    # Scale by the sizing parameter, then by the local-to-national cost index ratio
    scaled_cost = sampled_unit_costs * applicable_df[params_col]
    location_factor = applicable_df['rsMeans_CCI_avg'] / rsMeans_national_avg
    retrofit_cost = scaled_cost * location_factor

    cost_frame = pd.DataFrame({retrofit_col: np.round(retrofit_cost, 2)}, index=applicable_df.index)

    # Remove a stale copy of the output column so the join below cannot collide with it
    stale_columns = cost_frame.columns.intersection(working_df.columns)
    if len(stale_columns) > 0:
        working_df = working_df.drop(columns=stale_columns)

    # Left join on the index: rows outside `applicable_df` receive NaN
    return working_df.join(cost_frame, how='left')

# Storing Output Results and Data Visualization

## Save Results: Merge DFs and Export to CSV

In [28]:
def clean_df_merge(df_compare, df_results_IRA, df_results_IRA_gridDecarb):
    """
    Inner-merge three result DataFrames on 'bldg_id' without producing duplicate columns.

    Before each merge, every column the right-hand frame shares with the left-hand frame
    (other than the 'bldg_id' key) is dropped from the right-hand frame, so the
    left-hand values win. The inputs themselves are not modified.

    Parameters:
    df_compare (pd.DataFrame): Left-most frame; all of its columns are kept.
    df_results_IRA (pd.DataFrame): Merged second.
    df_results_IRA_gridDecarb (pd.DataFrame): Merged last.

    Returns:
    pd.DataFrame: Rows whose 'bldg_id' appears in all three inputs.
    """
    def _merge_dropping_shared(left, right):
        # Columns present on both sides, excluding the merge key
        shared = set(left.columns) & set(right.columns)
        shared.discard('bldg_id')
        right_only = right.drop(columns=shared)
        print(f"Dropped the following duplicate columns before merge: \n    {shared}\n    ")
        return pd.merge(left, right_only, on='bldg_id', how='inner')

    merged_df = _merge_dropping_shared(df_compare, df_results_IRA)
    df_results_export = _merge_dropping_shared(merged_df, df_results_IRA_gridDecarb)

    print("Dataframes have been cleaned of duplicate columns and merged successfully. Ready to export!")
    return df_results_export

In [None]:
def export_model_run_output(df_results_export, results_category, menu_mp):
    """
    Export a results DataFrame to a CSV file inside the model-run output folder.

    The destination is <output_folder_path>/<sub-folder>/<file name>, where the sub-folder
    and file name depend on `results_category` and `menu_mp`. The sub-folder is created if
    it does not exist yet.

    Reads the notebook-level names `output_folder_path`, `location_id` and
    `results_export_formatted_date`, which must be defined before this function is called.

    Parameters:
    df_results_export (pd.DataFrame): DataFrame containing data for different scenarios.
    results_category (str): Determines the type of info being exported.
        - 'summary' writes to a per-measure-package summary folder.
        - Any other value (e.g. 'consumption', 'damages', 'fuelCost') writes to
          'supplemental_data_<results_category>'.
    menu_mp (int or str): Measure package or retrofit being conducted. For 'summary' exports
        only 0, 8, 9 and 10 (or their string forms) are recognised.

    Returns:
    None. When `results_category` is 'summary' and `menu_mp` is not recognised, a message is
    printed and nothing is written.
    """
    separator = "-------------------------------------------------------------------------------------------------------"

    # Sub-folder used for 'summary' exports, keyed by measure package
    summary_directories = {
        '0': "baseline_summary",
        '8': "retrofit_basic_summary",
        '9': "retrofit_moderate_summary",
        '10': "retrofit_advanced_summary",
    }

    print(separator)
    if results_category == 'summary':
        # Accept the measure package either as a string ('8') or as a number (8)
        mp_key = next((key for key in summary_directories if menu_mp == key or menu_mp == int(key)), None)

        if mp_key is None:
            # Nothing sensible to export: stop here instead of failing on undefined names below
            print("No matching scenarios for this Measure Package (MP)")
            print(separator, "\n")
            return

        results_change_directory = summary_directories[mp_key]
        if mp_key == '0':
            # Baseline model run results
            results_filename = f"baseline_results_{location_id}_{results_export_formatted_date}.csv"
            print(f"BASELINE RESULTS:")
        else:
            # Measure Package model run results
            results_filename = f"mp{menu_mp}_results_{location_id}_{results_export_formatted_date}.csv"
            print(f"MEASURE PACKAGE {menu_mp} (MP{menu_mp}) RESULTS:")
        print(f"Dataframe results will be saved in this csv file: {results_filename}")

    # This includes exported dataframes for calculated consumption, damages, and fuel costs
    else:
        results_filename = f"mp{menu_mp}_data_{results_category}_{location_id}_{results_export_formatted_date}.csv"
        print(f"SUPPLEMENTAL INFORMATION DATAFRAME: {results_category}")
        print(f"Dataframe results will be saved in this csv file: {results_filename}")
        results_change_directory = f"supplemental_data_{results_category}"

    # Export dataframe results as a csv to the specified filepath
    results_export_filepath = os.path.join(output_folder_path, results_change_directory, results_filename)
    # Create the containing folder only; creating the full path would turn the CSV name into a directory
    os.makedirs(os.path.dirname(results_export_filepath), exist_ok=True)
    df_results_export.to_csv(results_export_filepath)
    print(f"Dataframe for MP{menu_mp} {results_category} results were exported here: {results_export_filepath}")
    print(separator, "\n")

## Convert Results Output CSVs to Dataframes

In [30]:
def load_scenario_data(end_use, output_folder_path, scenario_string, model_run_date_time, columns_to_string):
    """
    Load the exported results CSV for one policy scenario and model run.

    Looks in <output_folder_path>/<scenario_string> for files whose name contains
    `model_run_date_time` and reads the first match (in sorted name order) with the
    first CSV column as the index.

    Parameters:
    end_use (str): Label used only in the printed status messages.
    output_folder_path (str): Folder that contains one sub-folder per scenario.
    scenario_string (str): Name of the scenario sub-folder.
    model_run_date_time (str): Text that must appear in the file name.
    columns_to_string: Passed to `pd.read_csv` as `dtype`.

    Returns:
    pd.DataFrame or None: The loaded data, or None when no file name matches.

    Raises:
    FileNotFoundError: If the scenario folder does not exist (raised by `os.listdir`).
    """
    # Construct the output folder path with the policy_scenario of interest
    scenario_folder_path = os.path.join(output_folder_path, scenario_string)
    print(f"Output Results Folder Path: {scenario_folder_path}")

    # os.listdir returns names in arbitrary order, so sort to make the chosen file reproducible
    matching_files = sorted(
        f for f in os.listdir(scenario_folder_path)
        if model_run_date_time in f and os.path.isfile(os.path.join(scenario_folder_path, f))
    )

    if not matching_files:
        print(f"No {end_use} data found for policy_scenario '{scenario_string}'")
        return None

    # One main file per policy_scenario is expected; say so when that assumption does not hold
    if len(matching_files) > 1:
        print(f"Multiple files match '{model_run_date_time}'; loading the first in sorted order: {matching_files[0]}")

    file_path = os.path.join(scenario_folder_path, matching_files[0])
    df_outputs = pd.read_csv(file_path, index_col=0, dtype=columns_to_string)
    print(f"Loaded {end_use} data for policy_scenario '{scenario_string}'", "\n")

    return df_outputs

## Visuals for Public and Private Perspective

In [31]:
# Named color for each base-fuel label. Read by create_subplot_histogram (bar colors)
# and create_subplot_grid_histogram (figure-level legend entries).
# TODO: possibly update colors to make them more color-blind accessible.
color_map_fuel = {
    'Electricity': 'seagreen',
    'Natural Gas': 'steelblue',
    'Propane': 'orange',
    'Fuel Oil': 'firebrick',
}

# Define a function to plot the histogram and percentile subplot
def create_subplot_histogram(ax, df, x_col, bin_number, x_label=None, y_label=None, lower_percentile=2.5, upper_percentile=97.5, color_code='base_fuel', statistic='count', include_zero=False, show_legend=False):
    """
    Draw a stacked histogram of `x_col`, colored by `color_code`, on the given axis.

    Values outside the [lower_percentile, upper_percentile] range of `x_col` are left out,
    and the x-axis is limited to that same range. Only categories of `color_code` that
    have an entry in the module-level `color_map_fuel` are listed in the hue order.

    Parameters:
    ax (matplotlib Axes): Axis to draw on.
    df (pd.DataFrame): Source data; it is copied, never modified.
    x_col (str): Column to histogram.
    bin_number (int): Passed to seaborn as `bins`.
    x_label (str, optional): x-axis label; the axis label is left as is when None.
    y_label (str, optional): y-axis label; the axis label is left as is when None.
    lower_percentile (float): Lower cut-off, in percent.
    upper_percentile (float): Upper cut-off, in percent.
    color_code (str): Column used for the hue.
    statistic (str): Passed to seaborn as `stat`.
    include_zero (bool): When False, zeros in `x_col` are treated as missing.
    show_legend (bool): Passed to seaborn as `legend`.

    Returns:
    None
    """
    df_copy = df.copy()

    if not include_zero:
        df_copy[x_col] = df_copy[x_col].replace(0, np.nan)

    lower_limit = df_copy[x_col].quantile(lower_percentile / 100)
    upper_limit = df_copy[x_col].quantile(upper_percentile / 100)

    valid_data = df_copy[x_col][(df_copy[x_col] >= lower_limit) & (df_copy[x_col] <= upper_limit)]

    # Fuel categories present in the data that have a known color, in order of appearance
    hue_order = [fuel for fuel in df_copy[color_code].unique() if fuel in color_map_fuel]

    # Key the palette by category so each fuel keeps its own color even when the data
    # also contains categories (or NaN) that are absent from color_map_fuel
    palette = {fuel: color_map_fuel[fuel] for fuel in hue_order}

    ax = sns.histplot(data=df_copy, x=valid_data, kde=False, bins=bin_number, hue=color_code, hue_order=hue_order, stat=statistic, multiple="stack", palette=palette, ax=ax, legend=show_legend)

    if x_label is not None:
        ax.set_xlabel(x_label, fontsize=22)  # Set font size for x-axis label

    if y_label is not None:
        ax.set_ylabel(y_label, fontsize=22)  # Set font size for y-axis label

    ax.set_xlim(left=lower_limit, right=upper_limit)

    # Set font size for tick labels
    ax.tick_params(axis='both', labelsize=22)

    sns.despine()

def create_subplot_grid_histogram(df, subplot_positions, x_cols, x_labels, y_label=None, bin_number=20, lower_percentile=2.5, upper_percentile=97.5, statistic='count', color_code='base_fuel', include_zero=False, suptitle=None, sharex=False, sharey=False, column_titles=None, show_legend=True, figure_size=(12, 10), export_filename=None, export_format='png', dpi=300):
    """
    Draw a grid of fuel-colored histograms, one per entry of `subplot_positions`.

    Each histogram is drawn by create_subplot_histogram(). A single legend built from the
    module-level `color_map_fuel` is placed below the grid. The figure is either saved or shown.

    Parameters:
    df (pd.DataFrame): Data shared by all subplots.
    subplot_positions (list of (row, col)): Zero-based grid position of each histogram; the
        grid size is inferred from the largest row and column index.
    x_cols (list of str): Column to histogram in each subplot.
    x_labels (list of str): x-axis label of each subplot.
    y_label (str, optional): y-axis label applied to every subplot.
    bin_number, lower_percentile, upper_percentile, statistic, color_code, include_zero,
    show_legend: Forwarded unchanged to create_subplot_histogram().
    suptitle (str, optional): Title for the whole figure.
    sharex, sharey (bool): Passed to `plt.subplots`; with `sharey`, y tick labels are removed
        from every column except the first.
    column_titles (list of str, optional): Titles placed above the first row, one per column.
    figure_size (tuple): Figure size passed to `plt.subplots`.
    export_filename (str, optional): When given, the figure is saved under the notebook-level
        `save_figure_directory` (which must be defined beforehand) instead of being shown.
    export_format (str): File format passed to `savefig`.
    dpi (int): Resolution passed to `savefig`.

    Returns:
    None
    """
    num_cols = max(pos[1] for pos in subplot_positions) + 1
    num_rows = max(pos[0] for pos in subplot_positions) + 1

    # squeeze=False keeps `axes` two-dimensional, so axes[row, col] also works for a
    # grid with a single row or a single column
    fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=figure_size, sharex=sharex, sharey=sharey, squeeze=False)

    # Plot one histogram per requested grid position
    for pos, x_col, x_label in zip(subplot_positions, x_cols, x_labels):
        create_subplot_histogram(
            ax=axes[pos[0], pos[1]],
            df=df,
            x_col=x_col,
            bin_number=bin_number,
            x_label=x_label,
            y_label=y_label,
            lower_percentile=lower_percentile,
            upper_percentile=upper_percentile,
            color_code=color_code,
            statistic=statistic,
            include_zero=include_zero,
            show_legend=show_legend,
        )

    # Add a super title to the entire figure if suptitle is provided
    if suptitle:
        fig.suptitle(suptitle, fontweight='bold')

    # Add titles over the columns
    if column_titles:
        for col_index, title in enumerate(column_titles):
            axes[0, col_index].set_title(title, fontsize=22, fontweight='bold')

    # If sharey is True, remove y-axis tick labels on all subplots except the leftmost one in each row
    if sharey:
        for row_index in range(num_rows):
            for col_index in range(1, num_cols):
                axes[row_index, col_index].set_yticklabels([])

    # Add a legend for the color mapping at the bottom of the entire figure
    legend_labels = list(color_map_fuel.keys())
    legend_handles = [plt.Rectangle((0, 0), 1, 1, color=color_map_fuel[label]) for label in legend_labels]
    fig.legend(legend_handles, legend_labels, loc='lower center', ncol=len(legend_labels), prop={'size': 22}, labelspacing=0.5, bbox_to_anchor=(0.5, -0.05))

    # Adjust the layout
    fig.tight_layout()

    # Export the figure if export_filename is provided
    if export_filename:
        save_figure_path = os.path.join(save_figure_directory, export_filename)
        # The legend is anchored below the figure canvas; a tight bounding box keeps it in the saved file
        fig.savefig(save_figure_path, format=export_format, dpi=dpi, bbox_inches='tight')
    # Otherwise show the plot in Jupyter Notebook
    else:
        plt.show()

In [32]:
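# # SUPERSEDED: earlier drafts of subplot_grid_co2_abatement and plot_co2_abatement, kept commented out for reference only.
# # The active versions (last updated September 19, 2024) are defined in the next cell.
# # LAST UPDATED SEPTEMBER 12, 2024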
# def subplot_grid_co2_abatement(dataframes, subplot_positions, epa_scc_values, x_cols, y_cols, hues, plot_titles=None, x_labels=None, y_labels=None, suptitle=None, figure_size=(12, 10), sharex=False, sharey=False):
#     """
#     Creates a grid of subplots to visualize CO2 abatement cost effectiveness across different datasets and scenarios.
#     """
#     num_subplots = len(subplot_positions)
#     num_cols = max(pos[1] for pos in subplot_positions) + 1
#     num_rows = max(pos[0] for pos in subplot_positions) + 1

#     fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=figure_size, sharex=sharex, sharey=sharey)
#     axes = np.array(axes).reshape(num_rows, num_cols)  # Ensure axes is always 2D

#     for idx, (df, epa_scc, x_col, y_col, hue) in enumerate(zip(dataframes, epa_scc_values, x_cols, y_cols, hues)):
#         pos = subplot_positions[idx]
#         ax = axes[pos[0], pos[1]]
#         title = plot_titles[idx] if plot_titles else ""
#         x_label = x_labels[idx] if x_labels else ""
#         y_label = y_labels[idx] if y_labels else ""

#         # Plot using the plot_co2_abatement function, passing the current axis to it
#         plot_co2_abatement(df, x_col, y_col, hue, epa_scc, ax=ax)

#         # Set custom labels and title if provided
#         ax.set_xlabel(x_label, fontweight='bold', fontsize=18)
#         ax.set_ylabel(y_label, fontweight='bold', fontsize=18)
#         ax.set_title(title, fontweight='bold', fontsize=18)

#         # Set font size for tick labels on the x-axis
#         ax.tick_params(axis='x', labelsize=18)

#         # Set font size for tick labels on the y-axis
#         ax.tick_params(axis='y', labelsize=18)

#     if suptitle:
#         plt.suptitle(suptitle, fontweight='bold')

#     # Create a consolidated legend by grabbing handles and labels from all subplots
#     handles, labels = [], []
#     for ax in axes.flatten():
#         for handle, label in zip(*ax.get_legend_handles_labels()):
#             if label not in labels:  # Avoid duplicates
#                 handles.append(handle)
#                 labels.append(label)

#     # # Add the consolidated legend outside the plots
#     # fig.legend(handles, labels, loc='lower center', ncol=5, prop={'size': 18}, labelspacing=0.25, bbox_to_anchor=(0.5, -0.01))

#     # # Adjust the layout
#     # plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust the layout to leave space for the suptitle

#     # Add the consolidated legend outside the plots
#     fig.legend(handles, labels, loc='lower center', ncol=5, prop={'size': 16}, labelspacing=0.25, handletextpad=1, columnspacing=1, bbox_to_anchor=(0.5, -0.01), bbox_transform=fig.transFigure)

#     # Fine-tune the layout adjustment if needed
#     plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjusted the rect to leave space for the suptitle and legend

#     plt.show()

# def plot_co2_abatement(df, x_col, y_col, hue, epa_scc_usd2023_per_ton, ax=None):
#     """
#     Plots a boxplot of CO2 abatement cost effectiveness.

#     Parameters:
#     - df: DataFrame containing the data.
#     - x_col: Column name for the x-axis.
#     - y_col: Column name for the y-axis.
#     - hue: Column name for the hue (categorical variable for color).
#     - epa_scc_usd2023_per_ton: Value for the red dashed line indicating SCC.
#     - ax: Axis object to plot on. If None, creates a new plot.
    
#     Returns:
#     - None: Displays the plot.
#     """
#     # Filter out the 'Middle-to-Upper-Income' rows
#     df_filtered = df[df[x_col] != 'Middle-to-Upper-Income']

#     # Color map for fuel types
#     color_map_fuel = {
#         'Electricity': 'seagreen',
#         'Natural Gas': 'steelblue',
#         'Propane': 'orange',
#         'Fuel Oil': 'firebrick',
#     }

#     if ax is None:
#         ax = plt.gca()

#     # Create the boxplot
#     sns.boxplot(
#         data=df_filtered,
#         x=x_col, 
#         y=y_col, 
#         hue=hue, 
#         palette=color_map_fuel, 
#         showfliers=False,
#         width=0.8,
#         ax=ax
#     )

#     # Add a red dashed line at the value of epa_scc_usd2023_per_ton
#     ax.axhline(y=epa_scc_usd2023_per_ton, color='red', linestyle='--', linewidth=2, label=f'SCC (USD2023): ${int(round((epa_scc_usd2023_per_ton), 0))}/mtCO2e')

#     # Remove the individual legend for each subplot
#     ax.legend_.remove()

In [None]:
# LAST UPDATED SEPTEMBER 19, 2024
def subplot_grid_co2_abatement(dataframes, subplot_positions, epa_scc_values, x_cols, y_cols, hues, plot_titles=None, x_labels=None, y_labels=None, suptitle=None, figure_size=(12, 10), sharex=False, sharey=False):
    """
    Show a grid of CO2 abatement cost-effectiveness boxplots with one shared legend.

    The i-th entries of `dataframes`, `epa_scc_values`, `x_cols`, `y_cols` and `hues` are
    drawn by plot_co2_abatement() on the axis at `subplot_positions[i]`.

    Parameters:
    - dataframes: Sequence of DataFrames, one per panel.
    - subplot_positions: Sequence of zero-based (row, col) positions; the grid size is
      inferred from the largest row and column index.
    - epa_scc_values: Sequence of SCC values, one per panel.
    - x_cols, y_cols, hues: Sequences of column names, one per panel.
    - plot_titles, x_labels, y_labels: Optional sequences of per-panel text; an empty
      string is used when a sequence is not supplied.
    - suptitle: Optional title for the whole figure.
    - figure_size, sharex, sharey: Passed to `plt.subplots`.

    Returns:
    - None: Displays the figure.
    """
    num_cols = 1 + max(position[1] for position in subplot_positions)
    num_rows = 1 + max(position[0] for position in subplot_positions)

    fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=figure_size, sharex=sharex, sharey=sharey)
    # Always work with a 2D array of axes, whatever plt.subplots returned
    axes = np.array(axes).reshape(num_rows, num_cols)

    def _text_for(panel_texts, panel_index):
        # Per-panel text, or an empty string when no texts were supplied
        return panel_texts[panel_index] if panel_texts else ""

    panel_inputs = zip(dataframes, epa_scc_values, x_cols, y_cols, hues)
    for panel_index, (panel_df, panel_scc, x_col, y_col, hue) in enumerate(panel_inputs):
        grid_position = subplot_positions[panel_index]
        panel_ax = axes[grid_position[0], grid_position[1]]
        panel_title = _text_for(plot_titles, panel_index)
        panel_x_label = _text_for(x_labels, panel_index)
        panel_y_label = _text_for(y_labels, panel_index)

        # Draw the boxplot and the SCC reference line on this panel
        plot_co2_abatement(panel_df, x_col, y_col, hue, panel_scc, ax=panel_ax)

        # Panel text and tick-label sizes
        panel_ax.set_xlabel(panel_x_label, fontweight='bold', fontsize=18)
        panel_ax.set_ylabel(panel_y_label, fontweight='bold', fontsize=18)
        panel_ax.set_title(panel_title, fontweight='bold', fontsize=18)
        for axis_name in ('x', 'y'):
            panel_ax.tick_params(axis=axis_name, labelsize=18)

    if suptitle:
        plt.suptitle(suptitle, fontweight='bold')

    # Collect legend entries from every panel, keeping the first handle seen for each label
    legend_entries = {}
    for panel_ax in axes.flatten():
        panel_handles, panel_labels = panel_ax.get_legend_handles_labels()
        for handle, label in zip(panel_handles, panel_labels):
            legend_entries.setdefault(label, handle)
    handles = list(legend_entries.values())
    labels = list(legend_entries.keys())

    # One consolidated legend below the grid
    fig.legend(handles, labels, loc='lower center', ncol=5, prop={'size': 16}, labelspacing=0.25, handletextpad=1, columnspacing=1, bbox_to_anchor=(0.5, -0.05), bbox_transform=fig.transFigure)

    # Leave room for the suptitle and the legend
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])

    plt.show()

def plot_co2_abatement(df, x_col, y_col, hue, epa_scc_usd2023_per_ton, ax=None):
    """
    Draws a boxplot of CO2 abatement cost effectiveness with an SCC reference line.

    Rows whose `x_col` value is 'Middle-to-Upper-Income' are excluded. The input
    DataFrame is not modified.

    Parameters:
    - df: DataFrame containing the data.
    - x_col: Column name for the x-axis.
    - y_col: Column name for the y-axis.
    - hue: Column name for the hue (categorical variable for color). Its values must be
      keys of the fuel color map defined in this function.
    - epa_scc_usd2023_per_ton: Value for the red dashed line indicating SCC.
    - ax: Axis object to plot on. If None, the current axis (`plt.gca()`) is used.

    Returns:
    - None: Draws on the axis; the caller is responsible for showing the figure.
    """
    # Filter out the 'Middle-to-Upper-Income' rows, then copy, so the column assignment
    # below writes to an independent frame rather than to a slice of `df`
    df_filtered = df[df[x_col] != 'Middle-to-Upper-Income'].copy()

    # If x_col is categorical, drop the categories that no longer occur. Assigning the whole
    # column (instead of `.loc[:, x_col] = ...`) replaces its dtype rather than attempting an
    # in-place write into the old categorical
    if df_filtered[x_col].dtype.name == 'category':
        df_filtered[x_col] = df_filtered[x_col].cat.remove_unused_categories()

    # Color map for fuel types
    color_map_fuel = {
        'Electricity': 'seagreen',
        'Natural Gas': 'steelblue',
        'Propane': 'orange',
        'Fuel Oil': 'firebrick',
    }

    if ax is None:
        ax = plt.gca()

    # Create the boxplot
    sns.boxplot(
        data=df_filtered,
        x=x_col,
        y=y_col,
        hue=hue,
        palette=color_map_fuel,
        showfliers=False,
        width=0.8,
        ax=ax
    )

    # Add a red dashed line at the value of epa_scc_usd2023_per_ton
    ax.axhline(y=epa_scc_usd2023_per_ton, color='red', linestyle='--', linewidth=2, label=f'SCC (USD2023): ${int(round((epa_scc_usd2023_per_ton), 0))}/mtCO2e')

    # Remove the individual legend for each subplot (there may be none to remove)
    subplot_legend = ax.get_legend()
    if subplot_legend is not None:
        subplot_legend.remove()

# Adoption Rate Scenario Comparison

In [None]:
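# # SUPERSEDED: earlier draft of create_df_adoption (without the `category` argument), kept commented out for reference only.
# # The active version (last updated September 14, 2024) is defined in the next cell.
# # LAST UPDATED SEPTEMBER 9, 2024 @ 12:45 AM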
# def create_df_adoption(df, menu_mp):
#     """
#     Generates a new DataFrame with specific adoption columns based on provided parameters.
    
#     Args:
#     df (pd.DataFrame): Original DataFrame.
#     menu_mp (int): Measure package identifier.

#     Returns:
#     pd.DataFrame: A DataFrame with the selected columns.
#     """    
#     # Create a copy of the dataframe
#     df_copy = df.copy()

#     # Begin df with these cols
#     df_copy['scc_usd2023_per_ton'] = np.round(epa_scc_usd2023_per_ton, 2)

#     summary_cols = ['bldg_id', 'state', 'city', 'county', 'puma', 'percent_AMI', 'lowModerateIncome_designation', 'scc_usd2023_per_ton']

#     # for category in ['heating', 'waterHeating', 'clothesDrying', 'cooking']:
#     for category in ['heating', 'waterHeating']:
#         df_copy[f'iraRef_{category}_usd2023_per_mtCO2e'] = round((df_copy[f'mp{menu_mp}_{category}_rebate_amount'] / df_copy[f'iraRef_mp{menu_mp}_{category}_avoided_tons_co2e']), 2)
        
#         cols_to_add = [
#             f'base_{category}_fuel',
#             f'mp{menu_mp}_{category}_rebate_amount', 
#             f'iraRef_mp{menu_mp}_{category}_avoided_tons_co2e', 
#             f'iraRef_{category}_usd2023_per_mtCO2e',
#             f'iraRef_mp{menu_mp}_{category}_public_npv',
#             f'iraRef_mp{menu_mp}_{category}_private_npv_lessWTP', 
#             f'iraRef_mp{menu_mp}_{category}_total_capitalCost', 
#             f'iraRef_mp{menu_mp}_{category}_private_npv_moreWTP', 
#             f'iraRef_mp{menu_mp}_{category}_net_capitalCost',
#             f'iraRef_mp{menu_mp}_{category}_adoption'
#         ]
        
#         # Use extend instead of append to add each element of cols_to_add to summary_cols
#         summary_cols.extend(cols_to_add)
        
#     # Select the relevant columns
#     df_copy = df_copy[summary_cols]

#     return df_copy

In [None]:
# LAST UPDATED SEPTEMBER 14, 2024 @ 1:00 AM
def create_df_adoption(df, menu_mp, category):
    """
    Build the adoption summary table for one measure package and one end-use category.

    Two columns are derived on a copy of `df` before the selection is made:
    - 'scc_usd2023_per_ton': the notebook-level `epa_scc_usd2023_per_ton`, rounded to 2 decimals.
    - 'iraRef_<category>_usd2023_per_mtCO2e': rebate amount divided by the iraRef avoided
      tons of CO2e, rounded to 2 decimals.

    Args:
    df (pd.DataFrame): Original DataFrame; it is not modified.
    menu_mp (int): Measure package identifier used in the 'mp<menu_mp>_...' column names.
    category (str): End-use category used in the column names (e.g. 'heating').

    Returns:
    pd.DataFrame: The identification columns, followed by the base fuel, the preIRA
    columns, the rebate amount and the iraRef columns for this package and category.
    """
    adoption_df = df.copy()
    adoption_df['scc_usd2023_per_ton'] = np.round(epa_scc_usd2023_per_ton, 2)

    # Shared pieces of the scenario column names
    mp_category = f'mp{menu_mp}_{category}'
    rebate_col = f'{mp_category}_rebate_amount'
    ira_avoided_col = f'iraRef_{mp_category}_avoided_tons_co2e'
    usd_per_ton_col = f'iraRef_{category}_usd2023_per_mtCO2e'

    # Rebate dollars spent per ton of CO2e avoided under the IRA reference scenario
    adoption_df[usd_per_ton_col] = (adoption_df[rebate_col] / adoption_df[ira_avoided_col]).round(2)

    identification_cols = ['bldg_id', 'state', 'city', 'county', 'puma', 'percent_AMI', 'lowModerateIncome_designation', 'scc_usd2023_per_ton']

    pre_ira_suffixes = (
        'avoided_tons_co2e',
        'public_npv',
        'private_npv_lessWTP',
        'total_capitalCost',
        'private_npv_moreWTP',
        'net_capitalCost',
        'adoption',
    )
    pre_ira_cols = [f'preIRA_{mp_category}_{suffix}' for suffix in pre_ira_suffixes]

    ira_suffixes = (
        'public_npv',
        'additional_public_benefit',
        'private_npv_lessWTP',
        'total_capitalCost',
        'private_npv_moreWTP',
        'net_capitalCost',
        'adoption',
    )
    ira_cols = [ira_avoided_col, usd_per_ton_col] + [f'iraRef_{mp_category}_{suffix}' for suffix in ira_suffixes]

    selected_cols = identification_cols + [f'base_{category}_fuel'] + pre_ira_cols + [rebate_col] + ira_cols

    return adoption_df[selected_cols]

In [None]:
# UPDATED SEPTEMBER 14, 2024 @ 5:00 PM
import pandas as pd

def filter_columns(df):
    """
    Keep only the adoption-tier and total-adoption columns of `df`.

    A column is kept when the second element of its key (`col[1]`) contains one of the
    tier / total labels listed below. The keys are presumably two-level MultiIndex
    tuples of (adoption column, label) - confirm against the caller.

    Args:
    df (pd.DataFrame): DataFrame whose column keys can be indexed with `[1]`.

    Returns:
    pd.DataFrame: The matching columns, in their original order.
    """
    wanted_labels = (
        'Tier 1: Feasible',
        'Tier 2: Feasible vs. Alternative',
        'Tier 3: Subsidy-Dependent Feasibility',
        'Total Adoption Potential',
        'Total Adoption Potential (Additional Subsidy)',
    )

    selected_columns = []
    for column_key in df.columns:
        second_level = column_key[1]
        # Substring match against every wanted label
        if any(label in second_level for label in wanted_labels):
            selected_columns.append(column_key)

    return df.loc[:, selected_columns]

def create_multiIndex_adoption_df(df, menu_mp, category):
    """
    Tabulate adoption-tier percentages by baseline fuel and income designation.

    Parameters:
    - df (pd.DataFrame): Must contain f'base_{category}_fuel',
      'lowModerateIncome_designation' and the two adoption columns
      f'preIRA_mp{menu_mp}_{category}_adoption' / f'iraRef_mp{menu_mp}_{category}_adoption'.
    - menu_mp: Measure-package identifier interpolated into the column names.
    - category (str): Equipment category interpolated into the column names.

    Returns:
    pd.DataFrame indexed by (fuel, income designation) with (scenario, tier)
    columns holding whole-number percentages, plus two summed "Total Adoption
    Potential" columns per scenario.
    """
    income_order = ['Low-Income', 'Moderate-Income', 'Middle-to-Upper-Income']
    tier_1 = 'Tier 1: Feasible'
    tier_2 = 'Tier 2: Feasible vs. Alternative'
    tier_3 = 'Tier 3: Subsidy-Dependent Feasibility'
    total_label = 'Total Adoption Potential'
    total_subsidy_label = 'Total Adoption Potential (Additional Subsidy)'

    fuel_key = f'base_{category}_fuel'
    income_key = 'lowModerateIncome_designation'
    scenario_cols = [f'{prefix}{menu_mp}_{category}_adoption' for prefix in ('preIRA_mp', 'iraRef_mp')]

    # NOTE(review): this overwrites the income column on the caller's frame in place;
    # labels outside income_order become NaN. Confirm callers expect that side effect.
    df[income_key] = pd.Categorical(df[income_key], categories=income_order, ordered=True)

    def _tier_shares(group_frame):
        # Share of each adoption label within one (fuel, income) group, per scenario column.
        return group_frame.apply(lambda series: series.value_counts(normalize=True))

    grouped = df.groupby([fuel_key, income_key], observed=False)[scenario_cols]
    percentages_df = grouped.apply(_tier_shares).unstack().fillna(0) * 100
    percentages_df = percentages_df.round(0)

    for scenario in scenario_cols:
        # Guarantee all three tier columns exist so the sums below never hit a missing key.
        for tier in (tier_1, tier_2, tier_3):
            if (scenario, tier) not in percentages_df.columns:
                percentages_df[(scenario, tier)] = 0

        # Totals are sums of the already-rounded tier percentages.
        tiers_1_and_2 = percentages_df[(scenario, tier_1)] + percentages_df[(scenario, tier_2)]
        percentages_df[(scenario, total_label)] = tiers_1_and_2
        percentages_df[(scenario, total_subsidy_label)] = tiers_1_and_2 + percentages_df[(scenario, tier_3)]

    # Rebuild the column MultiIndex after the column insertions above.
    percentages_df.columns = pd.MultiIndex.from_tuples(percentages_df.columns)

    # Drop every column that is not a tier or a total.
    filtered_df = filter_columns(percentages_df)

    ordered_suffixes = (tier_1, tier_2, tier_3, total_label, total_subsidy_label)
    new_order = [
        (scenario, suffix)
        for scenario in scenario_cols
        for suffix in ordered_suffixes
        if (scenario, suffix) in filtered_df.columns
    ]

    if not new_order:
        print("Warning: No matching columns found for reordering")
        return filtered_df

    # Pre-IRA columns first, then IRA-reference, each in tier order.
    filtered_df = filtered_df.loc[:, pd.MultiIndex.from_tuples(new_order)]
    filtered_df.sort_index(level=[fuel_key, income_key], inplace=True)

    return filtered_df

# Usage example (assuming df_basic_adoption_heating is properly formatted and loaded):
# df_multiIndex_heating_adoption = create_multiIndex_adoption_df(df_basic_adoption_heating, 8, 'heating')
# df_multiIndex_heating_adoption

In [None]:
# import pandas as pd

# def filter_columns(df):
#     keep_columns = [col for col in df.columns if 'Tier 1: Feasible' in col[1] or 'Tier 2: Feasible vs. Alternative' in col[1] or 'Tier 2: Feasible vs. Alternative' in col[1] or 'Tier 3: Subsidy-Dependent Feasibility' in col[1]]
#     return df.loc[:, keep_columns]

# def create_multiIndex_adoption_df(df, menu_mp, category):
#     # Explicitly set 'lowModerateIncome_designation' as a categorical type with order
#     income_categories = ['Low-Income', 'Moderate-Income', 'Middle-to-Upper-Income']

#     df['lowModerateIncome_designation'] = pd.Categorical(df['lowModerateIncome_designation'], categories=income_categories, ordered=True)
    
#     # Define the columns for adoption data
#     adoption_cols = [f'preIRA_mp{menu_mp}_{category}_adoption', 
#                      f'iraRef_mp{menu_mp}_{category}_adoption']

#     # Group by f'base_{category}_fuel' and 'lowModerateIncome_designation', calculate normalized counts
#     percentages_df = df.groupby([f'base_{category}_fuel', 'lowModerateIncome_designation'], observed=False)[adoption_cols].apply(
#         lambda x: x.apply(lambda y: y.value_counts(normalize=True))).unstack().fillna(0) * 100
#     percentages_df = percentages_df.round(2)

#     # Ensure 'Tier 1: Feasible' columns exist, set to 0 if they don't
#     for column in adoption_cols:
#         if (column, 'Tier 1: Feasible') not in percentages_df.columns:
#             percentages_df[(column, 'Tier 1: Feasible')] = 0
#         if (column, 'Tier 2: Feasible vs. Alternative') not in percentages_df.columns:
#             percentages_df[(column, 'Tier 2: Feasible vs. Alternative')] = 0
#         if (column, 'Tier 3: Subsidy-Dependent Feasibility') not in percentages_df.columns:
#             percentages_df[(column, 'Tier 3: Subsidy-Dependent Feasibility')] = 0


#     # Create 'Total Adoption with Subsidy' by combining related columns
#     for column in adoption_cols:
#         percentages_df[(column, 'Total Adoption with Subsidy')] = percentages_df[(column, 'Tier 1: Feasible')] + percentages_df.get((column, 'Tier 2: Feasible vs. Alternative'), 0) + percentages_df.get((column, 'Tier 3: Subsidy-Dependent Feasibility'), 0)

#     # Rebuild the column MultiIndex
#     percentages_df.columns = pd.MultiIndex.from_tuples(percentages_df.columns)
    
#     # Filter DataFrame to keep relevant columns only
#     filtered_df = filter_columns(percentages_df)

#     # Dynamically build the new column order based on existing columns
#     new_order = []
#     for prefix in ['preIRA_mp', 'iraRef_mp']:
#         for suffix in ['Tier 1: Feasible', 'Tier 2: Feasible vs. Alternative', 'Tier 3: Subsidy-Dependent Feasibility', 'Total Adoption with Subsidy']:
#             col = (f'{prefix}{menu_mp}_{category}_adoption', suffix)
#             if col in filtered_df.columns:
#                 new_order.append(col)

#     # Check if new_order is empty before reordering columns
#     if new_order:
#         # Reorder columns based on new_order
#         filtered_df = filtered_df.loc[:, pd.MultiIndex.from_tuples(new_order)]
                    
#         # Sort DataFrame by the entire index
#         filtered_df.sort_index(level=[f'base_{category}_fuel', 'lowModerateIncome_designation'], inplace=True)
#     else:
#         print("Warning: No matching columns found for reordering")

#     return filtered_df

# # Usage example (assuming df_basic_adoption_heating is properly formatted and loaded):
# # df_multiIndex_heating_adoption = create_multiIndex_adoption_df(df_basic_adoption_heating, 8, 'heating')
# # df_multiIndex_heating_adoption

In [None]:
# # LAST UPDATED SEPTEMBER 6, 2024

# import matplotlib.pyplot as plt
# import numpy as np

# def subplot_grid_adoption_vBar(dataframes, scenarios_list, subplot_positions, filter_fuel=None, x_labels=None, plot_titles=None, y_labels=None, suptitle=None, figure_size=(12, 10), sharex=False, sharey=False):
#     """
#     Creates a grid of subplots to visualize adoption rates across different scenarios, with an option to plot specific data related to adoption.
#     """
#     num_subplots = len(subplot_positions)
#     num_cols = max(pos[1] for pos in subplot_positions) + 1
#     num_rows = max(pos[0] for pos in subplot_positions) + 1

#     fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=figure_size, sharex=sharex, sharey=sharey)
#     axes = np.array(axes).reshape(num_rows, num_cols)  # Ensure axes is always 2D

#     for idx, (df, scenarios) in enumerate(zip(dataframes, scenarios_list)):
#         # Apply the filter_fuel if provided
#         if filter_fuel:
#             df = df.loc[(df.index.get_level_values('base_fuel').isin(filter_fuel)), :]
        
#         pos = subplot_positions[idx]
#         ax = axes[pos[0], pos[1]]
#         x_label = x_labels[idx] if x_labels else ""
#         y_label = y_labels[idx] if y_labels else ""
#         title = plot_titles[idx] if plot_titles else ""

#         plot_adoption_rate_bar(df, scenarios, title, x_label, y_label, ax)

#     if suptitle:
#         plt.suptitle(suptitle, fontweight='bold')

#     # Define the relevant tiers to display in the legend
#     relevant_tiers = [
#         'Tier 1: Feasible',
#         'Tier 2: Feasible vs. Alternative',
#         'Tier 3: Subsidy-Dependent Feasibility'
#     ]

#     # Add a legend for only the relevant tiers
#     legend_handles = [plt.Rectangle((0, 0), 1, 1, color=color_mapping[label]) for label in relevant_tiers]
#     fig.legend(legend_handles, relevant_tiers, loc='lower center', ncol=len(relevant_tiers), prop={'size': 20}, labelspacing=0.5, bbox_to_anchor=(0.5, -0.05))

#     # Adjust the layout
#     plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust the layout to leave space for the suptitle
#     plt.show()

# def plot_adoption_rate_bar(df, scenarios, title, x_label, y_label, ax):
#     # Assume the DataFrame 'df' has a suitable structure, similar to earlier examples
#     adoption_data = df.loc[:, df.columns.get_level_values(1).isin(['Tier 1: Feasible', 'Tier 2: Feasible vs. Alternative', 'Tier 3: Subsidy-Dependent Feasibility'])]
#     adoption_data.columns = adoption_data.columns.remove_unused_levels()

#     # Define the color mapping as specified
#     global color_mapping
#     color_mapping = {
#         'Tier 1: Feasible': 'steelblue',
#         'Tier 2: Feasible vs. Alternative': 'lightblue',
#         'Tier 3: Subsidy-Dependent Feasibility': 'lightsalmon'
#     }

#     # Plotting logic
#     n = len(adoption_data.index)
#     bar_width = 0.35  # Width of bars
#     index = list(range(n))  # Base index for bars

#     for scenario in scenarios:
#         if (scenario, 'Tier 1: Feasible') in adoption_data.columns and (scenario, 'Tier 2: Feasible vs. Alternative') in adoption_data.columns and (scenario, 'Tier 3: Subsidy-Dependent Feasibility') in adoption_data.columns:
#             tier3 = adoption_data[scenario, 'Tier 3: Subsidy-Dependent Feasibility'].values
#             tier2 = adoption_data[scenario, 'Tier 2: Feasible vs. Alternative'].values
#             tier1 = adoption_data[scenario, 'Tier 1: Feasible'].values
#             ax.bar(index, tier3, bar_width, color=color_mapping['Tier 3: Subsidy-Dependent Feasibility'], edgecolor='white')
#             ax.bar(index, tier2, bar_width, color=color_mapping['Tier 2: Feasible vs. Alternative'], edgecolor='white')
#             ax.bar(index, tier1, bar_width, color=color_mapping['Tier 1: Feasible'], edgecolor='white')
#             index = [i + bar_width for i in index]

#     ax.set_xlabel(x_label, fontweight='bold', fontsize=20)
#     ax.set_ylabel(y_label, fontweight='bold', fontsize=20)
#     ax.set_title(title, fontweight='bold', fontsize=20)
#     ax.set_xticks([i + bar_width / 2 for i in range(n)])
#     ax.set_xticklabels([f'{name[1]}' for name in adoption_data.index.tolist()], rotation=90, ha='right')

#     # Set font size for tick labels on the x-axis
#     ax.tick_params(axis='x', labelsize=20)

#     # Set font size for tick labels on the y-axis
#     ax.tick_params(axis='y', labelsize=20)

In [None]:
# # UPDATED SEPTEMBER 14, 2024 @ 12:46 AM
# def subplot_grid_adoption_vBar(dataframes, scenarios_list, subplot_positions, filter_fuel=None, x_labels=None, plot_titles=None, y_labels=None, suptitle=None, figure_size=(12, 10), sharex=False, sharey=False):
#     """
#     Creates a grid of subplots to visualize adoption rates across different scenarios, with an option to plot specific data related to adoption.

#     Parameters:
#     - dataframes (list of pd.DataFrame): List of pandas DataFrames, each DataFrame is assumed to be formatted for use in plot_adoption_rate_bar.
#     - scenarios_list (list of list): List of scenarios corresponding to each DataFrame.
#     - subplot_positions (list of tuples): Positions of subplots in the grid, specified as (row, col) tuples.
#     - filter_fuel (list of str, optional): List of fuel types to filter the DataFrames by 'base_fuel' column in a multi-index.
#     - x_labels (list of str, optional): Labels for the x-axis of each subplot.
#     - plot_titles (list of str, optional): Titles for each subplot.
#     - y_labels (list of str, optional): Labels for the y-axis of each subplot.
#     - suptitle (str, optional): A central title for the entire figure.
#     - figure_size (tuple, optional): Size of the entire figure (width, height) in inches.
#     - sharex (bool, optional): Whether subplots should share the same x-axis.
#     - sharey (bool, optional): Whether subplots should share the same y-axis.

#     Returns:
#     None. Displays the figure based on the provided parameters.
#     """
#     # Define the color mapping as specified
#     color_mapping = {
#         'Tier 1: Feasible': 'steelblue',
#         'Tier 2: Feasible vs. Alternative': 'lightblue',
#         'Tier 3: Subsidy-Dependent Feasibility': 'lightsalmon'
#     }

#     num_cols = max(pos[1] for pos in subplot_positions) + 1
#     num_rows = max(pos[0] for pos in subplot_positions) + 1

#     fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=figure_size, sharex=sharex, sharey=sharey)
#     axes = np.array(axes).reshape(num_rows, num_cols)  # Ensure axes is always 2D

#     for idx, (df, scenarios) in enumerate(zip(dataframes, scenarios_list)):
#         # Apply the filter_fuel if provided
#         if filter_fuel:
#             df = df.loc[(df.index.get_level_values('base_fuel').isin(filter_fuel)), :]
        
#         pos = subplot_positions[idx]
#         ax = axes[pos[0], pos[1]]
#         x_label = x_labels[idx] if x_labels else ""
#         y_label = y_labels[idx] if y_labels else ""
#         title = plot_titles[idx] if plot_titles else ""

#         plot_adoption_rate_bar(df, scenarios, title, x_label, y_label, ax)

#     if suptitle:
#         plt.suptitle(suptitle, fontweight='bold')

#     # Add a legend for the color mapping at the bottom of the entire figure
#     legend_labels = list(color_mapping.keys())
#     legend_handles = [plt.Rectangle((0, 0), 1, 1, color=color_mapping[label]) for label in legend_labels]
            
#     fig.legend(legend_handles, legend_labels, loc='lower center', ncol=len(legend_labels), prop={'size': 20}, labelspacing=0.5, bbox_to_anchor=(0.5, -0.05))

#     # Adjust the layout
#     plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust the layout to leave space for the suptitle
#     plt.show()

# def plot_adoption_rate_bar(df, scenarios, title, x_label, y_label, ax):
#     # Assume the DataFrame 'df' has a suitable structure, similar to earlier examples
#     adoption_data = df.loc[:, df.columns.get_level_values(1).isin(['Tier 1: Feasible', 'Tier 2: Feasible vs. Alternative', 'Tier 3: Subsidy-Dependent Feasibility'])]
#     adoption_data.columns = adoption_data.columns.remove_unused_levels()

#     # Define the color mapping as specified
#     global color_mapping
#     color_mapping = {
#         'Tier 1: Feasible': 'steelblue',
#         'Tier 2: Feasible vs. Alternative': 'lightblue',
#         'Tier 3: Subsidy-Dependent Feasibility': 'lightsalmon'
#     }

#     # Plotting logic
#     n = len(adoption_data.index)
#     bar_width = 0.35  # Width of bars
#     index = list(range(n))  # Base index for bars

#     for i, scenario in enumerate(scenarios):
#         if (scenario, 'Tier 1: Feasible') in adoption_data.columns and (scenario, 'Tier 2: Feasible vs. Alternative') in adoption_data.columns and (scenario, 'Tier 3: Subsidy-Dependent Feasibility') in adoption_data.columns:
#             tier1 = adoption_data[scenario, 'Tier 1: Feasible'].values
#             tier2 = adoption_data[scenario, 'Tier 2: Feasible vs. Alternative'].values
#             tier3 = adoption_data[scenario, 'Tier 3: Subsidy-Dependent Feasibility'].values

#             # Adjust the index for this scenario
#             scenario_index = np.array(index) + i * bar_width
            
#             # Plot the bars for the scenario
#             ax.bar(scenario_index, tier1, bar_width, color=color_mapping['Tier 1: Feasible'], edgecolor='white')
#             ax.bar(scenario_index, tier2, bar_width, bottom=tier1, color=color_mapping['Tier 2: Feasible vs. Alternative'], edgecolor='white')
#             ax.bar(scenario_index, tier3, bar_width, bottom=(tier1+tier2), color=color_mapping['Tier 3: Subsidy-Dependent Feasibility'], edgecolor='white')


#     ax.set_xlabel(x_label, fontweight='bold', fontsize=20)
#     ax.set_ylabel(y_label, fontweight='bold', fontsize=20)
#     ax.set_title(title, fontweight='bold', fontsize=20)
    
#     ax.set_xticks([i + bar_width / 2 for i in range(n)])
#     ax.set_xticklabels([f'{name[1]}' for name in adoption_data.index.tolist()], rotation=90, ha='right')

#     # Set font size for tick labels on the x-axis
#     ax.tick_params(axis='x', labelsize=20)

#     # Set font size for tick labels on the y-axis
#     ax.tick_params(axis='y', labelsize=20)

#     # Set y-ticks from 0 to 100 in steps of 10%
#     ax.set_yticks(np.arange(0, 101, 10))
#     ax.set_ylim(0, 100)

In [None]:
# UPDATED SEPTEMBER 14, 2024 @ 12:46 AM
def subplot_grid_adoption_vBar(dataframes, scenarios_list, subplot_positions, filter_fuel=None, x_labels=None, plot_titles=None, y_labels=None, suptitle=None, figure_size=(12, 10), sharex=False, sharey=False):
    """
    Creates a grid of subplots to visualize adoption rates across different scenarios, with an option to plot specific data related to adoption.

    Parameters:
    - dataframes (list of pd.DataFrame): List of pandas DataFrames, each DataFrame is assumed to be formatted for use in plot_adoption_rate_bar.
    - scenarios_list (list of list): List of scenarios corresponding to each DataFrame.
    - subplot_positions (list of tuples): Positions of subplots in the grid, specified as (row, col) tuples.
    - filter_fuel (list of str, optional): Fuel types to keep. Matched against the index level named 'base_fuel' or,
      when that level is absent, the first index level named 'base_<category>_fuel'.
    - x_labels (list of str, optional): Labels for the x-axis of each subplot.
    - plot_titles (list of str, optional): Titles for each subplot.
    - y_labels (list of str, optional): Labels for the y-axis of each subplot.
    - suptitle (str, optional): A central title for the entire figure.
    - figure_size (tuple, optional): Size of the entire figure (width, height) in inches.
    - sharex (bool, optional): Whether subplots should share the same x-axis.
    - sharey (bool, optional): Whether subplots should share the same y-axis.

    Returns:
    None. Displays the figure based on the provided parameters.

    Raises:
    KeyError: If filter_fuel is given and a DataFrame has no fuel index level.
    """
    # Colors used for the figure-level legend (one entry per adoption tier)
    color_mapping = {
        'Tier 1: Feasible': 'steelblue',
        'Tier 2: Feasible vs. Alternative': 'lightblue',
        'Tier 3: Subsidy-Dependent Feasibility': 'lightsalmon'
    }

    # Grid size is derived from the largest requested (row, col) position
    num_cols = max(pos[1] for pos in subplot_positions) + 1
    num_rows = max(pos[0] for pos in subplot_positions) + 1

    fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=figure_size, sharex=sharex, sharey=sharey)
    axes = np.array(axes).reshape(num_rows, num_cols)  # Ensure axes is always 2D

    for idx, (df, scenarios) in enumerate(zip(dataframes, scenarios_list)):
        # Apply the filter_fuel if provided
        if filter_fuel:
            # The fuel level is named per category (e.g. 'base_heating_fuel') in frames built by
            # create_multiIndex_adoption_df, so resolve the name instead of hard-coding 'base_fuel'.
            fuel_level = _adoption_fuel_index_level(df.index)
            df = df.loc[df.index.get_level_values(fuel_level).isin(filter_fuel), :]

        pos = subplot_positions[idx]
        ax = axes[pos[0], pos[1]]
        x_label = x_labels[idx] if x_labels else ""
        y_label = y_labels[idx] if y_labels else ""
        title = plot_titles[idx] if plot_titles else ""

        plot_adoption_rate_bar(df, scenarios, title, x_label, y_label, ax)

    if suptitle:
        fig.suptitle(suptitle, fontweight='bold')

    # Add a legend for the color mapping at the bottom of the entire figure
    legend_labels = list(color_mapping.keys())
    legend_handles = [plt.Rectangle((0, 0), 1, 1, color=color_mapping[label]) for label in legend_labels]

    fig.legend(legend_handles, legend_labels, loc='lower center', ncol=len(legend_labels), prop={'size': 20}, labelspacing=0.5, bbox_to_anchor=(0.5, -0.05))

    # Adjust the layout
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust the layout to leave space for the suptitle
    plt.show()


def _adoption_fuel_index_level(index):
    """Return the name of the index level holding the baseline fuel ('base_fuel' or 'base_<category>_fuel')."""
    level_names = [name for name in index.names if isinstance(name, str)]
    if 'base_fuel' in level_names:
        return 'base_fuel'
    for name in level_names:
        if name.startswith('base_') and name.endswith('_fuel'):
            return name
    raise KeyError(f"No fuel level ('base_fuel' or 'base_<category>_fuel') found in index levels {list(index.names)}")

def plot_adoption_rate_bar(df, scenarios, title, x_label, y_label, ax):
    """
    Draws stacked adoption-tier bars (Tier 1 at the bottom, then Tier 2, then Tier 3) for each scenario on one axis.

    Parameters:
    - df (pd.DataFrame): Frame with two-level column labels (scenario, tier); element [1] of each
      index entry is used as the x tick label.
    - scenarios (iterable of str): First-level column labels to plot side by side. A scenario is
      skipped unless all three tier columns are present for it.
    - title (str): Subplot title.
    - x_label (str): X-axis label.
    - y_label (str): Y-axis label.
    - ax: Axes-like object the bars are drawn on.

    Returns:
    None. Draws on ax and (re)assigns the module-level color_mapping.
    """
    tier_labels = ['Tier 1: Feasible', 'Tier 2: Feasible vs. Alternative', 'Tier 3: Subsidy-Dependent Feasibility']

    # Keep only the three tier columns and drop level values that no longer occur
    adoption_data = df.loc[:, df.columns.get_level_values(1).isin(tier_labels)]
    adoption_data.columns = adoption_data.columns.remove_unused_levels()

    # NOTE(review): the mapping is published as a module-level global; other cells may read it,
    # so the side effect is kept. Confirm before removing.
    global color_mapping
    color_mapping = {
        'Tier 1: Feasible': 'steelblue',
        'Tier 2: Feasible vs. Alternative': 'lightblue',
        'Tier 3: Subsidy-Dependent Feasibility': 'lightsalmon'
    }

    # Accept any iterable; the count is needed below to centre the tick labels
    scenarios = list(scenarios)

    n = len(adoption_data.index)
    bar_width = 0.35  # Width of bars
    base_positions = np.arange(n)  # One bar cluster per row of adoption_data

    for i, scenario in enumerate(scenarios):
        if not all((scenario, tier) in adoption_data.columns for tier in tier_labels):
            continue

        tier1 = adoption_data[scenario, 'Tier 1: Feasible'].values
        tier2 = adoption_data[scenario, 'Tier 2: Feasible vs. Alternative'].values
        tier3 = adoption_data[scenario, 'Tier 3: Subsidy-Dependent Feasibility'].values

        # Shift each scenario one bar width to the right of the previous one
        scenario_index = base_positions + i * bar_width

        # Stack the tiers: each tier starts where the previous ones end
        ax.bar(scenario_index, tier1, bar_width, color=color_mapping['Tier 1: Feasible'], edgecolor='white')
        ax.bar(scenario_index, tier2, bar_width, bottom=tier1, color=color_mapping['Tier 2: Feasible vs. Alternative'], edgecolor='white')
        ax.bar(scenario_index, tier3, bar_width, bottom=(tier1 + tier2), color=color_mapping['Tier 3: Subsidy-Dependent Feasibility'], edgecolor='white')

    ax.set_xlabel(x_label, fontweight='bold', fontsize=20)
    ax.set_ylabel(y_label, fontweight='bold', fontsize=20)
    ax.set_title(title, fontweight='bold', fontsize=20)

    # Centre each tick under its cluster for any number of scenarios
    # (for two scenarios this is the previous fixed offset of bar_width / 2)
    center_offset = (max(len(scenarios), 1) - 1) * bar_width / 2
    ax.set_xticks([i + center_offset for i in range(n)])
    ax.set_xticklabels([f'{name[1]}' for name in adoption_data.index.tolist()], rotation=90, ha='right')

    # Set font size for tick labels on the x-axis
    ax.tick_params(axis='x', labelsize=20)

    # Set font size for tick labels on the y-axis
    ax.tick_params(axis='y', labelsize=20)

    # Set y-ticks from 0 to 100 in steps of 10%
    ax.set_yticks(np.arange(0, 101, 10))
    ax.set_ylim(0, 100)


# Adoption Rate Percentages

In [34]:
# UPDATED ON AUGUST 23, 2024 @ 2:00 AM
def format_group_percentages(counts, group):
    """
    Formats one group's adoption-tier percentages as 'T1 x%, T2 y%, T3 z%, TAS t%'.

    Parameters:
    - counts (pd.DataFrame): Percentages with one row per group and one column per adoption decision.
    - group: Row label of counts to format (a tuple when counts has a MultiIndex).

    Returns:
    str. Tiers whose column is absent from counts are omitted. A tier whose value is missing (NaN)
    for this group is reported as 0.0%. 'TAS' is the sum of the reported tier percentages.
    """
    tier_prefixes = [
        ('Tier 1: Feasible', 'T1 '),
        ('Tier 2: Feasible vs. Alternative', 'T2 '),
        ('Tier 3: Subsidy-Dependent Feasibility', 'T3 '),
    ]

    shares = []
    for decision, decision_prefix in tier_prefixes:
        if decision not in counts.columns:
            continue
        value = counts.loc[group, decision]
        # Unstacked value_counts leaves NaN where a tier never occurs in this group;
        # that is a 0% share and must not turn the total into NaN.
        if pd.isna(value):
            value = 0.0
        shares.append((decision_prefix, value))

    # Total adoption with subsidy = Tier 1 + Tier 2 + Tier 3 (whichever are present)
    total_adoption_with_subsidy = sum(value for _, value in shares)

    formatted_percentages = ', '.join(f"{decision_prefix}{value:.1f}%" for decision_prefix, value in shares)
    formatted_percentages += f", TAS {total_adoption_with_subsidy:.1f}%"
    return formatted_percentages

def print_combined_adoption_decision_percentages(dataframes, data_columns, groups, groupby1, groupby2=None, filter_fuel=None):
    """
    Prints adoption-tier percentages, overall and per group, with one ' | '-separated entry per DataFrame.

    Parameters:
    - dataframes (list of pd.DataFrame): Frames to summarize.
    - data_columns (list of str): Adoption-decision column to read from the DataFrame at the same position.
    - groups (int or str): 1 / '1' to break results down by groupby1; 2 / '2' to break them down by
      groupby1 and groupby2. Any other value prints only the overall line.
    - groupby1 (str): First grouping column.
    - groupby2 (str, optional): Second grouping column. When it is None, a two-level breakdown is skipped.
    - filter_fuel (list of str, optional): If given, only rows whose 'base_fuel' value is in this list are used.

    Returns:
    None. Results are printed as '<key>: <percentages>' lines, the overall line first.
    """
    tier_prefixes = [
        ('Tier 1: Feasible', 'T1 '),
        ('Tier 2: Feasible vs. Alternative', 'T2 '),
        ('Tier 3: Subsidy-Dependent Feasibility', 'T3 '),
    ]

    # Insertion-ordered: the overall entry comes first, then groups as first encountered
    overall_key = "('Overall')"
    results = {overall_key: []}

    by_one_group = groups in (1, '1')
    # Parenthesised on purpose: groupby2 is required for BOTH the int and the str spelling of 2.
    by_two_groups = groups in (2, '2') and groupby2 is not None

    for df, data_column in zip(dataframes, data_columns):
        # Rows that keep their existing equipment are excluded from every percentage
        df_filtered = df[df[data_column] != 'Existing Equipment']

        # Apply the filter_fuel if provided
        if filter_fuel:
            df_filtered = df_filtered[df_filtered['base_fuel'].isin(filter_fuel)]

        # Overall percentages for the entire data column
        overall_counts = df_filtered[data_column].value_counts(normalize=True) * 100
        total_adoption_with_subsidy = sum(overall_counts.get(decision, 0) for decision, _ in tier_prefixes)

        overall_percentages = ', '.join(
            f"{decision_prefix}{overall_counts[decision]:.1f}%"
            for decision, decision_prefix in tier_prefixes
            if decision in overall_counts.index
        )
        overall_percentages += f", TAS {total_adoption_with_subsidy:.1f}%"
        results[overall_key].append(overall_percentages)

        if by_one_group:
            # fill_value=0: a tier that never occurs in a group is a 0% share, not NaN
            counts = df_filtered.groupby(groupby1)[data_column].value_counts(normalize=True).unstack(fill_value=0) * 100
            for group in counts.index:
                key = f"('{groupby1}', '{group}')"
                results.setdefault(key, []).append(format_group_percentages(counts, group))

        elif by_two_groups:
            counts = df_filtered.groupby([groupby1, groupby2])[data_column].value_counts(normalize=True).unstack(fill_value=0) * 100
            for group1_group2 in counts.index:
                key = f"('{group1_group2[0]}', '{group1_group2[1]}')"
                results.setdefault(key, []).append(format_group_percentages(counts, group1_group2))

    # Print combined results for overall and then for each group
    for key, values in results.items():
        combined_values = ' | '.join(values)
        print(f"{key}: {combined_values}")