In [5]:
import os
import pandas as pd
import glob

In [6]:
def get_project_root():
    """
    Resolve the project root as the parent of the current working directory.

    When an IPython shell is available, the process first changes directory to
    whatever the ``%pwd`` magic reports. Any failure during that step is printed
    as a warning and otherwise ignored (best effort).

    The layout assumption is that this notebook runs from a folder that sits
    directly under the project root (e.g. notebooks/).

    :return: absolute path of the directory one level above the working directory
    """
    try:
        # Imported lazily so the function still works outside of IPython/Jupyter.
        from IPython import get_ipython
        shell = get_ipython()

        if shell:
            os.chdir(shell.run_line_magic("pwd", ""))

    except Exception as err:
        print(f"Warning: Failed to change working directory: {err}")

    # Step one level up from the working directory to reach the project root.
    parent_of_cwd = os.path.join(os.getcwd(), '..')
    project_root = os.path.abspath(parent_of_cwd)
    # NOTE: the value reported here is the returned project root, not the cwd.
    print(f"Working directory set to: {project_root}")
    return project_root

def create_output_dirs(base_dir, subfolders):
    """
    Make sure ``base_dir`` and each requested subfolder beneath it exist.

    Directories that already exist are left untouched.

    :param base_dir: parent directory for the output folders (e.g. data/processed)
    :param subfolders: iterable of folder names to create directly under base_dir
    :return: dict mapping each folder name to its full path
    """
    def _ensure_dir(directory):
        # Create the directory if needed and hand the path back for chaining.
        os.makedirs(directory, exist_ok=True)
        return directory

    _ensure_dir(base_dir)
    return {
        name: _ensure_dir(os.path.join(base_dir, name))
        for name in subfolders
    }

In [None]:
# Resolve the project root (parent of the notebook's working directory);
# later cells build their input/output paths from it.
project_root = get_project_root()

subfolders = ['By_State_Sectors']
state_path = os.path.join(project_root, 'data', 'processed')
# Create data/processed/By_State_Sectors if missing; `paths` maps each
# subfolder name to its full path and is used by the export cell.
paths = create_output_dirs(state_path, subfolders)


Working directory set to: c:\Users\jayja\Documents\projects\ghg_explorer\ghg_explorer


In [10]:
# Folder containing the enriched Excel workbooks to combine.
enriched_folder = os.path.join(project_root, 'data', 'raw', 'Sector', 'enriched')

By_State = os.path.join(enriched_folder, "*.xlsx")

# glob returns matches in arbitrary order, so sort to keep the row order of
# full_df reproducible between runs. Also skip Excel's "~$<name>.xlsx" lock
# files, which appear next to a workbook that is currently open and cannot be
# parsed as a workbook.
files = sorted(
    f for f in glob.glob(By_State)
    if not os.path.basename(f).startswith('~$')
)

# Fail with a message that names the pattern instead of pandas' generic
# "No objects to concatenate" error from pd.concat([]).
if not files:
    raise FileNotFoundError(f"No .xlsx files found matching: {By_State}")

# Load and combine all enriched Excel files into a single frame with a fresh index
df_list = [pd.read_excel(f) for f in files]
full_df = pd.concat(df_list, ignore_index=True)



Count of facilities by state and subsector type

In [11]:
# Number of rows per (STATE, SUBSECTOR_TYPE) pair, as a flat table.
# Presumably each row of full_df is one facility record -- confirm against the source files.
facilities_by_state = (
    full_df
    .groupby(['STATE', 'SUBSECTOR_TYPE'])
    .size()
    .rename('Facility Count')  # name the count series so it becomes the column header
    .reset_index()
)

Total GHG emissions by state (metric tons CO2e), highest first

In [12]:
# GHG quantity column that is summed per state and used as the sort key.
ghg_column = 'GHG QUANTITY (METRIC TONS CO2e)'

# Sum the GHG quantity within each state ...
state_totals = full_df.groupby('STATE')[ghg_column].sum()

# ... then flatten to a two-column table ordered from largest to smallest total.
emissions_by_state = state_totals.reset_index().sort_values(
    by=ghg_column,
    ascending=False,
)

Count of facilities by state and industry type

In [13]:
# Number of rows per (STATE, INDUSTRY TYPE) pair, as a flat table.
# Presumably each row of full_df is one facility record -- confirm against the source files.
facilities_by_sector = (
    full_df
    .groupby(['STATE', 'INDUSTRY TYPE'])
    .size()
    .rename('Facility Count')  # name the count series so it becomes the column header
    .reset_index()
)

In [15]:
# Write each summary table to its own workbook in the By_State_Sectors folder,
# without the DataFrame index column.
output_dir = paths['By_State_Sectors']
exports = (
    ('facilities_by_state.xlsx', facilities_by_state),
    ('emissions_by_state.xlsx', emissions_by_state),
    ('facilities_by_sector.xlsx', facilities_by_sector),
)
for filename, table in exports:
    table.to_excel(os.path.join(output_dir, filename), index=False)

print("Data processing complete. Files saved in:", output_dir)

Data processing complete. Files saved in: c:\Users\jayja\Documents\projects\ghg_explorer\ghg_explorer\data\processed\By_State_Sectors
