# GEP Results Preview

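This notebook previews the results produced by `run_renewable_energy_production.py`: for each results CSV it writes a per-year summary of GEP statistics (max, min, median, total, and the corresponding countries).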

In [5]:
import glob
import numpy as np
import os
import pandas as pd

In [2]:
# Directory layout (paths are relative to the current working directory):
#   data_dir  - root data folder; summaries are written to <data_dir>/summary
#   input_dir - <data_dir>/results, the folder scanned for result CSV files
data_dir = '../data'
input_dir = os.path.join(data_dir, 'results')

In [6]:
# Column order of every summary CSV written by summarize_gep_in_directory.
_SUMMARY_COLUMNS = [
    'Year', 'num_countries', 'max_gep', 'min_gep', 'med_gep', 'total_gep',
    'max_country', 'min_country', 'median_country',
]


def _summarize_gep_by_year(df):
    """Return one row of GEP summary statistics per 'Year' in *df*.

    Args:
        df: DataFrame with at least the columns 'Year', 'gep' and
            'Country_Name'.

    Returns:
        DataFrame with the columns listed in ``_SUMMARY_COLUMNS``. It has
        zero rows (but still all columns) when *df* has no rows.
    """
    records = []
    for year, group in df.groupby('Year'):
        # Counted before filtering so it reflects every country listed for
        # the year, including those whose gep value is missing.
        num_countries = group['Country_Name'].nunique()

        # Drop missing gep values: a single NaN would otherwise turn
        # max/min/median/total into NaN and make argmax/argmin point at the
        # NaN row, i.e. report the wrong country.
        has_value = group['gep'].notna().to_numpy()
        gep_values = group['gep'].to_numpy()[has_value]
        countries = group['Country_Name'].to_numpy()[has_value]

        if gep_values.size == 0:
            # No usable value for this year: keep the row, leave stats blank.
            records.append({
                'Year': year,
                'num_countries': num_countries,
                'max_gep': np.nan,
                'min_gep': np.nan,
                'med_gep': np.nan,
                'total_gep': np.nan,
                'max_country': None,
                'min_country': None,
                'median_country': None,
            })
            continue

        max_idx = np.argmax(gep_values)
        min_idx = np.argmin(gep_values)
        med_gep = np.median(gep_values)
        # The median need not be an observed value (even group sizes), so
        # report the country whose gep is closest to it; ties -> first row.
        median_idx = np.argmin(np.abs(gep_values - med_gep))

        records.append({
            'Year': year,
            'num_countries': num_countries,
            'max_gep': gep_values[max_idx],
            'min_gep': gep_values[min_idx],
            'med_gep': med_gep,
            'total_gep': np.sum(gep_values),
            'max_country': countries[max_idx],
            'min_country': countries[min_idx],
            'median_country': countries[median_idx],
        })

    # Passing ``columns`` fixes the column order and, unlike selecting the
    # columns afterwards, also works when there are no records at all.
    return pd.DataFrame(records, columns=_SUMMARY_COLUMNS)


def summarize_gep_in_directory(input_dir, out_dir=None):
    """Write a per-year GEP summary CSV for every CSV file in *input_dir*.

    Each input file must contain the columns 'Year', 'gep' and
    'Country_Name'; files that cannot be read or lack those columns are
    reported on stdout and skipped. For every remaining file a summary with
    one row per year (number of countries, max/min/median/total gep and the
    countries holding the max, min and closest-to-median value) is written to
    ``<out_dir>/<first three characters of the input filename>_summary.csv``.

    Args:
        input_dir: Directory that is searched (non-recursively) for ``*.csv``.
        out_dir: Directory the summaries are written to; created if needed.
            Defaults to ``os.path.join(data_dir, 'summary')``, which relies on
            the module-level ``data_dir`` variable.

    Returns:
        None. Progress and problems are printed.
    """
    if out_dir is None:
        out_dir = os.path.join(data_dir, 'summary')

    required_cols = {'Year', 'gep', 'Country_Name'}

    # glob order is OS-dependent; sort so runs are reproducible.
    for csv_file in sorted(glob.glob(os.path.join(input_dir, "*.csv"))):
        try:
            df = pd.read_csv(csv_file)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"Error reading {csv_file}: {e}")
            continue

        if not required_cols.issubset(df.columns):
            print(f"Skipping file {csv_file}: required columns {required_cols} missing.")
            continue

        summary_df = _summarize_gep_by_year(df)

        # An empty out_dir means "current directory"; makedirs('') would fail.
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # NOTE: inputs sharing their first three characters map to the same
        # output file, so the later one overwrites the earlier one.
        first_three = os.path.basename(csv_file)[:3]
        output_path = os.path.join(out_dir, f"{first_three}_summary.csv")

        summary_df.to_csv(output_path, index=False)
        print(f"Summary saved to {output_path}")

In [7]:
# Summarize every results CSV found in input_dir; each valid file produces
# <first three letters of its name>_summary.csv under <data_dir>/summary.
summarize_gep_in_directory(input_dir)

Summary saved to ../data/summary/geo_summary.csv
Summary saved to ../data/summary/sol_summary.csv
Summary saved to ../data/summary/win_summary.csv
