In [10]:
import os
import pandas as pd
from pandas import Series, DataFrame
import country_converter as coco
import numpy as np
import copy


Data source
- https://sandia.gov/ess-ssl/gesdb/public/statistics.html

Locate the raw-data and cleaned-data folders

In [11]:
## Show the directory the notebook is executing from
print("Current working directory:", os.getcwd())

## Resolve the two sibling folders of the working directory:
##   raw_data     -> where the raw files are read from
##   cleaned_data -> where the cleaned csv files are saved
raw_data_path, cleaned_data_path = (
    os.path.abspath(os.path.join(os.getcwd(), '..', folder_name))
    for folder_name in ('raw_data', 'cleaned_data')
)

Current working directory: /Users/jennagreene/Documents/GitHub/HATCH_data/reading_files


In [12]:
# Build the full path to the raw CSV inside the raw_data folder.
# This cell only assembles the path; it does not open the file.
# NOTE(review): the 'Cummulative' spelling presumably mirrors the name of the
# file on disk -- confirm before correcting it.
target_file = 'Cummulative Sum by Year_Full Data_data.csv'
target_file_path = os.path.join(raw_data_path, target_file)

In [13]:
# Load five columns of the raw CSV (selected by position) and discard rows
# that are exact duplicates of an earlier row, then display the result
doe = pd.read_csv(target_file_path, usecols=[0,1,2,9,10]).drop_duplicates()
doe

Unnamed: 0,Year of Project Year,Technology Mid-Type,Country,Rated Capacity (kWh),Rated Power (kW)
0,2007,Technology Mid-Type Unknown,United States,32000.0,64000
1,2009,Technology Mid-Type Unknown,Sweden,,10000
2,2011,Technology Mid-Type Unknown,United States,,430
3,2011,Technology Mid-Type Unknown,United States,,1500
4,2013,Technology Mid-Type Unknown,Spain,,0
...,...,...,...,...,...
1620,2018,Pumped hydro storage,Indonesia,,1040000
1621,2019,Pumped hydro storage,Australia,1500000.0,250000
1622,2020,Pumped hydro storage,Ireland,9000.0,1500
1623,2020,Pumped hydro storage,Japan,,200000


Replace storage tech names

In [14]:
# Rename technology labels to their "... storage" spellings.
# DataFrame.replace matches whole cell values, in every column of the frame.
doe = doe.replace(
    to_replace={
        'Lithium-ion battery': 'Lithium-ion battery storage',
        'Latent heat': 'Latent heat storage',
        'Sensible heat': 'Sensible heat storage',
        'Flow battery': 'Flow battery storage',
        'Sodium-based battery': 'Sodium-based battery storage',
        'Heat thermal storage': 'Heat thermal battery storage',
        'Lead-acid battery': 'Lead-acid battery storage',
        'Flywheel': 'Flywheel battery storage',
    }
)


In [15]:
# Alphabetical list of the distinct technology labels,
# minus the 'Technology Mid-Type Unknown' label
tech_list = sorted(doe['Technology Mid-Type'].unique())
tech_list.remove('Technology Mid-Type Unknown')
tech_list

['Compressed air energy storage',
 'Electro-chemical capacitor',
 'Flow battery storage',
 'Flywheel battery storage',
 'Heat thermal battery storage',
 'Hydrogen storage',
 'Latent heat storage',
 'Lead-acid battery storage',
 'Lithium-ion battery storage',
 'Nickel-based battery',
 'Pumped hydro storage',
 'Sensible heat storage',
 'Sodium-based battery storage',
 'Zinc-based battery']

In [16]:
# Spot-check one technology/country pair: compressed air energy storage
# rows for the United States
df = doe.loc[
    doe['Technology Mid-Type'].eq('Compressed air energy storage')
    & doe['Country'].eq('United States')
]
df

Unnamed: 0,Year of Project Year,Technology Mid-Type,Country,Rated Capacity (kWh),Rated Power (kW)
1188,1991,Compressed air energy storage,United States,2860000.0,110000
1189,2012,Compressed air energy storage,United States,500000.0,2000
1192,2013,Compressed air energy storage,United States,30432000.0,317000
1193,2013,Compressed air energy storage,United States,60.0,80
1194,2013,Compressed air energy storage,United States,1500.0,1500
1198,2015,Compressed air energy storage,United States,,0
1203,2020,Compressed air energy storage,United States,3000000.0,300000


Loop through each technology and, for each country, compute cumulative rated power and capacity by year, transpose to one row per metric, add metadata columns, and save the result as a CSV

In [30]:
def _cumulative_totals_by_year(df_country):
    """Return cumulative rated power and capacity per project year.

    Rows are grouped by 'Year of Project Year' in ascending order. Within each
    year the values are summed with missing entries skipped (a year whose
    entries are all missing sums to 0), and the yearly totals are then
    accumulated from the earliest year to the latest.

    Returns a DataFrame indexed by year with the columns
    'Rated Power (kW)' and 'Rated Capacity (kWh)', in that order.
    """
    yearly_totals = (
        df_country
        .groupby('Year of Project Year', sort=True)[['Rated Power (kW)', 'Rated Capacity (kWh)']]
        .sum(min_count=0)
    )
    return yearly_totals.cumsum()


def read_doe(tech_name: str) -> None:
    """Write one cleaned CSV per country for a single storage technology.

    Reads the notebook-level DataFrame ``doe`` and, for every country that has
    rows for ``tech_name``, builds a two-row table (cumulative rated power,
    cumulative rated capacity) with one column per project year plus metadata
    columns, and saves it under the notebook-level ``cleaned_data_path`` as
    ``'doe_' + tech_name + country + '.csv'``.

    Parameters
    ----------
    tech_name : str
        Value of the 'Technology Mid-Type' column to process.

    Returns
    -------
    None
        The function is called for its side effect of writing CSV files.

    Notes
    -----
    Rows whose 'Country' is missing are skipped, because ``groupby`` drops
    missing keys.
    """
    # Keep only the rows for the requested technology
    df = doe[doe['Technology Mid-Type'] == tech_name]

    # groupby hands back an independent sub-frame per country
    for country, df_country in df.groupby('Country', sort=True):
        # Grouping by year keeps every total attached to its own year and
        # accumulates in chronological order regardless of the row order
        cumulative = _cumulative_totals_by_year(df_country)

        # One row per metric (power first, capacity second), one column per
        # year; the axis name is cleared so only year labels remain as headers
        new_df = cumulative.rename_axis(None).transpose()

        # Add metadata columns; the two-element lists follow the row order
        # above (power, then capacity)
        new_df['Technology Name'] = tech_name.title()
        new_df['Country Name'] = country
        new_df['Unit'] = ['kW', 'kWh']
        new_df['Data Source'] = 'GESDB'
        new_df['Metric'] = ['Cumulative Rated Power', 'Cumulative Rated Capacity']
        new_df['Spatial Scale'] = 'National'
        new_df['Country Code'] = coco.convert(names=country, to='iso2')
        new_df['ID'] = new_df['Technology Name'] + '_' + new_df['Metric'] + '_' + new_df['Country Code']
        new_df.set_index('ID', drop=True, inplace=True)

        # File name is kept exactly as before (no separator between the
        # technology and the country) so existing file names do not change
        output_file = 'doe_' + tech_name + country + '.csv'
        output_file_path = os.path.join(cleaned_data_path, output_file)

        # Save the processed DataFrame to a CSV file (written once)
        new_df.to_csv(output_file_path)

In [31]:
# Run the per-country export once for every technology in tech_list
for tech in tech_list:
    read_doe(tech)