## Cleaning HATCH dataset for: 
- Alignment with IAMC conventions so that it can be displayed through the Scenario Explorer
- Alignment of variable names


In [31]:
# Import packages

import requests
import numpy as np
import yaml
import pandas as pd
import pyam

from countrycode import countrycode

### Import files from GitHub

Hosted in Jenna's GitHub repository


In [32]:
# Location of the variable-description YAML file in the HATCH_data repository
yaml_url = 'https://raw.githubusercontent.com/jennagreene22/HATCH_data/main/Tech_Growth_V1.5_variabledescriptions_Clean.yaml'

# Download the YAML content; the timeout keeps the notebook from hanging on a stalled connection
yaml_response = requests.get(yaml_url, timeout=60)

# Stop right here on a failed download. Only printing a message would leave
# `variable_descriptions` undefined (or stale from an earlier run of the kernel)
# and the alignment checks further down would fail or silently use old data.
yaml_response.raise_for_status()

# Parse the YAML content
variable_descriptions = yaml.safe_load(yaml_response.text)

## Read in CSV file from Github
url = "https://raw.githubusercontent.com/jennagreene22/HATCH_data/main/all_tech_version_1.5_2024.csv"
#url = "https://raw.githubusercontent.com/jennagreene22/HATCH_data/main/all_tech_version_1.5.csv"
df = pd.read_csv(url, low_memory=False)

In [33]:
# Descriptive columns that are left untouched by the numeric conversion
exclude_columns = [
    'ID', 'Spatial Scale', 'Country Code', 'Country Name', 'Technology Name',
    'Metric', 'Unit', 'Data Source', 'Long Technology Name',
]

# Every remaining column is converted, in its original order
columns_to_convert = df.columns[~df.columns.isin(exclude_columns)].tolist()

# Coerce those columns to numbers; anything that cannot be parsed becomes NaN
df[columns_to_convert] = df[columns_to_convert].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)


### Clean data by removing duplicates and non-1.5 data
- remove data on cost
- remove duplicate country x technology pairs

In [34]:
# Remove all cost data: any row whose metric mentions "cost" or "price" (case-insensitive).
# "Levelized Cost of Energy" already contains "Cost", so one pattern covers all three cases.
is_cost_metric = df['Metric'].str.contains('Cost|Price', case=False, na=False)
df = df[~is_cost_metric]


In [35]:
# remove duplicates by selecting for the longer time series

def get_row_length(row):
    """Return how many non-missing entries `row` holds from position 10 onwards.

    The first ten positions are skipped because they are treated as the
    descriptive columns of the series rather than as time-series values.
    """
    return sum(1 for value in row[10:] if pd.notna(value))

# Keep one row per ('Long Technology Name', 'Country Code') pair: the one with the
# most non-missing time-series values. A temporary 'RowLength' column holds that
# count, the frame is sorted on it (longest first), the first row of every pair
# is kept, and the helper column is removed again.
df = (
    df.assign(RowLength=df.apply(get_row_length, axis=1))
    .sort_values(by='RowLength', ascending=False)
    .drop_duplicates(subset=['Long Technology Name', 'Country Code'], keep='first')
    .drop(columns='RowLength')
)


In [36]:
# Drop individual series identified by technology, country code and data source:
# US zinc production from USGS and US nickel production from Mitchell
series_to_drop = [
    ("Zinc Production", "US", "USGS"),
    ("Nickel Production", "US", "Mitchell"),
]
for tech_name, region, source in series_to_drop:
    matches = (
        (df["Technology Name"] == tech_name)
        & (df["Country Code"] == region)
        & (df["Data Source"] == source)
    )
    df = df.loc[~matches]

# Drop whole data sources:
#  - "Schmidt & Staffell": these are cost data
#  - "BTS": the US pipeline data is included in the national pipeline dataframe
sources_to_drop = ["Schmidt & Staffell", "BTS"]
df = df.loc[~df["Data Source"].isin(sources_to_drop)]

## Change Variable Names
 - align variable names in variable description file with v1.5 file
 - drop variables that should not be public
 - drop any variables with inconsistent units

In [37]:
# Rename the country-code and long-technology-name columns to "Region" and "Variable";
# every later cell refers to the columns by these new names
df = df.rename(columns={"Country Code": "Region", "Long Technology Name": "Variable"})

In [38]:
# Change variable names and metrics that are only different by syntax.
# The same substitutions are applied, in the same order, to both columns.
syntax_fixes = {
    'Annual production': 'Annual Production',
    'Share of Households': 'Share of Market',
}
for column in ("Variable", "Metric"):
    for old_text, new_text in syntax_fixes.items():
        df[column] = df[column].str.replace(old_text, new_text)

In [39]:
# Drop global Crude Oil for inconsistent formatting
is_world_crude_oil = (df['Variable'] == 'Annual Production|Crude Oil') & (df['Region'] == 'World')
df = df[~is_world_crude_oil]


In [40]:
# Drop Boeing Point: keep every row whose country name is not 'Boeing'
df = df[df['Country Name'] != 'Boeing']


In [41]:
# Remove the 'Unnamed: 0' column
df = df.drop(columns=['Unnamed: 0'])

##### Align variable names in two documents

In [42]:
# Mapping from variable names as they appear in the data to the replacement names.
# Keys are matched exactly (note the deliberate leading space in the first key).
# A stray empty-string literal that used to sit in front of the
# "Cumulative total capacity|Steamships" key has been removed: it only worked
# because Python concatenates adjacent string literals.
replacement_var_names = {
    " Average Capacity of Unit Additions|Fluid Catalytic Cracking Refineries": "Average Capacity of Unit Additions|Fluid Catalytic Cracking Refineries",
    "Annual Production|Milk Production": "Annual Production|Milk",
    "Annual Production|Beer Production": "Annual Production|Beer",
    "Annual Production|Nickel Production": "Annual Production|Nickel",
    "Annual Production|Oil Production": "Annual Production|Oil",
    "Annual Production|Primary Aluminum Production": "Annual Production|Primary Aluminum",
    "Annual Production|Raw Steel Production": "Annual Production|Raw Steel",
    "Annual Production|Salt Production": "Annual Production|Salt",
    "Annual Production|Aquaculture Production": "Annual Production|Aquaculture",
    "Annual Production|Acrylic Fiber": "Annual Production|Acrylic Fiber",
    "Annual Production|Acrylonitrile": "Annual Production|Acrylonitrile",
    "Annual Production|Coal Production": "Annual Production|Coal",
    "Annual Production|Nuclear Energy": "Annual Production|Electricity|Nuclear",
    "Annual Production|Liquefied Natural Gas": "Annual Production|Liquefied Natural Gas Exports",
    "Annual Production|Primary Bauxite Production": "Annual Production|Primary Bauxite",
    "Annual Production|Shale Production": "Annual Production|Shale Oil",
    "Annual Production|Artificial and Synthetic Fibers": "Annual Production|Synthetic Filaments",
    "Annual Production|Biofuels Production": "Annual Production|All Biofuels",
    "Annual Production|Sugar Output": "Annual Production|Cane Sugar",
    "Annual Production|Caustic Soda Acid": "Annual Production|Caustic Soda",
    "Annual Production|Crude Petroleum": "Annual Production|Crude Oil",
    "Total Number|Shotgun Sanger DNA Sequencing": "Annual Production|Shotgun Sanger DNA Sequencing",
    "Computing Capacity|Processor Performance": "Computing Capacity|Processing Performance",
    "Computing Capacity|Transistors per Microprocessor": "Computing Capacity|Transistors per Microprocessor Chip",
    "Total Length|Natural Gas Pipeline": "Cumulative Number of Units|Natural Gas Pipelines",
    "Total Length|Oil Pipeline": "Cumulative Number of Units|Oil Pipelines",
    "Total Length|Public Roads": "Cumulative Number of Units|Public Roads",
    "Total Length|Railroad": "Cumulative Number of Units|Railroad Tracks",
    "Cumulative Length|Railroad": "Cumulative Number of Units|Railroad Tracks",
    "Total Number|Television": "Number of Units|Televisions",
    "Cumulative Rated Capacity|Compressed Air Energy Storage": "Cumulative Rated Capacity|Electricity|Compressed Air Energy Storage",
    "Cumulative Rated Capacity|Electro-Chemical Capacitor": "Cumulative Rated Capacity|Electricity|Electro-Chemical Capacitor",
    "Cumulative Rated Capacity|Flow Battery Storage": "Cumulative Rated Capacity|Electricity|Flow Battery Storage",
    "Cumulative Rated Capacity|Flywheel Battery Storage": "Cumulative Rated Capacity|Electricity|Flywheel Battery Storage",
    "Cumulative Rated Capacity|Lead-Acid Battery Storage": "Cumulative Rated Capacity|Electricity|Lead-Acid Battery Storage",
    "Cumulative Rated Capacity|Lithium-Ion Battery Storage": "Cumulative Rated Capacity|Electricity|Lithium-Ion Battery Storage",
    "Cumulative Rated Capacity|Pumped Hydro Storage": "Cumulative Rated Capacity|Electricity|Pumped Hydro Storage",
    "Cumulative Rated Capacity|Sodium-Based Battery Storage": "Cumulative Rated Capacity|Electricity|Sodium-Based Battery Storage",
    "Cumulative Rated Capacity|Zinc-Based Battery": "Cumulative Rated Capacity|Electricity|Zinc-Based Battery Storage",
    "Cumulative Rated Capacity|Nickel-Based Battery": "Cumulative Rated Capacity|Nickel-Based Battery Storage",
    "Cumulative Rated Capacity|Heat Thermal Battery Storage": "Cumulative Rated Capacity|Thermal Energy Storage|Heat Thermal Battery Storage",
    "Cumulative Rated Capacity|Latent Heat Storage": "Cumulative Rated Capacity|Thermal Energy Storage|Latent Heat Storage",
    "Cumulative Rated Capacity|Sensible Heat Storage": "Cumulative Rated Capacity|Thermal Energy Storage|Sensible Heat Storage",
    "Cumulative Rated Power|Compressed Air Energy Storage": "Cumulative Rated Power|Electricity|Compressed Air Energy Storage",
    "Cumulative Rated Power|Electro-Chemical Capacitor": "Cumulative Rated Power|Electricity|Electro-Chemical Capacitor",
    "Cumulative Rated Power|Flow Battery Storage": "Cumulative Rated Power|Electricity|Flow Battery Storage",
    "Cumulative Rated Power|Flywheel Battery Storage": "Cumulative Rated Power|Electricity|Flywheel Battery Storage",
    "Cumulative Rated Power|Lead-Acid Battery Storage": "Cumulative Rated Power|Electricity|Lead-Acid Battery Storage",
    "Cumulative Rated Power|Lithium-Ion Battery Storage": "Cumulative Rated Power|Electricity|Lithium-Ion Battery Storage",
    "Cumulative Rated Power|Pumped Hydro Storage": "Cumulative Rated Power|Electricity|Pumped Hydro Storage",
    "Cumulative Rated Power|Sodium-Based Battery Storage": "Cumulative Rated Power|Electricity|Sodium-Based Battery Storage",
    "Cumulative Rated Power|Zinc-Based Battery": "Cumulative Rated Power|Electricity|Zinc-Based Battery Storage",
    "Cumulative Rated Power|Nickel-Based Battery": "Cumulative Rated Power|Nickel-Based Battery Storage",
    "Cumulative Rated Power|Heat Thermal Battery Storage": "Cumulative Rated Power|Thermal Energy Storage|Heat Thermal Battery Storage",
    "Cumulative Rated Power|Latent Heat Storage": "Cumulative Rated Power|Thermal Energy Storage|Latent Heat Storage",
    "Cumulative Rated Power|Sensible Heat Storage": "Cumulative Rated Power|Thermal Energy Storage|Sensible Heat Storage",
    "Cumulative Total Capacity|Compact Fluorescent Light Bulbs": "Cumulative Total Capacity|Compact Flourescent Light bulbs",
    "Total Capacity|Desalination Capacity": "Cumulative Total Capacity|Desalination",
    "Cumulative Total Capacity|Natural Gas Power": "Cumulative Total Capacity|Electricity|Natural Gas",
    "Cumulative Total Capacity|Jet Aircraft": "Cumulative Total Capacity|Jet Aircrafts",
    "Installed Capacity|Carbon Capture and Sequestration": "Installed Capacity|Carbon Capture & Sequestration",
    "Installed electricity capacity|Biogas": "Installed Capacity|Electricity|Biogas",
    "Installed Capacity|Concentrated Solar Power": "Installed Capacity|Electricity|Concentrated Solar",
    "Installed electricity capacity|Geothermal Energy": "Installed Capacity|Electricity|Geothermal Energy",
    "Installed electricity capacity|Liquid Biofuels": "Installed Capacity|Electricity|Liquid Biofuels",
    "Installed electricity capacity|Marine Energy": "Installed Capacity|Electricity|Marine Energy",
    "Installed electricity capacity|Offshore Wind Energy": "Installed Capacity|Electricity|Offshore Wind",
    "Installed electricity capacity|Onshore Wind Energy": "Installed Capacity|Electricity|Onshore Wind",
    "Installed electricity capacity|Solar Photovoltaic": "Installed Capacity|Electricity|Solar Photovoltaic",
    "Installed electricity capacity|Solid Biofuels": "Installed Capacity|Electricity|Solid Biomass",
    "Installed electricity capacity|Solar Thermal Energy": "Installed Capacity|Solar Thermal Energy",
    "Total Length|Canals": "Number of Units|Canals",
    "Total Number|Cellphones": "Number of Units|Cellphones",
    "Total Number|Crop Harvester": "Number of Units|Crop Harvester",
    "Total Number|Nuclear Weapons": "Number of Units|Nuclear Weapons",
    "Total Number|Passenger Cars": "Number of Units|Passenger Cars",
    "Total Number|Postal Traffic": "Number of Units|Postal Traffic",
    "Total Number|Radio": "Number of Units|Radios",
    "Total Number|Space Launches": "Number of Units|Space Launches",
    "Total Number|Passenger Vehicles": "Number of Units|Passenger Cars",
    "Total Number|Steamships": "Number of Units|Steamships",
    "Cumulative Number of Units|High Speed Rail": "Number of Units|High Speed Rail",
    "Total Number|Objects Launched Into Space": "Number of Units|Space Launches",
    "Total Capacity|Oil Refining Capacity": "Total Capacity|Oil Refining",
    "Cumulative total capacity|Steamships": "Number of Units|Steamships",
    "Total Number|Telegraph Traffic": "Number of Units|Telegraph Traffic",
    "Total Number|Telephones": "Number of Units|Telephones",
    "Share of Market|Cable TV": "Share of Market|Cable Television",
    "Share of Market|Colour TV": "Share of Market|Colour Television",
    "Share of Market|Disk Brakes": "Share of Market|Disc Brakes",
    "Share of Market|Freezer": "Share of Market|Freezers",
    "Share of Market|Households With Only Mobile Phones (No Landlines)": "Share of Market|Households with Only Mobile Phones",
    "Share of Market|Iron": "Share of Market|Irons",
    "Share of Boilers|Nox Pollution Controls (Boilers)": "Share of Market|Nox Pollution Control Technologies",
    "Share of Market|Real-Time Gross Settlement Adoption": "Share of Market|Real-time Gross Settlement",
    "Share of Market|Stove": "Share of Market|Stoves",
    "Share of Market|Videocassette Recorder": "Share of Market|Videocassette recorder",
    "Share of Population|DTP1 Vaccine": "Share of Population|Diphtheria and Tetanus Vaccine First Dose",
    "Share of Population|DTP3 Vaccine": "Share of Population|Diphtheria and Tetanus Vaccine Third Dose",
    "Share of Population|ROTAC Vaccine": "Share of Population|Rotavirus Vaccine",
    "Share of Population|BCG Vaccine": "Share of Population|Tuberculosis Vaccine",
    "Share of Population|RCV1 Vaccine": "Share of Population|Rubella Vaccine First Dose",
    "Share of Population|YFV Vaccine": "Share of Population|Yellow Fever Vaccine",
    "Share of Population|POL3 Vaccine": "Share of Population|Polio Vaccine Third Dose",
    "Share of Population|PCV3 Vaccine": "Share of Population|Pneumococcal Conjugate Vaccine",
    "Share of Population|MCV1 Vaccine": "Share of Population|Measles Vaccine First Dose",
    "Share of Population|MCV2 Vaccine": "Share of Population|Measles Vaccine Second Dose",
    "Share of Population|IPV1 Vaccine": "Share of Population|Inactivated Polio Vaccine",
    "Share of Population|HEPB3 Vaccine": "Share of Population|Hepatitis B Vaccine Third Dose",
    "Share of Population|HEPBB Vaccine": "Share of Population|Hepatitis B Vaccine First Dose",
    "Share of Population|HIB3 Vaccine": "Share of Population|Haemophilus Influenzae Vaccine Third Dose",
    "Cumulative Total Capacity|Oil Refining Capacity": "Total Capacity|Oil Refining",
    "Annual Production|BisphenolA": "Annual Production|Bisphenol A",
    "Annual Production|Cane Sugar Production": "Annual Production|Cane Sugar",
    "Annual Production|Capture Fisheries Production": "Annual Production|Capture Fisheries",
    "Annual Production|Cement Production": "Annual Production|Cement",
    "Annual Production|Cobalt Mine Production": "Annual Production|Cobalt",
    "Annual Production|Construction|Sand and Gravel Construction": "Annual Production|Sand and Gravel|Construction",
    "Annual Production|Copper Mining": "Annual Production|Copper|Mining",
    "Annual Production|Copper Refining": "Annual Production|Copper|Refining",
    "Annual Production|Ethyl Alcohol": "Annual Production|Ethanol",
    "Annual Production|Gold Production": "Annual Production|Gold",
    "Annual Production|Graphite Mine Production": "Annual Production|Graphite",
    "Annual Production|Industrial|Sand and Gravel Industrial": "Annual Production|Sand and Gravel|Industrial",
    "Annual Production|Lead Mines": "Annual Production|Lead",
    "Annual Production|Primary Copper Production": "Annual Production|Primary Copper",
    "Annual Production|Zinc Production": "Annual Production|Zinc",
    "Annual Production|Tin Production": "Annual Production|Tin",
    "Annual Production|Silver Production": "Annual Production|Silver",
}

# Replace every variable name that appears as a key in replacement_var_names
# with its mapped value; names without an entry are left unchanged
df['Variable'] = df['Variable'].replace(replacement_var_names)

In [43]:
df['Variable'].unique()

array(['Annual Production|Cane Sugar', 'Annual Production|Beer',
       'Cumulative Number of Units|Railroad Tracks',
       'Number of Units|Radios', 'Number of Units|Telegraph Traffic',
       'Number of Units|Postal Traffic', 'Annual Production|Gold',
       'Annual Production|Zinc', 'Annual Production|Crude Oil',
       'Cumulative Total Capacity|Bicycles', 'Number of Units|Steamships',
       'Annual Production|Silver', 'Annual Production|Sulphuric Acid',
       'Cumulative Number of Units|Public Roads',
       'Cumulative Number of Units|Oil Pipelines',
       'Cumulative Number of Units|Natural Gas Pipelines',
       'Annual Production|Primary Bauxite',
       'Annual Production|Rare Earth Mine Production',
       'Annual Production|Cadmium Refining',
       'Annual Production|Primary Aluminum', 'Annual Production|Nickel',
       'Annual Production|Cement', 'Annual Production|Raw Steel',
       'Annual Production|Iron Ore', 'Annual Production|Graphite',
       'Annual Production

### Change Unit Names 
- per Daniel's recommendation


In [44]:
# Mapping from unit labels as they appear in the data to the normalized label.
# Several spellings of the same unit collapse onto a single label
# (e.g. "Tonnes" and "Metric Tons" both become "metric tons").
# Keys are matched exactly, including case and internal whitespace.
rename_units = {
    "Yearly Production (Mil. lbs)": "million pounds",
    "Yearly  Production (Billion Pounds)": "billion pounds",
    "In Thousands Of Hectolitres": "thousand hectolitres",
    "In Thousand Hectolitres": "thousand hectolitres",
    "Petajoules": "Petajoule",
    "petajoules": "Petajoule",
    "Thousand tons nitrogen equivalent": "thousand tons of nitrogen equivalent",
    "Metric tons": "metric tons",
    "Thousands Metric Tons": "thousand metric tons",
    "Thousand Metric Tons": "thousand metric tons",
    "Yearly Production (Million Gallons)": "million gallons",
    "Million tonnes": "million metric tons",
    "Million Pounds": "million pounds",
    "Million pounds": "million pounds",
    "Thousand tonnes": "thousand metric tons",
    "In Thousand Metric Tons": "thousand metric tons",
    "Tonnes": "metric tons",
    "Metric Tons": "metric tons",
    "Terawatt-hours": "TWh",
    "Miles": "miles",
    # NOTE(review): singular "kilometer" here, while the railroad override in a
    # later cell writes "kilometers" - confirm which spelling is intended
    "Kilometers": "kilometer",
    "Twh": "TWh",
    "Percentage": "%",
    "Share of acres planted": "%",
    "Number in Use": "-",
    "cumulative acres planted": "Acres",
    "million instructions/second": "Million instructions per second (MIPS)",
    "GB/month": "Gigabytes per month",
    "calculations per second/1000 USD": "Calculations per second per $1,000",
    "bits/second": "Bits/second",
    "million cubic meters/day": "Million cubic meters per day",
    "billion cubic metres": "billion cubic meters"
}

# Apply the rename_units dictionary: unit labels that appear as keys are replaced,
# all other labels are left unchanged
df['Unit'] = df['Unit'].replace(rename_units)

## Fixing Units
For milk production, the Mitchell data use different units from country to country; the conversion below standardizes them.

For nickel production, the Mitchell and USGS data use inconsistent units.

For ethanol, the country-level data are in liters and the world data are in pounds, so the world data are converted to liters.

In [45]:
# Function to convert between metric tons, thousand metric tons, million metric tons, and liters
def apply_unit_conversion(row):
    """Rescale one row's time-series values and update its unit label.

    The conversion is chosen from the row's "Variable" and "Unit" entries:

    * Nickel / Tin / Zinc in "metric tons"            -> "thousand metric tons" (values / 1000)
    * Milk / Crude Oil in "million metric tons"       -> "thousand metric tons" (values * 1000)
    * Ethanol in "million pounds"                     -> "Liters"
    * Liquefied Natural Gas Exports in "billion cubic feet" -> "Mtpa" (values * 0.021)

    Rows that match none of these cases are returned unchanged.

    Time-series values are located by column label (labels made only of
    digits, i.e. the year columns) instead of by position. The previous
    positional slice ``row[10:]`` assumed ten leading descriptive columns, so
    removing or adding a descriptive column before this function runs would
    silently leave a year column unconverted (or scale a non-year column).

    Parameters
    ----------
    row : pandas.Series
        One row of the dataset, indexed by column name. Must contain
        "Variable" and "Unit".

    Returns
    -------
    pandas.Series
        A converted copy of ``row``; the input itself is not modified.
    """
    row = row.copy()
    year_labels = [label for label in row.index if str(label).isdigit()]
    values = row.loc[year_labels]

    variable = row["Variable"]
    unit = row["Unit"]

    if (
        variable in ("Annual Production|Nickel", "Annual Production|Tin", "Annual Production|Zinc")
        and unit == "metric tons"
    ):
        values = values / 1000  # metric tons -> thousand metric tons
        new_unit = "thousand metric tons"
    elif (
        variable in ("Annual Production|Milk", "Annual Production|Crude Oil")
        and unit == "million metric tons"
    ):
        values = values * 1000  # million metric tons -> thousand metric tons
        new_unit = "thousand metric tons"
    elif variable == "Annual Production|Ethanol" and unit == "million pounds":
        # NOTE(review): the arithmetic is kept exactly as it was, but going from
        # "million pounds" to pounds would normally be a multiplication by
        # 1,000,000, not a division - confirm against the source data before
        # relying on the absolute magnitude of the converted values.
        values = values / 1000000
        values = values * 0.5747  # pounds -> liters
        new_unit = "Liters"
    elif (
        variable == "Annual Production|Liquefied Natural Gas Exports"
        and unit == "billion cubic feet"
    ):
        # 0.021 is taken from the bp approximate conversion factors:
        # https://www.bp.com/content/dam/bp/business-sites/en/global/corporate/pdfs/energy-economics/statistical-review/bp-stats-review-2022-approximate-conversion-factors.pdf
        values = values * 0.021
        new_unit = "Mtpa"
    else:
        return row

    if year_labels:
        row.loc[year_labels] = values
    row["Unit"] = new_unit
    return row


In [46]:
# Apply function to nickel, milk, zinc, crude oil, tin, ethanol production, and LNG exports.
# Each pair is (variable, unit currently used); only rows matching one of the pairs are converted.
pairs_to_convert = [
    ("Annual Production|Nickel", "metric tons"),
    ("Annual Production|Milk", "million metric tons"),
    ("Annual Production|Zinc", "metric tons"),
    ("Annual Production|Crude Oil", "million metric tons"),
    ("Annual Production|Tin", "metric tons"),
    ("Annual Production|Ethanol", "million pounds"),
    ("Annual Production|Liquefied Natural Gas Exports", "billion cubic feet"),
]

# Start from "no row selected" and OR in one (variable, unit) condition at a time
mask = pd.Series(False, index=df.index)
for variable_name, unit_name in pairs_to_convert:
    mask = mask | ((df["Variable"] == variable_name) & (df["Unit"] == unit_name))

df.loc[mask] = df.loc[mask].apply(apply_unit_conversion, axis=1)

### Change Units in CSV files that are mismatched

In [47]:

# Change unit name for ASHPs (matched on the unit label itself, not on a variable)
df.loc[df["Unit"] == "U.S. Manufacturers Shipments", "Unit"] = "Number of Shipments"

# Unit label to enforce for every row of the given variable.
# Beer and cane sugar are keyed on their renamed variable names
# ("Annual Production|Beer", "Annual Production|Cane Sugar"): the variable column
# has already been passed through replacement_var_names at this point, so the old
# "... Production" names no longer occur and the overrides never matched.
unit_overrides = {
    "Annual Production|Beer": "thousand hectolitres",
    "Annual Production|Cane Sugar": "thousand metric tons",
    "Annual Production|Primary Magnesium": "Short tons",
    "Annual Production|Magnesium": "million pounds",
    "Cumulative Number of Units|Railroad Tracks": "kilometers",
    "Computing Capacity|Transistors per Microprocessor Chip": "Transistors per microprocessor",
    "Computing Capacity|Magnetic Data Storage": "Bits per dollar",
    "Computing Capacity|Random Access Memory": "Bits per dollar",
    "Number of Units|Telegraph Traffic": "Number (millions)",
    "Number of Units|Postal Traffic": "Number (millions)",
    "Total Capacity|Oil Refining": "thousand barrels",
    "Annual Production|Milk": "thousand metric tons",
    "Computing Capacity|Computing Growth": "Calculations per second per $1,000",
}

for variable_name, unit_name in unit_overrides.items():
    df.loc[df["Variable"] == variable_name, "Unit"] = unit_name


### Ensure alignment of units and variable names in two files and ensure no duplicates
- Check for alignment of variable names and units

In [48]:
# Build a variable -> unit lookup to ensure alignment of unit names.
# When a variable occurs on several rows, the unit of its last row is the one kept.
unit_dict = dict(zip(df['Variable'], df['Unit']))


In [49]:
# Check alignment of units

# For every variable described in the YAML file (variable_descriptions) that also
# occurs in the CSV data, report when the YAML unit does not end with the CSV unit
for entry in variable_descriptions:
    variable, data = next(iter(entry.items()))
    if variable not in unit_dict:
        continue

    expected_unit = unit_dict[variable]
    unit = data.get('unit')
    if not (unit and unit.endswith(expected_unit)):
        print(f"{variable}: Unit mismatch - expected {expected_unit} from CSV file, but got {unit} in YAML file.")



In [50]:
# Check missing variables in each file.

# Collect the variable names of both files once. Each YAML item is a mapping whose
# first key is the variable name. Sets give constant-time membership tests, while
# the lists keep the original order so messages are printed in the same order as before.
yaml_variables = [next(iter(item)) for item in variable_descriptions]
df_variables_lst = df['Variable'].unique().tolist()

yaml_variable_set = set(yaml_variables)
csv_variable_set = set(df_variables_lst)

# Variables described in the YAML file that have no data in the CSV file
for variable in yaml_variables:
    if variable not in csv_variable_set:
        print(f"{variable}: Variable not found in the CSV file.")

# Variables present in the CSV file that are not described in the YAML file
for variable in df_variables_lst:
    if variable not in yaml_variable_set:
        print(f"{variable}: Variable not found in the YAML file.")


Cumulative Total Capacity|Jet Aircrafts: Variable not found in the CSV file.
Cumulative Total Capacity|High-Speed Rail: Variable not found in the YAML file.
Cumulative Length|Submarine Cables: Variable not found in the YAML file.
Annual Production|Capillary DNA Sequencing: Variable not found in the YAML file.


In [51]:
## Check

# Collect, per variable, the distinct unit labels used by its rows and keep
# only the variables that use more than one
units_by_variable = {
    variable: group["Unit"].unique() for variable, group in df.groupby("Variable")
}
inconsistent_units = {
    variable: units for variable, units in units_by_variable.items() if len(units) > 1
}

if inconsistent_units:
    print("Variables with inconsistent units:")
    for variable, units in inconsistent_units.items():
        print(f"{variable}: {', '.join(units)}")
else:
    print("All variables have consistent units.")

All variables have consistent units.


### Cut any time series with fewer than 10 points

##### Helper functions

In [52]:
# Find the first year with data for one row of a dataframe

def find_first_year(df, index, numerical_columns):
    """Return the label of the first column holding a non-missing, non-zero value.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one time series per row.
    index : label
        Row label of the series to inspect.
    numerical_columns : list
        Column labels to scan, in scan order.

    Returns
    -------
    The first column label whose cell is neither NaN nor 0, or None when
    no such cell exists.
    """
    series = df.loc[index, numerical_columns]
    labels_with_data = (
        label for label, cell in series.items() if pd.notnull(cell) and cell != 0
    )
    return next(labels_with_data, None)


In [53]:
# Find the last year with data for one row of a dataframe

def find_last_year(df, index, numerical_columns):
    """Return the label of the last column holding a non-missing, non-zero value.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one time series per row.
    index : label
        Row label of the series to inspect.
    numerical_columns : list
        Column labels to scan; they are walked from the last one backwards.

    Returns
    -------
    The last column label whose cell is neither NaN nor 0, or None when
    no such cell exists.
    """
    series = df.loc[index, numerical_columns]
    labels_with_data = (
        label
        for label, cell in series.iloc[::-1].items()
        if pd.notnull(cell) and cell != 0
    )
    return next(labels_with_data, None)

In [54]:
# Creates years and values of the trimmed time series of one row

def get_time_series(df, index, numerical_columns):
    """Return the years and values of a row, trimmed to its span of real data.

    The series is cut to the range between the first and the last column that
    hold a non-missing, non-zero value (as reported by find_first_year and
    find_last_year); missing values inside that range are dropped, zeros
    inside it are kept.

    The trimming is done on the columns listed in ``numerical_columns`` only.
    Slicing the full row by label instead would also pick up any column of
    ``df`` that happens to sit between the first and last year but is not part
    of ``numerical_columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one time series per row.
    index : label
        Row label of the series to extract.
    numerical_columns : list
        Labels of the time-series columns; each must be convertible to int.

    Returns
    -------
    tuple
        ``(years, values)``. ``years`` is an integer numpy array and ``values``
        a list of the same length; both are empty lists when the row holds no
        non-missing, non-zero value.
    """
    first_year = find_first_year(df, index, numerical_columns)
    last_year = find_last_year(df, index, numerical_columns)
    if first_year is None or last_year is None:
        return [], []

    series = df.loc[index, numerical_columns]
    trimmed_timeseries = series.loc[first_year:last_year].dropna()

    years = np.array(trimmed_timeseries.index.tolist(), dtype=int)
    values = trimmed_timeseries.tolist()
    return years, values

#### Count time series with more than ten points, cut those with fewer than ten points

In [55]:
## Specify the time series columns: the years 1700 through 2025 as strings
numerical_columns = list(map(str, range(1700, 2026)))

In [56]:
# Row labels whose trimmed time series (second element returned by
# get_time_series) holds at least 10 values
indices_to_keep = [
    index
    for index in df.index
    if len(get_time_series(df, index, numerical_columns)[1]) >= 10
]

In [57]:
# Select the rows whose labels were collected in indices_to_keep
filtered_df = df.loc[indices_to_keep]


### Export dataframe as HATCH 1.5 Internal

In [58]:
## Cleaned HATCH 
# Write the filtered dataset to CSV without the dataframe index column
filtered_df.to_csv('clean_HATCHv1.5_internal.csv', index = False)

### Drop non-public data and export as HATCH 1.5 Public

In [59]:
## Drop submarine cable data (not for public use)
# na=False treats a missing variable name as "no match", as the cost filter
# does; without it a missing value yields NaN in the mask and the boolean
# negation fails
filtered_df = filtered_df[~filtered_df["Variable"].str.contains("Submarine", na=False)]

In [60]:
## Cleaned HATCH 
# Write the dataset to the public CSV file without the dataframe index column
filtered_df.to_csv('clean_HATCHv1.5_public.csv', index = False)