## Cleaning HATCH dataset for: 
- Alignment with IIAC conventions so that it can be displayed through the Scenario Explorer
- Alignment of variable names


In [328]:
# Import packages

import requests
import yaml
import pandas as pd
import pyam
from countrycode import countrycode

#### Import files from GitHub

Currently hosted in JENNA's repo

In [329]:
# YAML url
yaml_url = 'https://raw.githubusercontent.com/jennagreene22/HATCH_data/main/Tech_Growth_V1.5_variabledescriptions_Clean.yaml'

# Get YAML content using the requests package
yaml_response = requests.get(yaml_url)

# Check if the request was successful (status code 200)
if yaml_response.status_code == 200:
    # Parse the YAML content
    variable_descriptions = yaml.safe_load(yaml_response.text)

## Read in CSV file from Github
url = "https://raw.githubusercontent.com/jennagreene22/HATCH_data/main/all_tech_version_1.5_2024.csv"

df = pd.read_csv(url, index_col = None, low_memory = False)
df.drop(df.filter(regex='Unnamed').columns, axis=1, inplace=True)
df

Unnamed: 0,ID,Spatial Scale,Country Code,Country Name,Technology Name,Metric,Unit,Data Source,Long Technology Name,1700,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
0,Lithium-Ion Battery Storage_Cumulative Rated P...,National,TG,Togo,Lithium-Ion Battery Storage,Cumulative Rated Power,kW,GESDB,Cumulative Rated Power|Lithium-Ion Battery Sto...,,...,,190.0,2.900000e+02,,,,,,,
1,Lithium-Ion Battery Storage_Cumulative Rated C...,National,TG,Togo,Lithium-Ion Battery Storage,Cumulative Rated Capacity,kWh,GESDB,Cumulative Rated Capacity|Lithium-Ion Battery ...,,...,,760.0,1.160000e+03,,,,,,,
2,Sensible Heat Storage_Cumulative Rated Power_ZA,National,ZA,South Africa,Sensible Heat Storage,Cumulative Rated Power,kW,GESDB,Cumulative Rated Power|Sensible Heat Storage,,...,255000.0,355000.0,4.550000e+05,,,,,,,
3,Sensible Heat Storage_Cumulative Rated Capacit...,National,ZA,South Africa,Sensible Heat Storage,Cumulative Rated Capacity,kWh,GESDB,Cumulative Rated Capacity|Sensible Heat Storage,,...,1195000.0,1745000.0,2.945000e+06,,,,,,,
4,Onshore Wind Energy_Levelized Cost of Energy_DK,National,DK,Denmark,Onshore Wind Energy,Levelized Cost of Energy,2022 USD/kWh,IRENA,Levelized Cost of Energy|Onshore Wind Energy,,...,0.0592960908939865,0.0489829260405466,4.679236e-02,0.0485005145689907,0.0427686155112111,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8077,Objects Launched Into Space_Total Number_UA,National,UA,Ukraine,Objects Launched Into Space,Total Number,-,UNOOSA,Total Number|Objects Launched Into Space,,...,,,,,,,1,,,
8078,Objects Launched Into Space_Total Number_TM,National,TM,Turkmenistan,Objects Launched Into Space,Total Number,-,UNOOSA,Total Number|Objects Launched Into Space,,...,,,,,,,,,,
8079,Liquefied Natural Gas_Annual Production_US,National,US,United States,Liquefied Natural Gas,Annual Production,billion cubic feet,EIA,Annual Production|Liquefied Natural Gas,,...,186.84,707.54,1.083120e+03,1819.4,2389.84,3560.82,,,,
8080,Objects Launched Into Space_Total Number_CO,National,CO,Colombia,Objects Launched Into Space,Total Number,-,UNOOSA,Total Number|Objects Launched Into Space,,...,,,1.000000e+00,,,,,,,


In [330]:
exclude_columns = ['ID', 'Spatial Scale', 'Country Code', 
'Country Name', 'Technology Name', 'Metric', 'Unit', 'Data Source', 'Long Technology Name' ]

# Identify columns to convert
columns_to_convert = [col for col in df.columns if col not in exclude_columns]

# Convert the identified columns to float
df[columns_to_convert] = df[columns_to_convert].apply(pd.to_numeric, errors='coerce')


#### Clean data by removing duplicates and non-1.5 data
- remove data on cost
- remove duplicate country x technology pairs

In [331]:
# Remove all cost data

df = df[~df['Metric'].str.contains('Cost', case=False, na=False)]


In [332]:
variable_descriptions

[{'Annual Production|Acrylic Fiber': {'description': 'Annual production of acrylic fiber',
   'unit': 'million pounds',
   'category': 'Chemicals and Industry'}},
 {'Annual Production|Acrylonitrile': {'description': 'Annual production of acrylonitrile',
   'unit': 'million pounds',
   'category': 'Chemicals and Industry'}},
 {'Annual Production|Air-Source Heat Pumps': {'description': 'Annual Production of Air Source Heat Pumps',
   'unit': '-',
   'category': 'Energy End-Use'}},
 {'Annual Production|All Biofuels': {'description': 'Annual production of biofuels across types',
   'unit': 'Petajoule',
   'category': 'Energy Supply'}},
 {'Annual Production|Ammonia Synthesis': {'description': 'Annual production of synthetic ammonia through ammonia synthesis (includes nitrogen fertilizer and synthetic ammonia production)',
   'unit': 'thousand tons of nitrogen equivalent',
   'category': 'Chemicals and Industry'}},
 {'Annual Production|Aniline': {'description': 'Annual production of aniline'

In [333]:
print(df['Unit'].unique())

['kW' 'kWh' '-' 'Mtpa' 'MW' 'metric tons' 'million pounds' 'net tons'
 'Gbase' '%' 'billion cubic feet' 'million' 'million instructions/second'
 'million metric tons' 'thousand barrels/day' 'billion cubic metres' 'TWh'
 'petajoules' 'kilometers' 'miles' 'GWh' 'bits/2022 USD' 'Liters'
 'calculations per second/thousand USD' 'thousand metric tons'
 'thousand hectolitres' 'thousand tons' 'million tons'
 'no. of transistors/chip' 'thousand tons of nitrogen equivalent'
 'million cubic meters/day' 'Hz' 'short tons' 'bits/second'
 'cumulative acres planted' 'kilobase' 'GW' 'billion barrels' 'MWt'
 'GB/month' 'billion pounds' 'GWe' 'million gallons']


In [334]:
# remove duplicates by selecting for the longer time series

def get_row_length(row):
    # Subtract 10 for the other rows
    return len([value for value in row[10:] if pd.notna(value)])

# Access row length for each row
df['RowLength'] = df.apply(get_row_length, axis=1)

# Sort the DataFrame by 'RowLength' in descending order
df = df.sort_values(by='RowLength', ascending=False)

# Drop duplicates based on 'Variable' and 'Region', keeping the one with the longer time series
df = df.drop_duplicates(subset=['Long Technology Name', 'Country Code'], keep='first')

# Remove the 'RowLength' column
df = df.drop('RowLength', axis=1)


  df['RowLength'] = df.apply(get_row_length, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RowLength'] = df.apply(get_row_length, axis=1)


In [335]:
# For zinc and nickel production, drop USGS values

tech_name_to_drop = "Zinc Production"
region_to_drop = "US"
source_to_drop = "USGS"

# Create condition where all three conditions above are met
condition = (df["Technology Name"] == tech_name_to_drop) & (df["Country Code"] == region_to_drop) & (df["Data Source"] == source_to_drop)

# Drop rows that meet those three conditions
df = df.loc[~condition]

# Repeat for nickel production
tech_name_to_drop = "Nickel"
region_to_drop = "US"
source_to_drop = "Mitchell"

# Create condition where all three conditions above are met
condition = (df["Technology Name"] == tech_name_to_drop) & (df["Country Code"] == region_to_drop) & (df["Data Source"] == source_to_drop)

# Drop rows that meet those three conditions
df = df.loc[~condition]


## Drop if the source is "Schmidt & Staffell" - these are cost data
source_to_drop = "Schmidt & Staffell"
condition = (df["Data Source"] == source_to_drop)
df = df.loc[~condition]

## Change Variable Names
 - align variable names in variable description file with v1.5 file
 - drop variables that should not be public
 - drop any variables with inconsistent units

In [336]:
df = df.rename(columns={"Country Code": "Region", "Long Technology Name": "Variable"})

In [337]:
# Change variable names that are broadly different
df["Variable"] = df["Variable"].str.replace('Annual production', 'Annual Production')
df["Variable"] = df["Variable"].str.replace('Share of Households', 'Share of Market')


In [338]:
# Drop global Crude Oil for inconsistent formatting
df = df[~((df['Variable'] == 'Annual Production|Crude Oil') & (df['Region'] == 'World'))]


##### Align variable names in two documents

In [339]:
replacement_var_names = {" Average Capacity of Unit Additions|Fluid Catalytic Cracking Refineries": "Average Capacity of Unit Additions|Fluid Catalytic Cracking Refineries",
                        "Annual Production|Milk Production": "Annual Production|Milk", 
                        "Annual Production|Beer Production": "Annual Production|Beer", 
                        "Annual Production|Nickel Production": "Annual Production|Nickel", 
                        "Annual Production|Oil Production": "Annual Production|Oil", 
                        "Annual Production|Primary Aluminum Production": "Annual Production|Primary Aluminum", 
                        "Annual Production|Raw Steel Production": "Annual Production|Raw Steel",
                        "Annual Production|Salt Production": "Annual Production|Salt",
                        "Annual Production|Aquaculture Production": "Annual Production|Aquaculture",
                        "Annual Production|Acrylic Fiber": "Annual Production|Acrylic Fiber",
                        "Annual Production|Acrylonitrile": "Annual Production|Acrylonitrile",
                        "Annual Production|Coal Production": "Annual Production|Coal",
                        "Annual Production|Nuclear Energy": "Annual Production|Electricity|Nuclear",
                        "Annual Production|Liquefied Natural Gas": "Annual Production|Liquefied Natural Gas Exports",
                        "Annual Production|Primary Bauxite Production":"Annual Production|Primary Bauxite",
                        "Annual Production|Shale Production": "Annual Production|Shale Oil",
                        "Annual Production|Artificial and Synthetic Fibers" : "Annual Production|Synthetic Filaments",
                        "Annual Production|Biofuels Production": "Annual Production|All Biofuels",
                        "Annual Production|Sugar Output": "Annual Production|Cane Sugar", 
                        "Annual Production|Caustic Soda Acid" : "Annual Production|Caustic Soda", 
                        "Annual Production|Crude Petroleum": "Annual Production|Crude Oil",
                        "Total Number|Shotgun Sanger DNA Sequencing": "Annual Production|Shotgun Sanger DNA Sequencing",
                        "Computing Capacity|Processor Performance":"Computing Capacity|Processing Performance",
                        "Computing Capacity|Transistors per Microprocessor": "Computing Capacity|Transistors per Microprocessor Chip",
                        "Cumulative Total Capacity|High-Speed Rail": "Cumulative Number of Units|High Speed Rail",
                        "Total Length|Natural Gas Pipeline": "Cumulative Number of Units|Natural Gas Pipelines",
                        "Total Length|Oil Pipeline": "Cumulative Number of Units|Oil Pipelines",
                        "Total Length|Public Roads":"Cumulative Number of Units|Public Roads",
                        "Total Length|Railroad": "Cumulative Number of Units|Railroad Tracks", 
                        "Cumulative Length|Railroad":"Cumulative Number of Units|Railroad Tracks",
                        "Total Number|Television":"Number of Units|Televisions",
                        "Cumulative Rated Capacity|Compressed Air Energy Storage":"Cumulative Rated Capacity|Electricity|Compressed Air Energy Storage",
                        "Cumulative Rated Capacity|Electro-Chemical Capacitor": "Cumulative Rated Capacity|Electricity|Electro-Chemical Capacitor",
                        "Cumulative Rated Capacity|Flow Battery Storage": "Cumulative Rated Capacity|Electricity|Flow Battery Storage",
                        "Cumulative Rated Capacity|Flywheel Battery Storage": "Cumulative Rated Capacity|Electricity|Flywheel Battery Storage",
                        "Cumulative Rated Capacity|Lead-Acid Battery Storage": "Cumulative Rated Capacity|Electricity|Lead-Acid Battery Storage",
                        "Cumulative Rated Capacity|Lithium-Ion Battery Storage": "Cumulative Rated Capacity|Electricity|Lithium-Ion Battery Storage",
                        "Cumulative Rated Capacity|Pumped Hydro Storage": "Cumulative Rated Capacity|Electricity|Pumped Hydro Storage",
                        "Cumulative Rated Capacity|Sodium-Based Battery Storage": "Cumulative Rated Capacity|Electricity|Sodium-Based Battery Storage",
                        "Cumulative Rated Capacity|Zinc-Based Battery": "Cumulative Rated Capacity|Electricity|Zinc-Based Battery Storage",
                        "Cumulative Rated Capacity|Nickel-Based Battery":"Cumulative Rated Capacity|Nickel-Based Battery Storage",
                        "Cumulative Rated Capacity|Heat Thermal Battery Storage":"Cumulative Rated Capacity|Thermal Energy Storage|Heat Thermal Battery Storage",
                        "Cumulative Rated Capacity|Latent Heat Storage": "Cumulative Rated Capacity|Thermal Energy Storage|Latent Heat Storage",
                        "Cumulative Rated Capacity|Sensible Heat Storage": "Cumulative Rated Capacity|Thermal Energy Storage|Sensible Heat Storage",
                        "Cumulative Rated Power|Compressed Air Energy Storage": "Cumulative Rated Power|Electricity|Compressed Air Energy Storage",
                        "Cumulative Rated Power|Electro-Chemical Capacitor":"Cumulative Rated Power|Electricity|Electro-Chemical Capacitor",
                        "Cumulative Rated Power|Flow Battery Storage":"Cumulative Rated Power|Electricity|Flow Battery Storage",
                        "Cumulative Rated Power|Flywheel Battery Storage":"Cumulative Rated Power|Electricity|Flywheel Battery Storage",
                        "Cumulative Rated Power|Lead-Acid Battery Storage":"Cumulative Rated Power|Electricity|Lead-Acid Battery Storage",
                        "Cumulative Rated Power|Lithium-Ion Battery Storage":"Cumulative Rated Power|Electricity|Lithium-Ion Battery Storage",
                        "Cumulative Rated Power|Pumped Hydro Storage":"Cumulative Rated Power|Electricity|Pumped Hydro Storage",
                        "Cumulative Rated Power|Sodium-Based Battery Storage": "Cumulative Rated Power|Electricity|Sodium-Based Battery Storage",
                        "Cumulative Rated Power|Zinc-Based Battery": "Cumulative Rated Power|Electricity|Zinc-Based Battery Storage",
                        "Cumulative Rated Power|Nickel-Based Battery":"Cumulative Rated Power|Nickel-Based Battery Storage",
                        "Cumulative Rated Power|Heat Thermal Battery Storage":"Cumulative Rated Power|Thermal Energy Storage|Heat Thermal Battery Storage",
                        "Cumulative Rated Power|Latent Heat Storage":"Cumulative Rated Power|Thermal Energy Storage|Latent Heat Storage",
                        "Cumulative Rated Power|Sensible Heat Storage": "Cumulative Rated Power|Thermal Energy Storage|Sensible Heat Storage",
                        "Cumulative Total Capacity|Compact Fluorescent Light Bulbs": "Cumulative Total Capacity|Compact Flourescent Light bulbs",
                        "Total Capacity|Desalination Capacity": "Cumulative Total Capacity|Desalination",
                        "Cumulative Total Capacity|Natural Gas Power": "Cumulative Total Capacity|Electricity|Natural Gas",
                        "Cumulative Total Capacity|Jet Aircraft": "Cumulative Total Capacity|Jet Aircrafts",
                        "Installed Capacity|Carbon Capture and Sequestration": "Installed Capacity|Carbon Capture & Sequestration",
                        "Installed electricity capacity|Biogas": "Installed Capacity|Electricity|Biogas",
                        "Installed Capacity|Concentrated Solar Power": "Installed Capacity|Electricity|Concentrated Solar",
                        "Installed electricity capacity|Geothermal Energy": "Installed Capacity|Electricity|Geothermal Energy",
                        "Installed electricity capacity|Liquid Biofuels": "Installed Capacity|Electricity|Liquid Biofuels", 
                        "Installed electricity capacity|Marine Energy": "Installed Capacity|Electricity|Marine Energy",
                        "Installed electricity capacity|Offshore Wind Energy": "Installed Capacity|Electricity|Offshore Wind",
                        "Installed electricity capacity|Onshore Wind Energy": "Installed Capacity|Electricity|Onshore Wind",
                        "Installed electricity capacity|Solar Photovoltaic": "Installed Capacity|Electricity|Solar Photovoltaic", 
                        "Installed electricity capacity|Solid Biofuels": "Installed Capacity|Electricity|Solid Biomass",
                        "Installed electricity capacity|Solar Thermal Energy": "Installed Capacity|Solar Thermal Energy",
                        "Total Length|Canals": "Number of Units|Canals",
                        "Total Number|Cellphones": "Number of Units|Cellphones",
                        "Total Number|Crop Harvester":"Number of Units|Crop Harvester",
                        "Total Number|Nuclear Weapons": "Number of Units|Nuclear Weapons",
                        "Total Number|Passenger Cars": "Number of Units|Passenger Cars",
                        "Total Number|Postal Traffic": "Number of Units|Postal Traffic",
                        "Total Number|Radio":"Number of Units|Radios",
                        "Total Number|Space Launches": "Number of Units|Space Launches",
                        "Total Number|Passenger Vehicles": "Number of Units|Passenger Cars", 
                        "Total Number|Steamships": "Number of Units|Steamships", 
                        "Total Number|Objects Launched Into Space": "Number of Units|Space Launches", 
                        "Total Capacity|Oil Refining Capacity":"Total Capacity|Oil Refining", 
                        ""
                        "Cumulative total capacity|Steamships": "Number of Units|Steamships",
                        "Total Number|Telegraph Traffic": "Number of Units|Telegraph Traffic",
                        "Total Number|Telephones": "Number of Units|Telephones",
                        "Share of Market|Cable TV":"Share of Market|Cable Television",
                        "Share of Market|Colour TV":"Share of Market|Colour Television",
                        "Share of Market|Disk Brakes": "Share of Market|Disc Brakes",
                        "Share of Market|Freezer": "Share of Market|Freezers",
                        "Share of Market|Households With Only Mobile Phones (No Landlines)":"Share of Market|Households with Only Mobile Phones",
                        "Share of Market|Iron": "Share of Market|Irons",
                        "Share of Boilers|Nox Pollution Controls (Boilers)": "Share of Market|Nox Pollution Control Technologies",
                        "Share of Market|Real-Time Gross Settlement Adoption":"Share of Market|Real-time Gross Settlement",
                        "Share of Market|Stove": "Share of Market|Stoves",
                        "Share of Market|Videocassette Recorder": "Share of Market|Videocassette recorder",
                        "Share of Population|DTP1 Vaccine": "Share of Population|Diphtheria and Tetanus Vaccine First Dose",
                        "Share of Population|DTP3 Vaccine": "Share of Population|Diphtheria and Tetanus Vaccine Third Dose",
                        "Share of Population|ROTAC Vaccine": "Share of Population|Rotavirus Vaccine",
                        "Share of Population|BCG Vaccine": "Share of Population|Tuberculosis Vaccine", 
                        "Share of Population|RCV1 Vaccine": "Share of Population|Rubella Vaccine First Dose",
                        "Share of Population|YFV Vaccine": "Share of Population|Yellow Fever Vaccine", 
                        "Share of Population|POL3 Vaccine": "Share of Population|Polio Vaccine Third Dose", 
                        "Share of Population|PCV3 Vaccine": "Share of Population|Pneumococcal Conjugate Vaccine", 
                        "Share of Population|MCV1 Vaccine": "Share of Population|Measles Vaccine First Dose",
                        "Share of Population|MCV2 Vaccine": "Share of Population|Measles Vaccine Second Dose",
                        "Share of Population|IPV1 Vaccine": "Share of Population|Inactivated Polio Vaccine", 
                        "Share of Population|HEPB3 Vaccine": "Share of Population|Hepatitis B Vaccine Third Dose", 
                        "Share of Population|HEPBB Vaccine": "Share of Population|Hepatitis B Vaccine First Dose",
                        "Share of Population|HIB3 Vaccine": "Share of Population|Haemophilus Influenzae Vaccine Third Dose", 
                        "Cumulative Total Capacity|Oil Refining Capacity": "Total Capacity|Oil Refining",
                        "Annual Production|BisphenolA": "Annual Production|Bisphenol A",
                        "Annual Production|Cane Sugar Production": "Annual Production|Cane Sugar",
                        "Annual Production|Capture Fisheries Production": "Annual Production|Capture Fisheries",
                        "Annual Production|Cement Production": "Annual Production|Cement",
                        "Annual Production|Cobalt Mine Production": "Annual Production|Cobalt",
                        "Annual Production|Construction|Sand and Gravel Construction": "Annual Production|Sand and Gravel|Construction",
                        "Annual Production|Copper Mining": "Annual Production|Copper|Mining",
                        "Annual Production|Copper Refining": "Annual Production|Copper|Refining",
                        "Annual Production|Ethyl Alcohol": "Annual Production|Ethanol",
                        "Annual Production|Gold Production": "Annual Production|Gold",
                        "Annual Production|Graphite Mine Production": "Annual Production|Graphite",
                        "Annual Production|Industrial|Sand and Gravel Industrial": "Annual Production|Sand and Gravel|Industrial",
                        "Annual Production|Lead Mines": "Annual Production|Lead",
                        "Annual Production|Primary Copper Production": "Annual Production|Primary Copper",
                        "Annual Production|Zinc Production": "Annual Production|Zinc", 
                        "Annual Production|Tin Production": "Annual Production|Tin", 
                        "Annual Production|Silver Production": "Annual Production|Silver"}

df['Variable'] = df['Variable'].replace(replacement_var_names)

### Change Unit Names 
- per Daniel's recommendation


In [340]:
rename_units = {
    "Yearly Production (Mil. lbs)": "million pounds",
    "Yearly  Production (Billion Pounds)": "billion pounds",
    "In Thousands Of Hectolitres": "thousand hectolitres",
    "In Thousand Hectolitres": "thousand hectolitres",
    "Petajoules": "Petajoule",
    "petajoules": "Petajoule",
    "Thousand tons nitrogen equivalent": "thousand tons of nitrogen equivalent",
    "Metric tons": "metric tons",
    "Thousands Metric Tons": "thousand metric tons",
    "Thousand Metric Tons": "thousand metric tons",
    "Yearly Production (Million Gallons)": "million gallons",
    "Million tonnes": "million metric tons",
    "Million Pounds": "million pounds",
    "Million pounds": "million pounds",
    "Thousand tonnes": "thousand metric tons",
    "In Thousand Metric Tons": "thousand metric tons",
    "Tonnes": "metric tons",
    "Metric Tons": "metric tons",
    "Terawatt-hours": "TWh",
    "Miles": "miles",
    "Kilometers": "kilometer",
    "Twh": "TWh",
    "Percentage": "%",
    "Share of acres planted": "%",
    "Number in Use": "-",
    "cumulative acres planted": "Acres",
    "million instructions/second": "Million instructions per second (MIPS)",
    "GB/month": "Gigabytes per month",
    "calculations per second/1000 USD": "Calculations per second per $1,000",
    "bits/second": "Bits/second",
    "million cubic meters/day": "Million cubic meters per day",
    "billion cubic metres": "billion cubic meters"
}

# Apply the rename_units dictionary
df['Unit'] = df['Unit'].replace(rename_units)

## Fixing Units
For milk production, Mitchell data is inconsistent by the country. This is the standardization.

For nickel production, Mitchell and USGS data is inconsistent.

In [341]:
# Function to convert between metric tons, thousand metric tons, and million metric tons
def apply_unit_conversion(row):
    if (
        (row["Variable"] == "Annual Production|Nickel" or row["Variable"] == "Annual Production|Tin" or row["Variable"] == "Annual Production|Zinc")
        and row["Unit"] == "metric tons"
    ):
        row[10:] /= 1000  # divide the values by 1000
        row["Unit"] = "thousand metric tons"
    elif (
        (row["Variable"] == "Annual Production|Milk" or row["Variable"] == "Annual Production|Crude Oil")
        and row["Unit"] == "million metric tons"
    ):
        row[10:] *= 1000  # multiply the values by 1000
        row["Unit"] = "thousand metric tons"
    return row


In [342]:
# Apply function to nickel, milk, zinc, and tin production
mask = (
    (df["Variable"] == "Annual Production|Nickel") 
    & (df["Unit"] == "metric tons")
) | (
    (df["Variable"] == "Annual Production|Milk") 
    & (df["Unit"] == "million metric tons")
) | (
    (df["Variable"] == "Annual Production|Zinc")
    & (df["Unit"] == 'metric tons')
) | (
    (df["Variable"] == "Annual Production|Crude Oil")
    & (df["Unit"] == "million metric tons")
) | (
    (df["Variable"] == "Annual Production|Tin")
    & (df["Unit"] == 'metric tons')
)


df.loc[mask] = df.loc[mask].apply(apply_unit_conversion, axis=1)

### Change Units in CSV files that are mismatched

In [343]:

# Change unit name for ASHPs
df.loc[df["Unit"] == "U.S. Manufacturers Shipments", "Unit"] = "Number of Shipments"

# Change unit name for Beer production
df.loc[(df["Variable"] == "Annual Production|Beer Production"), "Unit"] = "thousand hectolitres"

# Change unit name for Cane sugar production
df.loc[(df["Variable"] == "Annual Production|Cane Sugar Production"), "Unit"] = "thousand metric tons"


# Change unit name for Primary magnesium
df.loc[(df["Variable"] == "Annual Production|Primary Magnesium"), "Unit"] = "Short tons"

# Change unit name for Magnesium
df.loc[(df["Variable"] == "Annual Production|Magnesium"), "Unit"] = "million pounds"

# Change unit name for Railroad
df.loc[(df["Variable"] == "Cumulative Number of Units|Railroad Tracks"), "Unit"] = "kilometers"

# Change unit name for transistors per microprocessor chip
df.loc[(df["Variable"] == "Computing Capacity|Transistors per Microprocessor Chip"), "Unit"] = "Transistors per microprocessor"

# Change unit name for magnetic data storage
df.loc[(df["Variable"] == "Computing Capacity|Magnetic Data Storage"), "Unit"] = "Bits per dollar"


# Change unit name for Random Access Memory
df.loc[(df["Variable"] == "Computing Capacity|Random Access Memory"), "Unit"] = "Bits per dollar"

# Change unit name for Telegraph traffic
df.loc[df["Variable"] == "Number of Units|Telegraph Traffic", "Unit"] = "Number (millions)"

# Change unit name for postal traffic
df.loc[df["Variable"] == "Number of Units|Postal Traffic", "Unit"] = "Number (millions)"

# Change unit name for oil refining
df.loc[df["Variable"] == "Total Capacity|Oil Refining", "Unit"] = "thousand barrels"

#Change unit name for milk production
df.loc[df["Variable"] == "Annual Production|Milk", "Unit"] = "thousand metric tons"

df.loc[df["Variable"] == "Computing Capacity|Computing Growth", "Unit"] = "Calculations per second per $1,000"


#### Ensure alignment of units and variable names in two files and ensure no duplicates
- final cleaning

In [344]:
# Convert units to dictionary to ensure alignment of unit names
unit_dict = df.set_index('Variable')['Unit'].to_dict()
unit_dict

{'Annual Production|Cane Sugar': 'thousand metric tons',
 'Annual Production|Beer': 'thousand hectolitres',
 'Cumulative Number of Units|Railroad Tracks': 'kilometers',
 'Number of Units|Radios': '-',
 'Number of Units|Telegraph Traffic': 'Number (millions)',
 'Number of Units|Postal Traffic': 'Number (millions)',
 'Annual Production|Gold': 'metric tons',
 'Annual Production|Crude Oil': 'thousand metric tons',
 'Annual Production|Zinc': 'thousand metric tons',
 'Cumulative Total Capacity|Bicycles': 'MW',
 'Number of Units|Steamships': '-',
 'Annual Production|Silver': 'metric tons',
 'Annual Production|Sulphuric Acid': 'thousand metric tons',
 'Cumulative Number of Units|Public Roads': 'miles',
 'Annual Production|Primary Bauxite': 'metric tons',
 'Annual Production|Cadmium Refining': 'metric tons',
 'Annual Production|Nickel': 'thousand metric tons',
 'Annual Production|Primary Aluminum': 'metric tons',
 'Annual Production|Cement': 'metric tons',
 'Annual Production|Rare Earth Mine Pr

In [345]:
data_type = df['Variable'].dtype
print(data_type)

object


In [346]:
# Check alignment of units

# Check units from the YAML file
for item in variable_descriptions:
    variable, data = list(item.items())[0]
    if variable in unit_dict:
        expected_unit = unit_dict[variable]
        unit = data.get('unit')
        unit
        if unit and unit.endswith(expected_unit):
            print(f"{variable}: Unit is consistent ({unit}).")
            pass
        else:
            print(f"{variable}: Unit mismatch - expected {expected_unit} from CSV file, but got {unit} in YAML file.")



Annual Production|Acrylic Fiber: Unit is consistent (million pounds).
Annual Production|Acrylonitrile: Unit is consistent (million pounds).
Annual Production|Air-Source Heat Pumps: Unit is consistent (-).
Annual Production|All Biofuels: Unit is consistent (Petajoule).
Annual Production|Ammonia Synthesis: Unit is consistent (thousand tons of nitrogen equivalent).
Annual Production|Aniline: Unit is consistent (million pounds).
Annual Production|Aquaculture: Unit is consistent (metric tons).
Annual Production|Beer: Unit is consistent (thousand hectolitres).
Annual Production|Benzene: Unit is consistent (million gallons).
Annual Production|Bisphenol A: Unit is consistent (million pounds).
Annual Production|Cadmium Refining: Unit is consistent (metric tons).
Annual Production|Cane Sugar: Unit is consistent (thousand metric tons).
Annual Production|Caprolactam: Unit is consistent (million pounds).
Annual Production|Capture Fisheries: Unit is consistent (metric tons).
Annual Production|Causti

In [347]:
# Check missing variables in each file. 


# Check for missing variables in the YAML file
for item in variable_descriptions:
    variable, _ = list(item.items())[0]
    df_variables_lst = df['Variable'].tolist()
    if variable not in df_variables_lst:
        print(f"{variable}: Variable not found in the CSV file.")

        
# Check for missing variables in the CSV file
df_variables_lst = df['Variable'].unique()
df_variables_lst = df_variables_lst.tolist()
for variable in df_variables_lst:
    if variable not in [list(item.keys())[0] for item in variable_descriptions]:
        print(f"{variable}: Variable not found in the YAML file.")


Number of Units|Jet Aircrafts: Variable not found in the CSV file.
Cumulative Length|Submarine Cables: Variable not found in the YAML file.
Number of Units|Jet Aircraft: Variable not found in the YAML file.
Annual Production|Capillary DNA Sequencing: Variable not found in the YAML file.


In [348]:
## Check 

# Group the DataFrame by "Variable" and check if each group has a consistent unit
inconsistent_units = {}
grouped = df.groupby("Variable")
for variable, group in grouped:
    unique_units = group["Unit"].unique()
    if len(unique_units) > 1:
        inconsistent_units[variable] = unique_units

if len(inconsistent_units) > 0:
    print("Variables with inconsistent units:")
    for variable, units in inconsistent_units.items():
        print(f"{variable}: {', '.join(units)}")
else:
    print("All variables have consistent units.")

Variables with inconsistent units:
Annual Production|Ethanol: Liters, million pounds
Annual Production|Liquefied Natural Gas Exports: Mtpa, billion cubic feet


In [349]:


## Cleaned HATCH 
df.to_csv('clean_HATCHv1.5_internal.csv', index = False)

KeyboardInterrupt: 

In [None]:
## Drop submarine cable data (not for public use)
df = df[~df["Variable"].str.contains("Submarine")]

In [None]:


## Cleaned HATCH 
df.to_csv('clean_HATCHv1.5_public.csv', index = False)

## Save IIAC aligned CSV 

#### Align country codes and variables names with IIAC conventions

In [None]:
def get_iso3c(region):
    
    if region == "World":
    
        return "World"
    else: 
        
        # Change iso2c code to iso3c code
        code = countrycode(region, 
                           origin = "iso2c", 
                           destination = "iso3c")
        return code

In [None]:
# Apply iso3c function

df["Region"] = df["Region"].astype(str).apply(get_iso3c)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Region"] = df["Region"].astype(str).apply(get_iso3c)


In [None]:
# Drop any region without a standard iso3c name (West Germany, East Germany, South Vietnam)
df = df[df["Region"].notna()]


In [None]:
## remove a list of columns
columns_to_remove = ["ID", "Spatial Scale", "Country Name", "Metric", "Data Source", "Technology Name"]
df = df.drop(columns=columns_to_remove)



In [None]:
## Then add the IIAC columns
df.insert(0, "Model", "Technology Indicators")
df.insert(1, "Scenario", "Release 2023.1.5")


In [None]:
columns_to_keep_as_strings = ['Model', 'Scenario', 'Region', 'Unit', 'Variable']
columns_to_convert_to_numeric = [col for col in df.columns if col not in columns_to_keep_as_strings]
df[columns_to_convert_to_numeric] = df[columns_to_convert_to_numeric].apply(pd.to_numeric, errors='coerce')

In [None]:


df.to_csv('Tech_Growth_V1.5_IIAC_Clean.csv', index = False)

KeyboardInterrupt: 

In [None]:


if 'unnamed: 0' in df.columns:
    # Drop the 'unnamed: 0' column
    df.drop(columns=['Unnamed:  0'], inplace=True)
    


df

Unnamed: 0,Model,Scenario,Region,Unit,Variable,1700,1701,1702,1703,1704,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
3072,Technology Indicators,Release 2023.1.5,JAM,thousand metric tons,Annual Production|Cane Sugar,,,,,,...,,,,,,,,,,
3160,Technology Indicators,Release 2023.1.5,BRB,thousand metric tons,Annual Production|Cane Sugar,,,,,,...,,,,,,,,,,
2906,Technology Indicators,Release 2023.1.5,CUB,thousand metric tons,Annual Production|Cane Sugar,,,,,,...,,,,,,,,,,
3147,Technology Indicators,Release 2023.1.5,GBR,thousand hectolitres,Annual Production|Beer,,,,,,...,,,,,,,,,,
2981,Technology Indicators,Release 2023.1.5,MUS,thousand metric tons,Annual Production|Cane Sugar,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5576,Technology Indicators,Release 2023.1.5,GNQ,%,Share of Population|Measles Vaccine Second Dose,,,,,,...,,,,,,0.17,,,,
5563,Technology Indicators,Release 2023.1.5,CIV,%,Share of Population|Measles Vaccine Second Dose,,,,,,...,,,,,,0.01,,,,
5559,Technology Indicators,Release 2023.1.5,COM,%,Share of Population|Measles Vaccine Second Dose,,,,,,...,,,,,,0.19,,,,
2830,Technology Indicators,Release 2023.1.5,BRA,kW,Cumulative Rated Power|Electricity|Lead-Acid B...,,,,,,...,,,,,392.0,,,,,


#### Test for issues with IIAC format

In [None]:
test = pyam.IamDataFrame("Tech_Growth_V1.5_IIAC_Clean.csv")


pyam - INFO: Running in a notebook, setting up a basic logging at level INFO
pyam.core - INFO: Reading file Tech_Growth_V1.5_IIAC_Clean.csv
  index = pd.unique(index)


In [None]:
test

<class 'pyam.core.IamDataFrame'>
Index:
 * model    : Technology Indicators (1)
 * scenario : Release 2023.1.5 (1)
Timeseries data coordinates:
   region   : ABW, AFG, AGO, AIA, ALB, AND, ARE, ARG, ARM, ASM, ... ZWE (232)
   variable : Annual Production|Acrylic Fiber, ... (225)
   unit     : %, -, Acres, Bits per dollar, Bits/second, ... thousand tons of nitrogen equivalent (41)
   year     : 1720, 1730, 1740, 1750, 1751, 1752, 1753, 1754, ... 2023 (277)
Meta indicators:
   exclude (bool) False (1)

In [None]:
test.timeseries()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,1720,1730,1740,1750,1751,1752,1753,1754,1755,1756,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
model,scenario,region,variable,unit,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
Technology Indicators,Release 2023.1.5,ABW,Annual Production|Aquaculture,metric tons,,,,,,,,,,,...,0.00,0.000000,0.000000,0.50,1.00,1.50,1.50,,,
Technology Indicators,Release 2023.1.5,ABW,Annual Production|Capture Fisheries,metric tons,,,,,,,,,,,...,152.00,150.000000,150.000000,149.00,149.00,163.00,155.00,,,
Technology Indicators,Release 2023.1.5,ABW,Installed Capacity|Electricity|Onshore Wind,MW,,,,,,,,,,,...,30.00,30.000000,30.000000,30.00,30.00,30.00,30.00,30.00,30.0,
Technology Indicators,Release 2023.1.5,ABW,Installed Capacity|Electricity|Solar Photovoltaic,MW,,,,,,,,,,,...,4.90,6.100000,6.100000,6.10,6.10,6.10,6.10,6.10,,
Technology Indicators,Release 2023.1.5,ABW,Share of Market|Household Internet Access,%,,,,,,,,,,,...,83.78,88.661227,93.542454,97.17,,,,,,
Technology Indicators,Release 2023.1.5,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Technology Indicators,Release 2023.1.5,ZWE,Share of Population|Pneumococcal Conjugate Vaccine,%,,,,,,,,,,,...,0.91,0.870000,0.900000,0.89,0.89,0.90,0.86,0.86,,
Technology Indicators,Release 2023.1.5,ZWE,Share of Population|Polio Vaccine Third Dose,%,,,,,,,,,,,...,0.92,0.880000,0.900000,0.89,0.89,0.90,0.86,0.86,,
Technology Indicators,Release 2023.1.5,ZWE,Share of Population|Rotavirus Vaccine,%,,,,,,,,,,,...,0.48,0.870000,0.910000,0.91,0.90,0.92,0.88,0.88,,
Technology Indicators,Release 2023.1.5,ZWE,Share of Population|Rubella Vaccine First Dose,%,,,,,,,,,,,...,,,0.950000,0.90,0.88,0.85,0.85,0.85,,
