In [1]:
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Load the technology table without pandas' default NA handling: the default list
# contains 'NA', which would turn the ISO code for Namibia into a missing value.
# The markers below are the defaults entered by hand, minus 'NA'.
# https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
na_markers = [
    '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a',
    '<NA>', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', 'None', '',
]
all_tech = pd.read_csv(
    'all_tech_version 2.0.csv',
    keep_default_na=False,
    na_values=na_markers,
)
all_tech

Unnamed: 0,ID,Spatial Scale,Country Code,Country Name,Technology Name,Metric,Unit,Data Source,Long Technology Name,1700,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
0,Lithium-Ion Battery Storage_Cumulative Rated P...,National,TG,Togo,Lithium-Ion Battery Storage,Cumulative Rated Power,kW,GESDB,Cumulative Rated Power|Lithium-Ion Battery Sto...,,...,,1.900000e+02,2.900000e+02,,,,,,,
1,Lithium-Ion Battery Storage_Cumulative Rated C...,National,TG,Togo,Lithium-Ion Battery Storage,Cumulative Rated Capacity,kWh,GESDB,Cumulative Rated Capacity|Lithium-Ion Battery ...,,...,,7.600000e+02,1.160000e+03,,,,,,,
2,Sensible Heat Storage_Cumulative Rated Power_ZA,National,ZA,South Africa,Sensible Heat Storage,Cumulative Rated Power,kW,GESDB,Cumulative Rated Power|Sensible Heat Storage,,...,2.550000e+05,3.550000e+05,4.550000e+05,,,,,,,
3,Sensible Heat Storage_Cumulative Rated Capacit...,National,ZA,South Africa,Sensible Heat Storage,Cumulative Rated Capacity,kWh,GESDB,Cumulative Rated Capacity|Sensible Heat Storage,,...,1.195000e+06,1.745000e+06,2.945000e+06,,,,,,,
4,Onshore Wind Energy_Levelized Cost of Energy_DK,National,DK,Denmark,Onshore Wind Energy,Levelized Cost of Energy,2022 USD/kWh,IRENA,Levelized Cost of Energy|Onshore Wind Energy,,...,5.929609e-02,4.898293e-02,4.679236e-02,0.048501,0.042769,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8638,Tin Production_Annual production_World,Global,World,World,Tin Production,Annual production,metric tons,USGS,Annual production|Tin Production,,...,5.226023e+05,5.767229e+05,5.462801e+05,500615.785555,,,,,,
8639,Objects Launched Into Space_Total Number_UA,National,UA,Ukraine,Objects Launched Into Space,Total Number,-,UNOOSA,Total Number|Objects Launched Into Space,,...,,,,,,,1.0,,,
8640,Objects Launched Into Space_Total Number_TM,National,TM,Turkmenistan,Objects Launched Into Space,Total Number,-,UNOOSA,Total Number|Objects Launched Into Space,,...,,,,,,,,,,
8641,Liquefied Natural Gas_Annual Production_US,National,US,United States,Liquefied Natural Gas,Annual Production,billion cubic feet,EIA,Annual Production|Liquefied Natural Gas,,...,1.868400e+02,7.075400e+02,1.083120e+03,1819.400000,2389.840000,3560.82,,,,


In [3]:
# Sanity check: every value from the 10th column onwards must be a plain float or int.
# Anything else is printed with its column and type; no output means the check passed.
# NOTE(review): the offset 9 assumes exactly nine leading metadata columns -- confirm
# against the loaded frame (the displayed header also lists an 'Unnamed: 0' column).
for col in all_tech.columns[9:]:
    unexpected = [val for val in all_tech[col] if type(val) not in (float, int)]
    for val in unexpected:
        print(col, val, type(val))

In [4]:
# Sanity check: every value in the first nine columns must be a str.
# Anything else is printed with its column and type; no output means the check passed.
for col in all_tech.columns[:9]:
    non_strings = [val for val in all_tech[col] if type(val) is not str]
    for val in non_strings:
        print(col, val, type(val))

In [5]:
# checking for all unique IDs (technology/metric/country)
# Every repeated occurrence of an ID (the second one and later) is printed in row
# order; no output means all IDs are unique.
id_values = all_tech['ID']

# `dup` keeps its original content: the distinct IDs in first-seen order.
dup = id_values.drop_duplicates().tolist()

# duplicated() (keep='first') flags exactly the rows the old `x not in dup` list scan
# flagged, without the quadratic list-membership test.
for x in id_values[id_values.duplicated()]:
    print(x)

In [6]:
# List every Country Code that is not exactly two characters long.
# Besides the aggregate labels, the following non-ISO2 codes are used on purpose:
#   DEU - Western Germany (the current iso2 code DE also covers Germany before 1945
#         and after 1990)
#   KOR - Korea (the current iso2 code KR also covers South Korea after 1950)
#   CSK - Czechoslovakia (as opposed to the Czech Republic after 1992, CZ, and
#         Serbia and Montenegro, CS)
#   VNM - South Vietnam (as opposed to Vietnam, 1955-1975; the current iso2 code VN
#         covers both)
country_codes = list(set(all_tech['Country Code']))
non_two_letter_codes = [code for code in country_codes if len(code) != 2]
for code in non_two_letter_codes:
    print(code)

Europe
KOR
Asia
World
DEU
Rest of World
VNM
North America
CSK


In [7]:
# Distinct Spatial Scale values; expected to be only Global and National.
{*all_tech['Spatial Scale']}

{'Global', 'National'}

In [8]:
# Distinct Country Code values among the rows whose Spatial Scale is Global
# (these should all be world-level labels).
set(all_tech.loc[all_tech['Spatial Scale'] == 'Global', 'Country Code'])

{'Rest of World', 'World'}

In [9]:
# Distinct Country Name values among the rows whose Spatial Scale is Global
# (these should all be world-level labels).
set(all_tech.loc[all_tech['Spatial Scale'] == 'Global', 'Country Name'])

{'Rest of World', 'World'}

In [10]:
# Country Code values used by the rows whose Country Name is World.
set(all_tech.loc[all_tech['Country Name'] == 'World', 'Country Code'])

{'World'}

In [11]:
# Spatial Scale values used by the rows whose Country Name is World.
set(all_tech.loc[all_tech['Country Name'] == 'World', 'Spatial Scale'])

{'Global'}

In [12]:
# Spatial Scale values used by the rows whose Country Code is World.
set(all_tech.loc[all_tech['Country Code'] == 'World', 'Spatial Scale'])

{'Global'}

In [13]:
# Country Name values used by the rows whose Country Code is World.
# The previous version filtered on Country Name and then displayed Country Name, so
# it could only ever return {'World'}; filtering on Country Code makes this the
# reverse of the Name -> Code check and allows it to actually fail.
set(all_tech[all_tech['Country Code']=='World']['Country Name'])

{'World'}

In [14]:
# Distinct Country Name values (arbitrary order), used by the name -> code check.
country_names = list({*all_tech['Country Name']})

In [15]:
# For every country code, collect the distinct country names it appears with, then
# print the codes that carry more than one name. The printed names must be reviewed
# by eye: spelling variants of one entity are fine, but two different entities
# (e.g. North Korea and South Korea) must not share a code.
country_code_dict = {
    code: list(set(all_tech.loc[all_tech['Country Code'] == code, 'Country Name']))
    for code in country_codes
}

for code, names in country_code_dict.items():
    if len(names) > 1:
        print(code, names)

KG ['Kyrgyzstan', 'Kyrgistan', 'Kyrgyz Republic']
US ['United States', 'Boeing', 'USA', 'US', 'United States of America']
AE ['United Arab Emirates', 'UAE']
SU ['U.S.S.R.', 'USSR']
TR ['Turkey', 'Türkiye', 'Turkiye']
TT ['Trinidad and Tobago', 'Trinidad & Tobago']
BF ['Burkina Faso', 'Burkina-Faso']
FM ['Micronesia, Fed. Sts.', 'Micronesia (Federated States of)']
MM ['Burma', 'Myanmar']
LA ["Lao People's Democratic Republic", 'Laos', 'Lao PDR']
KN ['Saint Kitts and Nevis', 'St. Kitts and Nevis']
CZ ['Czechia', 'Czech Republic']
MF ['St. Martin (French part)', 'Saint Martin (French Part)']
NL ['Netherlands', 'Netherlands (Kingdom of the)']
SZ ['Swaziland', 'Eswatini']
EG ['Egypt', 'Egypt, Arab Rep.']
YE ['Yemen, Rep.', 'Yemen']
GM ['Gambia, The', 'Gambia']
LC ['St. Lucia', 'Saint Lucia']
CD ['Democratic Republic of Congo', 'Congo (Kinshasa)', 'Democratic Republic of the Congo', 'Congo, Dem. Rep.', 'Zaire']
VN ['Viet Nam', 'Vietnam']
VI ['United States Virgin Islands', 'Virgin Islands (U

In [16]:
# For every country name, collect the distinct codes it appears with, then print the
# names that are attached to more than one code (i.e. 'Czechoslovakia' should not
# map onto both 'CZ' and 'CSK'). No output means each name has a single code.
country_name_dict = {
    name: list(set(all_tech.loc[all_tech['Country Name'] == name, 'Country Code']))
    for name in country_names
}

for country, codes in country_name_dict.items():
    if len(codes) > 1:
        print(country)

In [17]:
# check for monotonically increasing values where metric is cumulative
# A row is reported (once) as soon as any non-missing value is smaller than the
# non-missing value before it; equal consecutive values are accepted.
# NOTE(review): the offset 9 assumes exactly nine leading metadata columns -- confirm
# against the loaded frame (the displayed header also lists an 'Unnamed: 0' column).

# Positional indices of the rows whose Metric mentions 'Cumulative'.
cum_idx = [pos for pos, metric in enumerate(all_tech['Metric']) if 'Cumulative' in metric]

for idx in cum_idx:
    record = all_tech.iloc[idx]
    # dropna() returns a new Series; the row slice is never modified in place.
    row = record.iloc[9:].dropna()
    previous = None
    for val in row:
        if previous is not None and val < previous:
            tech = record['Technology Name']
            country = record['Country Name']
            print('error!', tech, country)
            break
        # Advance the reference value. The earlier version never updated it, so each
        # value was only compared with the FIRST one and a drop such as 1, 5, 3 was
        # not detected.
        previous = val

In [18]:
# check for 0<=x<=1 where metric is share
# A row is reported (once) as soon as one of its non-missing values lies outside the
# closed interval [0, 1].
# NOTE(review): the offset 9 assumes exactly nine leading metadata columns -- confirm
# against the loaded frame (the displayed header also lists an 'Unnamed: 0' column).

# Positional indices of the rows whose Metric mentions 'Share'.
share_idx = [pos for pos, metric in enumerate(all_tech['Metric']) if 'Share' in metric]

for idx in share_idx:
    record = all_tech.iloc[idx]
    # dropna() returns a new Series; the row slice is never modified in place.
    row = record.iloc[9:].dropna()
    for val in row:
        if val < 0 or val > 1:
            # Look up the labels of THIS row. They were previously never assigned in
            # this cell, so the print either raised NameError or reported a stale
            # technology/country left over from an earlier cell.
            tech = record['Technology Name']
            country = record['Country Name']
            print('error!', tech, country)
            break