In [1]:
import pandas as pd
import numpy as np

# World Bank World Development Indicators
- <b>Goal</b>: Compile the dataset preparing for further regression analysis 
    - including the natural log term for per capita GDP
- This dataset includes 266 unique countries over 1960-2021, although some datapoints are empty.
    - NOTE: the coal and gad share in electricity data is missing from 2015, thus use our world in data csv files
- <b>Data citation</b>
    - World Bank Databank: https://databank.worldbank.org/reports.aspx?source=2&series=EG.ELC.NGAS.ZS&country=

In [3]:
WDI_dir = "../../data/global_WB_WorldDevelopmentIndicator/"

In [4]:
# load WDI dataset 
fpath = WDI_dir+"d16c9d02-d6b6-41d4-b97d-554c36ee3307_Data.csv"
all_dataset = pd.read_csv(fpath)
all_dataset = all_dataset.dropna(how="all").iloc[:-2,:]

In [5]:
# all the 
#all_dataset["Series Name"].unique() #'GDP per capita, PPP (constant 2017 international $)',
series = ['GDP per capita (constant 2015 US$)',
          'GDP per capita, PPP (constant 2017 international $)',
          'Population, total',
          'Energy use (kg of oil equivalent) per $1,000 GDP (constant 2017 PPP)',
          'Manufacturing, value added (% of GDP)',
          'Coal rents (% of GDP)',
          'Oil rents (% of GDP)', 
          'Natural gas rents (% of GDP)',
          'Electricity production from natural gas sources (% of total)',
          'Electricity production from coal sources (% of total)',
          'Electricity production from oil, gas and coal sources (% of total)']
name_list = ["GDPpc_2015$","GDPpc_2017$","Population","WDI_EnergyGDP_kgOilEq/$1k","WDI_Manu_GDP_%","WDI_CoalRents_%",
             "WDI_OilRents_%","WDI_NGRents_%","WDI_NG_El_%","WDI_Coal_El_%","WDI_Fossil_El_%"]

In [6]:
for series_select in series:
    name_select = name_list[series.index(series_select)]
    data_select = all_dataset[all_dataset["Series Name"]==series_select]
    data_select = data_select.iloc[:,3:]
    data_select_melt = pd.melt(data_select, id_vars=['Country Code'], value_vars=all_dataset.columns[4:],var_name='year_raw', value_name=name_select)
    data_select_melt[["year","bin"]] = data_select_melt["year_raw"].str.split(" ", expand = True)
    data_select_melt["year"] = data_select_melt["year"].astype(int)
    data_select_melt = data_select_melt.drop(columns=["bin","year_raw"])
    data_select_melt = data_select_melt.sort_values(by="year")
    data_select_melt = data_select_melt.set_index(["Country Code","year"])
    if series.index(series_select) == 0:
        data_compile = data_select_melt
    else:
        data_compile[name_select] = data_select_melt[name_select]
data_compile = data_compile.replace("..",np.nan)
data_compile = data_compile.astype(float)
data_compile["logGDPpc"] = np.log(data_compile["GDPpc_2017$"])
#data_compile.to_csv("_all_temporal_world_development.csv")
data_compile

Unnamed: 0_level_0,Unnamed: 1_level_0,GDPpc_2015$,GDPpc_2017$,Population,WDI_EnergyGDP_kgOilEq/$1k,WDI_Manu_GDP_%,WDI_CoalRents_%,WDI_OilRents_%,WDI_NGRents_%,WDI_NG_El_%,WDI_Coal_El_%,WDI_Fossil_El_%,logGDPpc
Country Code,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AFG,1960,,,8.622466e+06,,,,,,,,,
SYC,1960,3218.664448,,4.170000e+04,,,,,,,,,
SLE,1960,497.418491,,2.301310e+06,,,,,,,,,
SGP,1960,3611.953651,,1.646400e+06,,10.577547,,,,,,,
SXM,1960,,,2.646000e+03,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
IMN,2022,,,8.451900e+04,,,,,,,,,
ISR,2022,42594.318873,44272.253221,9.550600e+06,,,,,,,,,10.698113
ITA,2022,32902.665137,43788.239314,5.885685e+07,,14.123337,,,,,,,10.687121
GNB,2022,621.852226,1855.111973,2.105566e+06,,,,,,,,,7.525700


### Add Our World In Data

In [7]:
share_coal = pd.read_csv(WDI_dir+"share-electricity-coal.csv")
share_coal = share_coal[share_coal["Year"]>2015]
share_coal

Unnamed: 0,Entity,Code,Year,Coal (% electricity)
16,Afghanistan,AFG,2016,0.000000
17,Afghanistan,AFG,2017,0.000000
18,Afghanistan,AFG,2018,0.000000
19,Afghanistan,AFG,2019,0.000000
20,Afghanistan,AFG,2020,0.000000
...,...,...,...,...
6292,Zimbabwe,ZWE,2017,41.088436
6293,Zimbabwe,ZWE,2018,40.152340
6294,Zimbabwe,ZWE,2019,43.932040
6295,Zimbabwe,ZWE,2020,44.342106


In [8]:
data_compile_reset = data_compile.reset_index()
data_compile_younger = data_compile_reset[data_compile_reset["year"]>2015]
data_compile_older = data_compile_reset[data_compile_reset["year"]<=2015]
data_compile_younger = data_compile_younger.drop(columns=["WDI_Coal_El_%","WDI_NG_El_%"])
data_compile_plusCoalEl = pd.merge(data_compile_younger,share_coal,left_on=["Country Code","year"],right_on=["Code","Year"],how="inner")
data_compile_plusCoalEl = data_compile_plusCoalEl.rename(columns={"Coal (% electricity)":"WDI_Coal_El_%"})
data_compile_plusCoalEl = data_compile_plusCoalEl.drop(columns=["Entity","Code","Year"])
data_compile_plusCoalEl

Unnamed: 0,Country Code,year,GDPpc_2015$,GDPpc_2017$,Population,WDI_EnergyGDP_kgOilEq/$1k,WDI_Manu_GDP_%,WDI_CoalRents_%,WDI_OilRents_%,WDI_NGRents_%,WDI_Fossil_El_%,logGDPpc,WDI_Coal_El_%
0,ZAF,2016,6185.746047,13844.275963,56422274.0,,12.483644,1.803605,0.182132,0.015336,,9.535627,88.605446
1,KNA,2016,20813.774115,29329.554042,47788.0,,5.721434,0.000000,0.000000,0.000000,,10.286351,0.000000
2,LKA,2016,4174.637002,12770.842832,21425494.0,,16.018659,0.000000,0.000000,0.000000,,9.454920,28.634040
3,ESP,2016,26514.324812,38497.484916,46484062.0,,11.264963,0.001137,0.001329,0.000180,,10.558348,13.433386
4,SSD,2016,,,11066105.0,,,,,,,,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1220,DEU,2022,43032.142085,53560.091056,84079811.0,,18.450473,,,,,10.888559,31.046312
1221,GRC,2022,20167.558903,31516.636854,10566531.0,,8.806199,,,,,10.358271,10.411900
1222,HUN,2022,16288.987145,35254.503304,9683505.0,,17.181391,,,,,10.470349,8.164414
1223,IRL,2022,98561.624049,113870.785550,5086988.0,,37.555767,,,,,11.642820,7.254497


In [9]:
share_gas = pd.read_csv(WDI_dir+"share-electricity-gas.csv")
share_gas = share_gas[share_gas["Year"]>2015]
share_gas

Unnamed: 0,Entity,Code,Year,Gas (% electricity)
16,Afghanistan,AFG,2016,0.0
17,Afghanistan,AFG,2017,0.0
18,Afghanistan,AFG,2018,0.0
19,Afghanistan,AFG,2019,0.0
20,Afghanistan,AFG,2020,0.0
...,...,...,...,...
6292,Zimbabwe,ZWE,2017,0.0
6293,Zimbabwe,ZWE,2018,0.0
6294,Zimbabwe,ZWE,2019,0.0
6295,Zimbabwe,ZWE,2020,0.0


In [10]:
data_compile_plusGasEl = pd.merge(data_compile_plusCoalEl,share_gas,left_on=["Country Code","year"],right_on=["Code","Year"])
data_compile_plusGasEl = data_compile_plusGasEl.rename(columns={"Gas (% electricity)":"WDI_NG_El_%"})
data_compile_plusGasEl = data_compile_plusGasEl.drop(columns=["Entity","Code","Year"])
#data_compile_plusGasEl.to_csv("check.csv")
data_compile_plusGasEl

Unnamed: 0,Country Code,year,GDPpc_2015$,GDPpc_2017$,Population,WDI_EnergyGDP_kgOilEq/$1k,WDI_Manu_GDP_%,WDI_CoalRents_%,WDI_OilRents_%,WDI_NGRents_%,WDI_Fossil_El_%,logGDPpc,WDI_Coal_El_%,WDI_NG_El_%
0,ZAF,2016,6185.746047,13844.275963,56422274.0,,12.483644,1.803605,0.182132,0.015336,,9.535627,88.605446,0.000000
1,KNA,2016,20813.774115,29329.554042,47788.0,,5.721434,0.000000,0.000000,0.000000,,10.286351,0.000000,0.000000
2,LKA,2016,4174.637002,12770.842832,21425494.0,,16.018659,0.000000,0.000000,0.000000,,9.454920,28.634040,0.000000
3,ESP,2016,26514.324812,38497.484916,46484062.0,,11.264963,0.001137,0.001329,0.000180,,10.558348,13.433386,19.480806
4,SSD,2016,,,11066105.0,,,,,,,,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1220,DEU,2022,43032.142085,53560.091056,84079811.0,,18.450473,,,,,10.888559,31.046312,16.490566
1221,GRC,2022,20167.558903,31516.636854,10566531.0,,8.806199,,,,,10.358271,10.411900,37.299770
1222,HUN,2022,16288.987145,35254.503304,9683505.0,,17.181391,,,,,10.470349,8.164414,24.943691
1223,IRL,2022,98561.624049,113870.785550,5086988.0,,37.555767,,,,,11.642820,7.254497,48.569740


In [11]:
data_add_back = pd.concat([data_compile_older,data_compile_plusGasEl])
data_add_back  = data_add_back.set_index(["Country Code","year"])
data_add_back 

Unnamed: 0_level_0,Unnamed: 1_level_0,GDPpc_2015$,GDPpc_2017$,Population,WDI_EnergyGDP_kgOilEq/$1k,WDI_Manu_GDP_%,WDI_CoalRents_%,WDI_OilRents_%,WDI_NGRents_%,WDI_NG_El_%,WDI_Coal_El_%,WDI_Fossil_El_%,logGDPpc
Country Code,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AFG,1960,,,8622466.0,,,,,,,,,
SYC,1960,3218.664448,,41700.0,,,,,,,,,
SLE,1960,497.418491,,2301310.0,,,,,,,,,
SGP,1960,3611.953651,,1646400.0,,10.577547,,,,,,,
SXM,1960,,,2646.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
DEU,2022,43032.142085,53560.091056,84079811.0,,18.450473,,,,16.490566,31.046312,,10.888559
GRC,2022,20167.558903,31516.636854,10566531.0,,8.806199,,,,37.299770,10.411900,,10.358271
HUN,2022,16288.987145,35254.503304,9683505.0,,17.181391,,,,24.943691,8.164414,,10.470349
IRL,2022,98561.624049,113870.785550,5086988.0,,37.555767,,,,48.569740,7.254497,,11.642820


In [12]:
data_add_back.to_csv("_all_temporal_world_development.csv")