In [1]:
import pandas as pd
import numpy as np
import re


In [2]:
# Registry with one row per processed indicator: its name, its indicator code
# and the first/last year column found in the source file. trata_csv fills it in.
metadata = pd.DataFrame(columns=["Indicador",
		"codigo_indicador",
		"anio_inicio",
		"anio_final"])


def trata_csv(file_path, indicador, first=False, skiprows=0):
	"""Reduce a wide "one column per year" CSV to one mean value per row.

	The file is expected to contain the columns "Country Code",
	"Indicator Code" (and "Country Name" when ``first`` is true) plus one
	numeric column per year whose header is the year itself (e.g. "1990").

	Side effect: records the indicator in the module-level ``metadata`` frame.
	Calling the function again with the same ``indicador`` overwrites that
	indicator's row instead of appending a duplicate, so cells can be re-run.

	Parameters
	----------
	file_path : str, path object or file-like
		Anything accepted by ``pandas.read_csv``.
	indicador : str
		Name of the output column that will hold the per-row mean.
	first : bool, default False
		When true the result also carries the country name column.
	skiprows : int, default 0
		Forwarded to ``pandas.read_csv``.

	Returns
	-------
	pandas.DataFrame
		Columns ``Country_Code`` and ``indicador`` (preceded by
		``Country_Name`` when ``first`` is true).

	Raises
	------
	ValueError
		If the file has no data rows or no numeric year columns.
	KeyError
		If one of the expected columns is missing.
	"""
	dataframe = pd.read_csv(file_path, skiprows=skiprows)
	# Columns without a single value (e.g. years with no data) are discarded,
	# so they do not count towards the reported year range either.
	dataframe = dataframe.dropna(how="all", axis=1)

	if dataframe.empty:
		raise ValueError(f"{file_path!r} contains no data rows")

	# A year column is identified by its header (all decimal digits) and by
	# holding numbers. Checking only for float64 would silently skip a year in
	# which every value happens to be an integer.
	year_columns = [
		col for col in dataframe.columns
		if str(col).strip().isdecimal()
		and pd.api.types.is_numeric_dtype(dataframe[col])
	]
	if not year_columns:
		raise ValueError(f"{file_path!r} contains no numeric year columns")

	# mean() skips NaN; a row without any value yields NaN.
	dataframe[indicador] = dataframe[year_columns].mean(axis=1)

	# Keep the original column labels (strings) but order them numerically.
	first_year = min(year_columns, key=int)
	last_year = max(year_columns, key=int)

	# Take the first code that is present rather than a fixed row position,
	# so single-row files work too.
	codes = dataframe["Indicator Code"].dropna()
	indicator_code = codes.iloc[0] if not codes.empty else None

	record = {
		"Indicador" : indicador,
		"codigo_indicador" : indicator_code,
		"anio_inicio" : first_year,
		"anio_final" : last_year
	}
	# Reuse the row of an indicator that was already registered; otherwise
	# append a new row at the end.
	already_there = metadata.index[(metadata["Indicador"] == indicador).to_numpy(dtype=bool)]
	row_label = already_there[0] if len(already_there) else len(metadata)
	metadata.loc[row_label] = record

	selected = ["Country Code", indicador]
	if first:
		selected.insert(0, "Country Name")

	# rename() ignores keys that are not present, so one mapping serves both cases.
	return dataframe[selected].rename(
		columns={"Country Name" : "Country_Name", "Country Code" : "Country_Code"}
	)

In [3]:
# Access to electricity: read the CSV (skipping its first 4 lines) and keep,
# per country code, the mean over the yearly columns.
df_electr = trata_csv(
	file_path="data/WBG_data/access_to_electricity.csv",
	indicador="Access_to_electricity_media",
	skiprows=4,
)
df_electr

EG.ELC.ACCS.ZS <class 'str'>


Unnamed: 0,Country_Code,Access_to_electricity_media
0,ABW,99.472445
1,AFE,31.756021
2,AFG,57.091304
3,AFW,41.742255
4,AGO,36.286957
...,...,...
261,XKX,100.000000
262,YEM,57.514358
263,ZAF,80.067227
264,ZMB,25.470565


In [4]:
# Agriculture: per-country mean over the yearly columns.
df_agricul = trata_csv(
	file_path="data/WBG_data/agriculture.csv",
	indicador="AFFVA_media",
	skiprows=4,
)
# Start the combined table from the electricity frame; the inner join keeps
# only the country codes present in both frames.
df = pd.merge(df_electr, df_agricul, on="Country_Code", how="inner")
df

NV.AGR.TOTL.ZS <class 'str'>


Unnamed: 0,Country_Code,Access_to_electricity_media,AFFVA_media
0,ABW,99.472445,0.103728
1,AFE,31.756021,13.177775
2,AFG,57.091304,27.952003
3,AFW,41.742255,22.446226
4,AGO,36.286957,7.883305
...,...,...,...
261,XKX,100.000000,7.986133
262,YEM,57.514358,18.149629
263,ZAF,80.067227,4.320056
264,ZMB,25.470565,12.357125


In [5]:
# Control of corruption
#
# An earlier version of this cell took only the "2022 [YR2022]" column from
# data/WBG_data/corruption_control.csv; it was replaced by the mean over the
# yearly columns computed by trata_csv below.
df_corruption = trata_csv("data/WBG_data/control_corruption.csv", "Control_of_Corruption_media", skiprows=4)

# Remove any copy of the incoming columns left in df by a previous run of this
# cell, so that re-running it does not create *_x / *_y duplicates.
df = (
	df.drop(columns=[c for c in df_corruption.columns if c != "Country_Code"], errors="ignore")
	.merge(df_corruption, how="inner", on="Country_Code")
)
df

CC.EST <class 'str'>


Unnamed: 0,Country_Code,Access_to_electricity_media,AFFVA_media,Control_of_Corruption_media
0,ABW,99.472445,0.103728,1.139387
1,AFE,31.756021,13.177775,
2,AFG,57.091304,27.952003,-1.420090
3,AFW,41.742255,22.446226,
4,AGO,36.286957,7.883305,-1.233098
...,...,...,...,...
261,XKX,100.000000,7.986133,-0.444194
262,YEM,57.514358,18.149629,-1.245655
263,ZAF,80.067227,4.320056,0.130031
264,ZMB,25.470565,12.357125,-0.564180


In [6]:
# Expenditure on education: per-country mean over the yearly columns.
df_education = trata_csv("data/WBG_data/expenditure_education.csv", "Expenditure_on_education_media", skiprows=4)

# Remove any copy of the incoming columns left in df by a previous run of this
# cell, so that re-running it does not create *_x / *_y duplicates.
df = (
	df.drop(columns=[c for c in df_education.columns if c != "Country_Code"], errors="ignore")
	.merge(df_education, how="inner", on="Country_Code")
)
df

SE.XPD.TOTL.GD.ZS <class 'str'>


Unnamed: 0,Country_Code,Access_to_electricity_media,AFFVA_media,Control_of_Corruption_media,Expenditure_on_education_media
0,ABW,99.472445,0.103728,1.139387,5.325764
1,AFE,31.756021,13.177775,,4.410762
2,AFG,57.091304,27.952003,-1.420090,2.956045
3,AFW,41.742255,22.446226,,2.843056
4,AGO,36.286957,7.883305,-1.233098,3.034522
...,...,...,...,...,...
261,XKX,100.000000,7.986133,-0.444194,
262,YEM,57.514358,18.149629,-1.245655,6.097335
263,ZAF,80.067227,4.320056,0.130031,5.182884
264,ZMB,25.470565,12.357125,-0.564180,3.718272


In [7]:
# GDP per capita: per-country mean over the yearly columns.
df_gdp = trata_csv("data/WBG_data/GDP_per_capita.csv", "GDP_per_capita_media", skiprows=4)

# Remove any copy of the incoming columns left in df by a previous run of this
# cell, so that re-running it does not create *_x / *_y duplicates.
df = (
	df.drop(columns=[c for c in df_gdp.columns if c != "Country_Code"], errors="ignore")
	.merge(df_gdp, how="inner", on="Country_Code")
)
df

NY.GDP.PCAP.CD <class 'str'>


Unnamed: 0,Country_Code,Access_to_electricity_media,AFFVA_media,Control_of_Corruption_media,Expenditure_on_education_media,GDP_per_capita_media
0,ABW,99.472445,0.103728,1.139387,5.325764,21735.275013
1,AFE,31.756021,13.177775,,4.410762,822.978192
2,AFG,57.091304,27.952003,-1.420090,2.956045,416.667609
3,AFW,41.742255,22.446226,,2.843056,881.552220
4,AGO,36.286957,7.883305,-1.233098,3.034522,2133.920034
...,...,...,...,...,...,...
261,XKX,100.000000,7.986133,-0.444194,,4005.794796
262,YEM,57.514358,18.149629,-1.245655,6.097335,903.833244
263,ZAF,80.067227,4.320056,0.130031,5.182884,3602.799975
264,ZMB,25.470565,12.357125,-0.564180,3.718272,698.678735


In [8]:
# GDP per capita (PPP file): per-country mean over the yearly columns.
df_gdp_ppp = trata_csv("data/WBG_data/GDP_per_capita_PPP.csv", "GDP_per_capitaPPP_media", skiprows=4)

# Remove any copy of the incoming columns left in df by a previous run of this
# cell, so that re-running it does not create *_x / *_y duplicates.
df = (
	df.drop(columns=[c for c in df_gdp_ppp.columns if c != "Country_Code"], errors="ignore")
	.merge(df_gdp_ppp, how="inner", on="Country_Code")
)
df

NY.GDP.PCAP.PP.CD <class 'str'>


Unnamed: 0,Country_Code,Access_to_electricity_media,AFFVA_media,Control_of_Corruption_media,Expenditure_on_education_media,GDP_per_capita_media,GDP_per_capitaPPP_media
0,ABW,99.472445,0.103728,1.139387,5.325764,21735.275013,33223.072570
1,AFE,31.756021,13.177775,,4.410762,822.978192,2830.148059
2,AFG,57.091304,27.952003,-1.420090,2.956045,416.667609,1715.017152
3,AFW,41.742255,22.446226,,2.843056,881.552220,3104.291490
4,AGO,36.286957,7.883305,-1.233098,3.034522,2133.920034,5274.217573
...,...,...,...,...,...,...,...
261,XKX,100.000000,7.986133,-0.444194,,4005.794796,9510.311782
262,YEM,57.514358,18.149629,-1.245655,6.097335,903.833244,2859.162749
263,ZAF,80.067227,4.320056,0.130031,5.182884,3602.799975,10784.240237
264,ZMB,25.470565,12.357125,-0.564180,3.718272,698.678735,2546.693527


In [9]:
# Population: per-country mean over the yearly columns.
df_pop = trata_csv("data/WBG_data/population.csv", "population_media", skiprows=4)

# Remove any copy of the incoming columns left in df by a previous run of this
# cell, so that re-running it does not create *_x / *_y duplicates.
df = (
	df.drop(columns=[c for c in df_pop.columns if c != "Country_Code"], errors="ignore")
	.merge(df_pop, how="inner", on="Country_Code")
)
df

SP.POP.TOTL <class 'str'>


Unnamed: 0,Country_Code,Access_to_electricity_media,AFFVA_media,Control_of_Corruption_media,Expenditure_on_education_media,GDP_per_capita_media,GDP_per_capitaPPP_media,population_media
0,ABW,99.472445,0.103728,1.139387,5.325764,21735.275013,33223.072570,7.790650e+04
1,AFE,31.756021,13.177775,,4.410762,822.978192,2830.148059,3.579699e+08
2,AFG,57.091304,27.952003,-1.420090,2.956045,416.667609,1715.017152,1.878244e+07
3,AFW,41.742255,22.446226,,2.843056,881.552220,3104.291490,2.438662e+08
4,AGO,36.286957,7.883305,-1.233098,3.034522,2133.920034,5274.217573,1.518896e+07
...,...,...,...,...,...,...,...,...
261,XKX,100.000000,7.986133,-0.444194,,4005.794796,9510.311782,1.664839e+06
262,YEM,57.514358,18.149629,-1.245655,6.097335,903.833244,2859.162749,1.612413e+07
263,ZAF,80.067227,4.320056,0.130031,5.182884,3602.799975,10784.240237,3.894634e+07
264,ZMB,25.470565,12.357125,-0.564180,3.718272,698.678735,2546.693527,9.235383e+06


In [10]:
# Surface area: per-country mean over the yearly columns.
df_surface = trata_csv("data/WBG_data/surface_area.csv", "surface_area_media", skiprows=4)

# Remove any copy of the incoming columns left in df by a previous run of this
# cell, so that re-running it does not create *_x / *_y duplicates.
df = (
	df.drop(columns=[c for c in df_surface.columns if c != "Country_Code"], errors="ignore")
	.merge(df_surface, how="inner", on="Country_Code")
)
df

AG.SRF.TOTL.K2 <class 'str'>


Unnamed: 0,Country_Code,Access_to_electricity_media,AFFVA_media,Control_of_Corruption_media,Expenditure_on_education_media,GDP_per_capita_media,GDP_per_capitaPPP_media,population_media,surface_area_media
0,ABW,99.472445,0.103728,1.139387,5.325764,21735.275013,33223.072570,7.790650e+04,1.800000e+02
1,AFE,31.756021,13.177775,,4.410762,822.978192,2830.148059,3.579699e+08,1.512053e+07
2,AFG,57.091304,27.952003,-1.420090,2.956045,416.667609,1715.017152,1.878244e+07,6.528600e+05
3,AFW,41.742255,22.446226,,2.843056,881.552220,3104.291490,2.438662e+08,9.166270e+06
4,AGO,36.286957,7.883305,-1.233098,3.034522,2133.920034,5274.217573,1.518896e+07,1.246700e+06
...,...,...,...,...,...,...,...,...,...
261,XKX,100.000000,7.986133,-0.444194,,4005.794796,9510.311782,1.664839e+06,
262,YEM,57.514358,18.149629,-1.245655,6.097335,903.833244,2859.162749,1.612413e+07,5.279700e+05
263,ZAF,80.067227,4.320056,0.130031,5.182884,3602.799975,10784.240237,3.894634e+07,1.219090e+06
264,ZMB,25.470565,12.357125,-0.564180,3.718272,698.678735,2546.693527,9.235383e+06,7.526100e+05


In [11]:
# Country data: descriptive attributes per country code, read from WDICountry.csv

def tratar_columnas(col):
	"""Return ``col`` with every whitespace character turned into "_" and any
	run of consecutive underscores squeezed down to a single "_"."""
	texto = re.sub(r"\s", "_", col)

	# str.replace scans left to right without overlapping, so a single pass
	# only shortens long runs; repeat until a pass changes nothing.
	while True:
		compacto = texto.replace("__", "_")
		if compacto == texto:
			return texto
		texto = compacto

# Load the country table, normalise its column labels with tratar_columnas and
# keep only the identifying / descriptive columns used further on.
df_datos = (
	pd.read_csv("data/WBG_data/WDICountry.csv")
	.rename(columns=tratar_columnas)
	.loc[:, [
		'Country_Code', 'Short_Name', 'Table_Name', 'Long_Name', '2-alpha_code',
		'Currency_Unit', 'Region', 'WB-2_code',
	]]
)

df_datos

Unnamed: 0,Country_Code,Short_Name,Table_Name,Long_Name,2-alpha_code,Currency_Unit,Region,WB-2_code
0,ABW,Aruba,Aruba,Aruba,AW,Aruban florin,Latin America & Caribbean,AW
1,AFE,Africa Eastern and Southern,Africa Eastern and Southern,Africa Eastern and Southern,ZH,,,ZH
2,AFG,Afghanistan,Afghanistan,Islamic State of Afghanistan,AF,Afghan afghani,South Asia,AF
3,AFW,Africa Western and Central,Africa Western and Central,Africa Western and Central,ZI,,,ZI
4,AGO,Angola,Angola,People's Republic of Angola,AO,Angolan kwanza,Sub-Saharan Africa,AO
...,...,...,...,...,...,...,...,...
260,XKX,Kosovo,Kosovo,Republic of Kosovo,XK,Euro,Europe & Central Asia,XK
261,YEM,Yemen,"Yemen, Rep.",Republic of Yemen,YE,Yemeni rial,Middle East & North Africa,RY
262,ZAF,South Africa,South Africa,Republic of South Africa,ZA,South African rand,Sub-Saharan Africa,ZA
263,ZMB,Zambia,Zambia,Republic of Zambia,ZM,New Zambian kwacha,Sub-Saharan Africa,ZM


In [12]:
# Attach the descriptive country columns to the indicator table. Columns that a
# previous run of this cell already copied into df are dropped first, so that
# re-running it does not create *_x / *_y duplicates (which would also break the
# Short_Name filter below).
df = df_datos.merge(
	df.drop(columns=[c for c in df_datos.columns if c != "Country_Code"], errors="ignore"),
	how="outer",
	on="Country_Code",
)
# Vectorised replacement for the former row-wise apply: values that are not
# numbers (such as the ".." placeholder) become NaN, everything else is float.
df["Control_of_Corruption_media"] = pd.to_numeric(
	df["Control_of_Corruption_media"], errors="coerce"
).astype("float64")
# The outer join can add codes that exist only in the indicator table; those
# rows have no Short_Name and are discarded.
df = df[df["Short_Name"].notna()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 265 entries, 0 to 264
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Country_Code                    265 non-null    object 
 1   Short_Name                      265 non-null    object 
 2   Table_Name                      265 non-null    object 
 3   Long_Name                       265 non-null    object 
 4   2-alpha_code                    263 non-null    object 
 5   Currency_Unit                   217 non-null    object 
 6   Region                          217 non-null    object 
 7   WB-2_code                       264 non-null    object 
 8   Access_to_electricity_media     264 non-null    float64
 9   AFFVA_media                     253 non-null    float64
 10  Control_of_Corruption_media     205 non-null    float64
 11  Expenditure_on_education_media  247 non-null    float64
 12  GDP_per_capita_media            262 

In [13]:
# Persist the merged table as UTF-8 CSV without the index column.
# NOTE(review): the extension is spelled ".cvs" - probably meant ".csv". Left
# unchanged in case other code reads this exact path; confirm before renaming.
df.to_csv(path_or_buf="data/wbg_data.cvs", index=False, encoding="utf-8")

In [14]:
# Show the per-indicator registry filled in by the trata_csv calls above
# (indicator name, indicator code, first and last year column).
metadata

Unnamed: 0,Indicador,codigo_indicador,anio_inicio,anio_final
0,Access_to_electricity_media,EG.ELC.ACCS.ZS,1990,2022
1,AFFVA_media,NV.AGR.TOTL.ZS,1960,2023
2,Control_of_Corruption_media,CC.EST,1996,2022
3,Expenditure_on_education_media,SE.XPD.TOTL.GD.ZS,1970,2023
4,GDP_per_capita_media,NY.GDP.PCAP.CD,1960,2023
5,GDP_per_capitaPPP_media,NY.GDP.PCAP.PP.CD,1990,2023
6,population_media,SP.POP.TOTL,1960,2023
7,surface_area_media,AG.SRF.TOTL.K2,1961,2021
