In [11]:
import requests
import pandas as pd
import numpy as np
import re
import os


In [12]:
# Output column name -> World Bank indicator code.
# The pairs are sorted by column name so the indicators are always iterated in the same order.
codigos_indicadores = dict(sorted([
	("Access_to_electricity_media", "EG.ELC.ACCS.ZS"),
	("AFFVA_media", "NV.AGR.TOTL.ZS"),
	("Agricultural_land_media", "AG.LND.AGRI.ZS"),
	("Cause_of_death_by_media", "SH.DTH.COMM.ZS"),
	("Control_of_Corruption_media", "CC.EST"),
	("Health_expenditure_per_capita_media", "SH.XPD.CHEX.PC.CD"),
	("Expenditure_on_education_media", "SE.XPD.TOTL.GD.ZS"),
	("GDP_per_capita_media", "NY.GDP.PCAP.CD"),
	("GDP_per_capita_growth_media", "NY.GDP.PCAP.KD.ZG"),
	("Food_insecurity_media", "SN.ITK.SVFI.ZS"),
	("Prevalence_of_undernourishment_media", "SN.ITK.DEFC.ZS"),
	("population_media", "SP.POP.TOTL"),
	("surface_area_media", "AG.SRF.TOTL.K2"),
	("Vulnerable_employment_media", "SL.EMP.VULN.ZS"),
	("Research_dev_expenditure_media", "GB.XPD.RSDV.GD.ZS"),
	("Primary_completion_rate_media", "SE.PRM.CMPT.ZS"),
]))
codigos_indicadores

{'AFFVA_media': 'NV.AGR.TOTL.ZS',
 'Access_to_electricity_media': 'EG.ELC.ACCS.ZS',
 'Agricultural_land_media': 'AG.LND.AGRI.ZS',
 'Cause_of_death_by_media': 'SH.DTH.COMM.ZS',
 'Control_of_Corruption_media': 'CC.EST',
 'Expenditure_on_education_media': 'SE.XPD.TOTL.GD.ZS',
 'Food_insecurity_media': 'SN.ITK.SVFI.ZS',
 'GDP_per_capita_growth_media': 'NY.GDP.PCAP.KD.ZG',
 'GDP_per_capita_media': 'NY.GDP.PCAP.CD',
 'Health_expenditure_per_capita_media': 'SH.XPD.CHEX.PC.CD',
 'Prevalence_of_undernourishment_media': 'SN.ITK.DEFC.ZS',
 'Primary_completion_rate_media': 'SE.PRM.CMPT.ZS',
 'Research_dev_expenditure_media': 'GB.XPD.RSDV.GD.ZS',
 'Vulnerable_employment_media': 'SL.EMP.VULN.ZS',
 'population_media': 'SP.POP.TOTL',
 'surface_area_media': 'AG.SRF.TOTL.K2'}

In [13]:
class WBGData:
	"""Small client for the World Bank API (v2) that returns JSON records or pandas DataFrames.

	Every public method downloads all result pages of one endpoint. The raw records of the
	most recent request are kept in ``self._data``.
	"""

	def __init__(self, timeout=60):
		"""Create a client.

		Parameters
		----------
		timeout : float, optional
			Seconds to wait for each HTTP response before ``requests`` gives up.
		"""
		self._base_url = "https://api.worldbank.org/v2"
		# Default query-string parameters. They are copied for every request and never
		# mutated, so one call cannot influence the next.
		self._params = {
			"format" : "json",
			"per_page" : 500,
			"page" : 1
		}
		self._timeout = timeout

		# Records returned by the most recent request.
		self._data = []

	def get_indicator(self, indicator_code, pandas=True, pandas_format_long=True, params=None):
		"""Download one indicator for all countries.

		Parameters
		----------
		indicator_code : str
			Indicator code placed in the request URL, e.g. ``"SP.POP.TOTL"``.
		pandas : bool
			When False the raw list of JSON records is returned.
		pandas_format_long : bool
			When True (or when ``pandas`` is False) the data is returned as downloaded, one row
			per record; otherwise it is reshaped with ``dar_formato_indicador``.
		params : dict, optional
			Extra query-string parameters for this call only.
		"""
		endpoint = f"country/all/indicator/{indicator_code}"

		df = self._process_data(endpoint, params=params, pandas=pandas)

		if pandas_format_long or not pandas:
			return df
		else:
			return WBGData.dar_formato_indicador(df)

	def get_metadata(self, data, pandas=True, params=None):
		"""Download the ``{data}/all`` endpoint (e.g. ``data="country"``) as metadata."""
		endpoint = f"{data}/all"

		return self._process_data(endpoint, params=params, pandas=pandas, type_="metadata")

	def get_metadata_of_indicator(self, code, pandas=True, params=None):
		"""Download the ``indicator/{code}`` endpoint as metadata."""
		endpoint = f"indicator/{code}"

		return self._process_data(endpoint, params=params, pandas=pandas, type_="metadata")


	def _process_data(self, endpoint, pandas=True, type_="indicator", params=None):
		"""Fetch ``endpoint`` and optionally convert the records into a DataFrame.

		With ``pandas=True`` the records are flattened (nested keys joined with "_") and empty
		strings become NaN. ``type_="indicator"`` sorts the rows by ``countryiso3code``;
		``type_="metadata"`` replaces a ``topics`` column by ``topics_id`` / ``topics_value``.
		With ``pandas=False`` the raw list of records is returned.
		"""
		self._get(endpoint, params=params)

		if not pandas:
			return self._data

		df = pd.json_normalize(self._data, sep="_")
		df = df.replace("", np.nan)

		if type_ == "indicator":
			# A request that returned no records yields a frame without columns; skip the sort.
			if "countryiso3code" in df.columns:
				df = df.sort_values(by=["countryiso3code"]).reset_index(drop=True)
		elif type_ == "metadata" and "topics" in df.columns:
			df[["topics_id", "topics_value"]] = df.apply(WBGData.tratar_columna_topics, axis=1)
			df = df.drop(columns=["topics"])

		return df

	def _get(self, endpoint, params=None):
		"""Download every page of ``endpoint`` and store the records in ``self._data``.

		Parameters
		----------
		endpoint : str
			Path appended to the base URL.
		params : dict, optional
			Query-string parameters merged over the defaults for this call only.

		Raises
		------
		KeyError
			When the API answers with a ``message`` entry instead of a page header.
		requests.HTTPError
			When the server answers with an HTTP error status.
		"""
		# Per-call copy: caller overrides must not persist into later requests.
		query = {**self._params, **(params or {})}
		full_url = f"{self._base_url}/{endpoint}"

		all_data = []

		while True:
			response = requests.get(full_url, params=query, timeout=self._timeout)
			response.raise_for_status()
			data = response.json()

			metadata = data[0]
			if "message" in metadata:
				raise KeyError(metadata["message"])

			# The page payload can be missing or null when the query matches no records.
			a_page = data[1] if len(data) > 1 else None
			all_data.extend(a_page or [])

			query["page"] += 1

			if query["page"] > metadata["pages"]:
				break

		self._data = all_data

	@staticmethod
	def tratar_columna_topics(series):
		"""Collapse the list of topic dicts in ``series.topics`` into two strings.

		Returns a two-element Series: the topic ids joined by ", " and the topic values joined
		by " & ". Both elements are NaN when ``topics`` is missing, empty or not a list.
		"""
		topics = series.topics

		# Rows without topics may carry NaN/None instead of a list.
		if not isinstance(topics, (list, tuple)) or not topics:
			return pd.Series([np.nan, np.nan])

		ids = []
		values = []
		for topic in topics:
			if not isinstance(topic, dict):
				continue
			if topic.get("id") is not None:
				# str() so that numeric ids can be joined as well
				ids.append(str(topic["id"]))
			if topic.get("value") is not None:
				values.append(str(topic["value"]))

		return pd.Series([", ".join(ids), " & ".join(values)])

	@staticmethod
	def dar_formato_indicador(dataframe):
		"""Reshape a long indicator frame into one row per country and one column per date.

		Parameters
		----------
		dataframe : pandas.DataFrame
			Must contain the columns ``countryiso3code``, ``country_value``, ``indicator_value``,
			``indicator_id``, ``date`` and ``value``.

		Returns
		-------
		pandas.DataFrame
			Columns "Country Name", "Country Code", "Indicator Name", "Indicator Code" followed
			by one column per distinct ``date``. Rows whose code was missing in the input have
			NaN as "Country Code".
		"""
		df = dataframe.copy()

		# Temporarily give rows without an ISO3 code a per-country placeholder so that they
		# can be used as pivot index and merge key.
		sin_iso = df["countryiso3code"].isna()
		map_iso_null = {
			nombre: f"iso{posicion + 1}"
			for posicion, nombre in enumerate(df.loc[sin_iso, "country_value"].unique())
		}
		df["countryiso3code"] = df["countryiso3code"].where(
			~sin_iso, df["country_value"].map(map_iso_null)
		)

		# Pivot the indicator values: one column per date
		df_valores = df.pivot(index="countryiso3code", columns="date", values="value")
		df_valores = df_valores.reset_index()
		df_valores.columns.name = None

		# Keep one descriptive row per country
		df = df[['country_value','countryiso3code','indicator_value','indicator_id']].drop_duplicates()
		df = df.reset_index(drop=True)

		# Join descriptions and values
		df_salida = df.merge(df_valores, how="inner", on="countryiso3code")
		df_salida = df_salida.rename(columns={
			'country_value' : "Country Name",
			'countryiso3code' : "Country Code",
			'indicator_value': "Indicator Name",
			'indicator_id' : "Indicator Code"
		})

		# Turn the temporary placeholders back into missing values
		df_salida["Country Code"] = df_salida["Country Code"].mask(
			df_salida["Country Code"].isin(list(map_iso_null.values()))
		)

		return df_salida

In [14]:
# Summary table that receives one row per processed indicator (appended by trata_csv):
# output column name, indicator code, and the earliest/latest year column used.
metadata = pd.DataFrame(columns=["Indicador",
		"codigo_indicador",
		"anio_inicio",
		"anio_final"])

def _es_columna_anio(etiqueta):
	"""Return True when a column label can be read as an integer year."""
	try:
		int(etiqueta)
	except (TypeError, ValueError):
		return False
	return True


def trata_csv(dataset, indicador, first=False, skiprows=0):
	"""Average an indicator over all of its year columns, one value per country.

	Also appends a row describing the indicator (name, code, first and last year column)
	to the module-level ``metadata`` frame.

	Parameters
	----------
	dataset : pandas.DataFrame or str
		Wide indicator table (one column per year) or the path of a CSV file holding one.
	indicador : str
		Name of the output column that receives the per-row mean.
	first : bool
		When True the result also carries the country name column.
	skiprows : int
		Rows to skip at the top of the CSV file; ignored for DataFrame input.

	Returns
	-------
	pandas.DataFrame or None
		Columns ``Country_Code`` and ``indicador`` (preceded by ``Country_Name`` when ``first``
		is True). None when ``dataset`` is neither a DataFrame nor an existing file.

	Raises
	------
	ValueError
		When the table has no float64 column labelled with a year.
	"""
	if isinstance(dataset, pd.DataFrame):
		dataframe = dataset.copy()
	elif os.path.isfile(dataset):
		dataframe = pd.read_csv(dataset, skiprows=skiprows)
	else:
		return None

	dataframe = dataframe.dropna(how="all", axis=1)

	# Only float columns labelled with a year take part in the mean; any other float
	# column would distort the average and break the year range below.
	es_float = dataframe.dtypes == "float64"
	columns = [col for col in dataframe.columns[es_float] if _es_columna_anio(col)]
	if not columns:
		raise ValueError(f"no year columns found for indicator {indicador!r}")

	# Rows without any value get NaN (mean skips NaN, all-NaN rows stay NaN).
	dataframe[indicador] = dataframe[columns].mean(axis=1)

	max_ = max(columns, key=int)
	min_ = min(columns, key=int)

	metadata.loc[len(metadata)] = {
		"Indicador" : indicador,
		# Read from the first row so that single-row tables work too.
		"codigo_indicador" : dataframe["Indicator Code"].iloc[0],
		"anio_inicio" : min_,
		"anio_final" : max_
	}

	if not first:
		dataframe = dataframe[["Country Code", indicador]]
		dataframe = dataframe.rename(columns={"Country Code" : "Country_Code"})
	else:
		dataframe = dataframe[['Country Name', "Country Code", indicador]]
		dataframe = dataframe.rename(columns={'Country Name' : 'Country_Name', "Country Code" : "Country_Code"})

	return dataframe

In [15]:
# Shared API client used by the extraction cells below.
wbg = WBGData()

In [16]:
def extraer_indicadores():
	"""Download every indicator in ``codigos_indicadores`` and merge their per-country means.

	For each indicator the wide table is fetched through ``wbg``, reduced by ``trata_csv``
	and inner-merged on ``Country_Code`` with the indicators processed so far. The
	module-level ``metadata`` frame is then extended with the indicator descriptions returned
	by the API (left merge on the indicator code).

	Returns
	-------
	pandas.DataFrame
		One row per country code present in every indicator, one column per indicator.
	"""
	global metadata
	prefijo = "Extrayendo indicador: "
	max_space = max(len(nombre) for nombre in codigos_indicadores) + 2
	df = None
	metadatos_api = []

	for nombre,codigo in codigos_indicadores.items():
		print(f"{prefijo}{nombre:>{max_space}}", end="\r")
		dataset = wbg.get_indicator(codigo, pandas_format_long=False)
		# Collected here and concatenated once after the loop.
		metadatos_api.append(wbg.get_metadata_of_indicator(codigo))
		df_indicador = trata_csv(dataset, nombre, skiprows=4)
		df_indicador = df_indicador.dropna(subset=["Country_Code"])
		if df is None:
			# The first indicator seeds the result; the rest are merged onto it.
			df = df_indicador.copy()
		else:
			df = df.merge(df_indicador, how="inner", on="Country_Code")

	# Pad the final message so it fully overwrites the "\r" progress line.
	print("Completado!!".ljust(len(prefijo) + max_space))
	df_metadata = pd.concat(metadatos_api, ignore_index=True)
	metadata = metadata.merge(df_metadata, how="left", left_on="codigo_indicador", right_on="id")
	metadata = metadata.drop(columns="id")
	return df

# Download every configured indicator and display the merged per-country table.
df = extraer_indicadores()
df

Completado!!ndicador:                     surface_area_media


Unnamed: 0,Country_Code,AFFVA_media,Access_to_electricity_media,Agricultural_land_media,Cause_of_death_by_media,Control_of_Corruption_media,Expenditure_on_education_media,Food_insecurity_media,GDP_per_capita_growth_media,GDP_per_capita_media,Health_expenditure_per_capita_media,Prevalence_of_undernourishment_media,Primary_completion_rate_media,Research_dev_expenditure_media,Vulnerable_employment_media,population_media,surface_area_media
0,ABW,0.103728,99.472445,11.111111,,1.139387,5.325764,,2.308296,21735.275013,,,98.165097,,,7.790650e+04,1.800000e+02
1,AFE,13.177775,31.756021,43.959595,57.032263,,4.410762,,0.359678,822.978192,82.340549,25.814769,56.787163,0.619447,73.757455,3.579699e+08,1.512053e+07
2,AFG,27.952003,57.091304,58.200348,45.071499,-1.420090,2.956045,19.328571,1.152146,416.667609,49.495439,27.938095,45.264939,,88.861321,1.878244e+07,6.528600e+05
3,AFW,22.446226,41.742255,35.758280,64.447321,,2.843056,,0.898172,881.552220,54.887711,12.997012,60.471626,0.151699,81.822756,2.438662e+08,9.166270e+06
4,AGO,7.883305,36.286957,36.018630,65.045936,-1.233098,3.034522,26.700000,-0.167378,2133.920034,82.931208,31.590476,41.712434,0.032290,68.868918,1.518896e+07,1.246700e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,XKX,7.986133,100.000000,,,-0.444194,,,4.468208,4005.794796,,,,,,1.664839e+06,
257,YEM,18.149629,57.514358,44.513458,37.310949,-1.245655,6.097335,11.485714,-1.135928,903.833244,51.806669,31.047619,58.005581,,44.339272,1.612413e+07,5.279700e+05
258,ZAF,4.320056,80.067227,79.346281,42.427386,0.130031,5.182884,7.850000,0.690067,3602.799975,469.418586,4.976190,87.440464,0.700868,20.514348,3.894634e+07,1.219090e+06
259,ZMB,12.357125,25.470565,28.870820,63.128580,-0.564180,3.718272,27.057143,0.274728,698.678735,55.232332,41.395238,78.625864,0.051096,78.755861,9.235383e+06,7.526100e+05


In [17]:
# Attach coordinates from the API's country listing. The left merge keeps every entry of
# that listing, including those without indicator values.
df_geografic = (
	wbg.get_metadata("country")
	.loc[:, ["id", "longitude", "latitude"]]
	.rename(columns={"id" : "Country_Code"})
)
df = pd.merge(df_geografic, df, how="left", on="Country_Code")
df

Unnamed: 0,Country_Code,longitude,latitude,AFFVA_media,Access_to_electricity_media,Agricultural_land_media,Cause_of_death_by_media,Control_of_Corruption_media,Expenditure_on_education_media,Food_insecurity_media,GDP_per_capita_growth_media,GDP_per_capita_media,Health_expenditure_per_capita_media,Prevalence_of_undernourishment_media,Primary_completion_rate_media,Research_dev_expenditure_media,Vulnerable_employment_media,population_media,surface_area_media
0,ABW,-70.0167,12.5167,0.103728,99.472445,11.111111,,1.139387,5.325764,,2.308296,21735.275013,,,98.165097,,,7.790650e+04,1.800000e+02
1,AFE,,,13.177775,31.756021,43.959595,57.032263,,4.410762,,0.359678,822.978192,82.340549,25.814769,56.787163,0.619447,73.757455,3.579699e+08,1.512053e+07
2,AFG,69.1761,34.5228,27.952003,57.091304,58.200348,45.071499,-1.420090,2.956045,19.328571,1.152146,416.667609,49.495439,27.938095,45.264939,,88.861321,1.878244e+07,6.528600e+05
3,AFR,,,,,,,,,,,,,,,,,,
4,AFW,,,22.446226,41.742255,35.758280,64.447321,,2.843056,,0.898172,881.552220,54.887711,12.997012,60.471626,0.151699,81.822756,2.438662e+08,9.166270e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,XZN,,,,,,,,,,,,,,,,,,
292,YEM,44.2075,15.352,18.149629,57.514358,44.513458,37.310949,-1.245655,6.097335,11.485714,-1.135928,903.833244,51.806669,31.047619,58.005581,,44.339272,1.612413e+07,5.279700e+05
293,ZAF,28.1871,-25.746,4.320056,80.067227,79.346281,42.427386,0.130031,5.182884,7.850000,0.690067,3602.799975,469.418586,4.976190,87.440464,0.700868,20.514348,3.894634e+07,1.219090e+06
294,ZMB,28.2937,-15.3982,12.357125,25.470565,28.870820,63.128580,-0.564180,3.718272,27.057143,0.274728,698.678735,55.232332,41.395238,78.625864,0.051096,78.755861,9.235383e+06,7.526100e+05


In [18]:
# datos de paises

def tratar_columnas(col):
	"""Normalise a column label: every whitespace character becomes "_" and runs of "_" collapse to one."""
	actual = re.sub(r"\s", "_", col)

	# Keep halving underscore pairs until the label stops changing.
	anterior = None
	while anterior != actual:
		anterior, actual = actual, actual.replace("__", "_")

	return actual

# Load the WDI country file, normalise its headers and keep only the descriptive columns.
df_datos = pd.read_csv("data/WBG_data/WDICountry.csv").rename(columns=tratar_columnas)
df_datos = df_datos.loc[:, [
	"Country_Code",
	"Short_Name",
	"Table_Name",
	"Long_Name",
	"2-alpha_code",
	"Currency_Unit",
	"Region",
	"WB-2_code",
]]

df_datos

Unnamed: 0,Country_Code,Short_Name,Table_Name,Long_Name,2-alpha_code,Currency_Unit,Region,WB-2_code
0,ABW,Aruba,Aruba,Aruba,AW,Aruban florin,Latin America & Caribbean,AW
1,AFE,Africa Eastern and Southern,Africa Eastern and Southern,Africa Eastern and Southern,ZH,,,ZH
2,AFG,Afghanistan,Afghanistan,Islamic State of Afghanistan,AF,Afghan afghani,South Asia,AF
3,AFW,Africa Western and Central,Africa Western and Central,Africa Western and Central,ZI,,,ZI
4,AGO,Angola,Angola,People's Republic of Angola,AO,Angolan kwanza,Sub-Saharan Africa,AO
...,...,...,...,...,...,...,...,...
260,XKX,Kosovo,Kosovo,Republic of Kosovo,XK,Euro,Europe & Central Asia,XK
261,YEM,Yemen,"Yemen, Rep.",Republic of Yemen,YE,Yemeni rial,Middle East & North Africa,RY
262,ZAF,South Africa,South Africa,Republic of South Africa,ZA,South African rand,Sub-Saharan Africa,ZA
263,ZMB,Zambia,Zambia,Republic of Zambia,ZM,New Zambian kwacha,Sub-Saharan Africa,ZM


In [19]:
# Combine the country descriptions with the indicator table, then keep only the rows
# that have a description (a non-null Short_Name).
df = pd.merge(df_datos, df, how="outer", on="Country_Code")
df = df.loc[df["Short_Name"].notna()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 265 entries, 0 to 264
Data columns (total 26 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Country_Code                          265 non-null    object 
 1   Short_Name                            265 non-null    object 
 2   Table_Name                            265 non-null    object 
 3   Long_Name                             265 non-null    object 
 4   2-alpha_code                          263 non-null    object 
 5   Currency_Unit                         217 non-null    object 
 6   Region                                217 non-null    object 
 7   WB-2_code                             264 non-null    object 
 8   longitude                             211 non-null    object 
 9   latitude                              211 non-null    object 
 10  AFFVA_media                           249 non-null    float64
 11  Access_to_electrici

In [20]:
# Persist the merged table without the index column. The extension was misspelled ".cvs";
# anything still reading "data/wbg_data.cvs" must be pointed at the ".csv" file.
df.to_csv("data/wbg_data.csv", encoding="utf-8", index=False)

In [21]:
# Order the indicator summary by output column name and renumber the rows.
metadata = metadata.sort_values(by="Indicador").reset_index(drop=True)
metadata

Unnamed: 0,Indicador,codigo_indicador,anio_inicio,anio_final,name,unit,sourceNote,sourceOrganization,source_id,source_value,topics_id,topics_value
0,AFFVA_media,NV.AGR.TOTL.ZS,1960,2023,"Agriculture, forestry, and fishing, value adde...",,"Agriculture, forestry, and fishing corresponds...","World Bank national accounts data, and OECD Na...",2,World Development Indicators,"1, 3, 19",Agriculture & Rural Development & Economy & ...
1,Access_to_electricity_media,EG.ELC.ACCS.ZS,1990,2022,Access to electricity (% of population),,Access to electricity is the percentage of pop...,"IEA, IRENA, UNSD, World Bank, WHO. 2023. Track...",2,World Development Indicators,"5, 19, 6",Energy & Mining & Climate Change & Environment
2,Agricultural_land_media,AG.LND.AGRI.ZS,1961,2021,Agricultural land (% of land area),,Agricultural land refers to the share of land ...,"Food and Agriculture Organization, electronic ...",2,World Development Indicators,"1, 19, 6",Agriculture & Rural Development & Climate Ch...
3,Cause_of_death_by_media,SH.DTH.COMM.ZS,2000,2019,"Cause of death, by communicable diseases and m...",,Cause of death refers to the share of all deat...,Derived based on the data from Global Health E...,2,World Development Indicators,8,Health
4,Control_of_Corruption_media,CC.EST,1996,2022,Control of Corruption: Estimate,,Control of Corruption captures perceptions of ...,"Detailed documentation of the WGI, interactive...",2,World Development Indicators,13,Public Sector
5,Expenditure_on_education_media,SE.XPD.TOTL.GD.ZS,1970,2023,"Government expenditure on education, total (% ...",,General government expenditure on education (c...,UNESCO Institute for Statistics (UIS). UIS.Sta...,2,World Development Indicators,4,Education
6,Food_insecurity_media,SN.ITK.SVFI.ZS,2014,2021,Prevalence of severe food insecurity in the po...,,The percentage of people in the population who...,Food and Agriculture Organization of the Unite...,2,World Development Indicators,8,Health
7,GDP_per_capita_growth_media,NY.GDP.PCAP.KD.ZG,1961,2023,GDP per capita growth (annual %),,Annual percentage growth rate of GDP per capit...,"World Bank national accounts data, and OECD Na...",2,World Development Indicators,3,Economy & Growth
8,GDP_per_capita_media,NY.GDP.PCAP.CD,1960,2023,GDP per capita (current US$),,GDP per capita is gross domestic product divid...,"World Bank national accounts data, and OECD Na...",2,World Development Indicators,3,Economy & Growth
9,Health_expenditure_per_capita_media,SH.XPD.CHEX.PC.CD,2000,2022,Current health expenditure per capita (current...,,Current expenditures on health per capita in c...,World Health Organization Global Health Expend...,2,World Development Indicators,8,Health


In [22]:
# Save the indicator summary as UTF-8 CSV, without the index column.
metadata.to_csv("data/WBG_data/indicators_metadata.csv", index=False, encoding="utf-8")