In [1]:
import requests
import pandas as pd
import numpy as np
import re
import os


In [2]:
# Output column name -> World Bank indicator code.
# Insertion order matters: it is the order in which indicators are downloaded
# and therefore the column order of the combined table.
codigos_indicadores = {
	'Access_to_electricity_media': 'EG.ELC.ACCS.ZS',
	'AFFVA_media': 'NV.AGR.TOTL.ZS',
	'Control_of_Corruption_media': 'CC.EST',
	'Expenditure_on_education_media': 'SE.XPD.TOTL.GD.ZS',
	'GDP_per_capita_media': 'NY.GDP.PCAP.CD',
	'population_media': 'SP.POP.TOTL',
	'surface_area_media': 'AG.SRF.TOTL.K2',
}

codigos_indicadores

{'Access_to_electricity_media': 'EG.ELC.ACCS.ZS',
 'AFFVA_media': 'NV.AGR.TOTL.ZS',
 'Control_of_Corruption_media': 'CC.EST',
 'Expenditure_on_education_media': 'SE.XPD.TOTL.GD.ZS',
 'GDP_per_capita_media': 'NY.GDP.PCAP.CD',
 'population_media': 'SP.POP.TOTL',
 'surface_area_media': 'AG.SRF.TOTL.K2'}

In [3]:
class WBGData:
	"""Small client for the World Bank API (v2) that returns JSON records or DataFrames.

	Every request is paginated: the client keeps requesting pages until the
	page count reported by the API is exhausted and concatenates the records.
	"""

	# Seconds to wait for a single HTTP request before giving up.
	_TIMEOUT = 30

	def __init__(self):
		self._base_url = "https://api.worldbank.org/v2"
		# Default query-string parameters; per-call params are layered on top
		# of a copy, so these defaults are never modified by a request.
		self._params = {
			"format" : "json",
			"per_page" : 500,
			"page" : 1
		}

		# Records returned by the most recent request.
		self._data = dict()

	def get_indicator(self, indicator_code, pandas=True, pandas_format_long=True, params=None):
		"""Download one indicator for all countries.

		Args:
			indicator_code: World Bank indicator code, e.g. "SP.POP.TOTL".
			pandas: when True return a DataFrame, otherwise the raw list of records.
			pandas_format_long: when True (and ``pandas`` is True) keep one row per
				country/date; when False pivot to one row per country with one
				column per date (see ``dar_formato_indicador``).
			params: optional extra query-string parameters for this call only.

		Returns:
			A DataFrame (long or wide) or the raw list of records.

		Raises:
			KeyError: if the API answers with an error message.
		"""
		endpoint = f"country/all/indicator/{indicator_code}"

		result = self._process_data(endpoint, params=params, pandas=pandas)

		if pandas_format_long or not pandas:
			return result
		return WBGData.dar_formato_indicador(result)

	def get_metadata(self, data, pandas=True, params=None):
		"""Download the ``{data}/all`` listing (for example ``data="indicator"``).

		Args:
			data: first path segment of the endpoint to list.
			pandas: when True return a DataFrame, otherwise the raw list of records.
			params: optional extra query-string parameters for this call only.

		Returns:
			A DataFrame or the raw list of records.
		"""
		endpoint = f"{data}/all"

		return self._process_data(endpoint, params=params, pandas=pandas, type_="metadata")


	def _process_data(self, endpoint, pandas=True, type_="indicator", params=None):
		"""Fetch ``endpoint`` and optionally turn the records into a DataFrame.

		For ``type_ == "indicator"`` the frame is sorted by ``countryiso3code``.
		For ``type_ == "metadata"`` a ``topics`` column (list of dicts) is
		flattened into ``topics_id`` / ``topics_value`` text columns.
		"""
		self._get(endpoint, params=params)

		if not pandas:
			return self._data

		df = pd.json_normalize(self._data, sep="_")
		# The API uses empty strings for missing text; treat them as missing values.
		df = df.replace("", np.nan)

		if type_ == "indicator":
			# An empty result has no columns at all, so only sort when possible.
			if "countryiso3code" in df.columns:
				df = df.sort_values(by=["countryiso3code"]).reset_index(drop=True)
		elif "topics" in df.columns and type_ == "metadata":
			df[["topics_id", "topics_value"]] = df.apply(WBGData.tratar_columna_topics, axis=1)
			df = df.drop(columns=["topics"])

		return df

	def _get(self, endpoint, params=None):
		"""Request every page of ``endpoint`` and store the records in ``self._data``.

		Args:
			endpoint: path appended to the base URL.
			params: optional query-string parameters for this call only; they
				override the defaults but are not remembered afterwards.

		Raises:
			KeyError: if the first element of the response contains a "message"
				entry (the API's way of reporting an error).
			requests.HTTPError: if the server answers with an HTTP error status.
		"""
		# Work on a copy so that one call's filters do not leak into the next call.
		query = {**self._params, "page": 1, **(params or {})}
		full_url = f"{self._base_url}/{endpoint}"

		records = []

		while True:
			response = requests.get(full_url, params=query, timeout=self._TIMEOUT)
			response.raise_for_status()
			payload = response.json()

			# The response is a two-element list: paging header, then the records.
			header = payload[0]
			if "message" in header:
				raise KeyError(header["message"])

			# The records element is null/absent when the query matches nothing.
			page_records = payload[1] if len(payload) > 1 else None
			records.extend(page_records or [])

			query["page"] += 1

			if query["page"] > header["pages"]:
				break

		self._data = records

	@staticmethod
	def tratar_columna_topics(series):
		"""Flatten the ``topics`` entry of one row into two text values.

		Args:
			series: a row exposing a ``topics`` attribute, expected to be a list
				of dicts with optional "id" and "value" keys.

		Returns:
			``pd.Series([ids, values])`` where ids are joined with ", " and values
			with " & "; both entries are NaN when ``topics`` is empty, missing or
			not a list.
		"""
		topics = series.topics

		# Rows without topics may carry NaN/None instead of a list.
		if not isinstance(topics, (list, tuple)) or not topics:
			return pd.Series([np.nan, np.nan])

		ids = []
		values = []
		for topic in topics:
			if not isinstance(topic, dict):
				continue
			if "id" in topic:
				ids.append(str(topic["id"]))
			if "value" in topic:
				values.append(str(topic["value"]))

		return pd.Series([", ".join(ids), " & ".join(values)])

	@staticmethod
	def dar_formato_indicador(dataframe):
		"""Pivot a long indicator frame into one row per country, one column per date.

		Args:
			dataframe: long-format frame with the columns ``countryiso3code``,
				``country_value``, ``indicator_value``, ``indicator_id``, ``date``
				and ``value``.

		Returns:
			A new frame with the columns "Country Name", "Country Code",
			"Indicator Name", "Indicator Code" followed by one column per date.
			Rows whose ISO code was missing keep a missing "Country Code".
		"""
		df = dataframe.copy()

		# Temporarily give rows without an ISO code a per-country placeholder so
		# that they survive the pivot and the merge, both keyed on the code.
		missing_iso = df["countryiso3code"].isna()
		placeholder_by_country = {
			name: f"iso{position + 1}"
			for position, name in enumerate(df.loc[missing_iso, "country_value"].unique())
		}
		df["countryiso3code"] = df["countryiso3code"].where(
			~missing_iso, df["country_value"].map(placeholder_by_country)
		)

		# Pivot the indicator values: one column per date.
		df_valores = df.pivot(index="countryiso3code", columns="date", values="value")
		df_valores = df_valores.reset_index()
		df_valores.columns.name = None

		# Keep one row of descriptive columns per country.
		df = df[['country_value','countryiso3code','indicator_value','indicator_id']].drop_duplicates()
		df = df.reset_index(drop=True)

		# Join the descriptive columns with the pivoted values.
		df_salida = df.merge(df_valores, how="inner", on="countryiso3code")
		df_salida = df_salida.rename(columns={
			'country_value' : "Country Name",
			'countryiso3code' : "Country Code",
			'indicator_value': "Indicator Name",
			'indicator_id' : "Indicator Code"
		})

		# Remove exactly the placeholders introduced above, restoring missing codes.
		df_salida["Country Code"] = df_salida["Country Code"].mask(
			df_salida["Country Code"].isin(list(placeholder_by_country.values()))
		)

		return df_salida

In [4]:
# One row per processed indicator: name, code and first/last year with data.
# Filled as a side effect of trata_csv.
metadata = pd.DataFrame(columns=["Indicador",
		"codigo_indicador",
		"anio_inicio",
		"anio_final"])

def _is_year_label(label):
	"""Return True when a column label can be read as an integer (a year)."""
	try:
		int(label)
	except (TypeError, ValueError):
		return False
	return True

def trata_csv(dataset, indicador, first=False, skiprows=0):
	"""Reduce a wide indicator table to one mean value per country.

	The table must have one row per country, a "Country Code" and an
	"Indicator Code" column, and one numeric column per year. A row describing
	the indicator is appended to the module-level ``metadata`` frame.

	Args:
		dataset: a DataFrame, or the path of a CSV file to read.
		indicador: name of the output column that receives the per-row mean.
		first: when True the result also keeps the country name column.
		skiprows: rows to skip at the top of the CSV file (ignored for DataFrames).

	Returns:
		A DataFrame with "Country_Code" and ``indicador`` (plus "Country_Name"
		when ``first`` is True), or None when ``dataset`` is neither a
		DataFrame nor an existing file.

	Raises:
		ValueError: if the table has no numeric year columns with data.
	"""
	if isinstance(dataset, pd.DataFrame):
		dataframe = dataset.copy()
	elif os.path.isfile(dataset):
		dataframe = pd.read_csv(dataset, skiprows=skiprows)
	else:
		return

	# Columns without any value carry no information (and would skew the year range).
	dataframe = dataframe.dropna(how="all", axis=1)

	# Year columns are the numeric ones whose label is an integer; this accepts
	# integer-typed columns as well as float ones.
	year_columns = [
		col for col in dataframe.select_dtypes(include="number").columns
		if _is_year_label(col)
	]
	if not year_columns:
		raise ValueError(f"no numeric year columns with data found for '{indicador}'")

	# Mean over the years; rows without any value get NaN.
	dataframe[indicador] = dataframe[year_columns].mean(axis=1)

	max_ = max(year_columns, key=lambda x: int(x))
	min_ = min(year_columns, key=lambda x: int(x))

	metadata.loc[len(metadata)] = {
		"Indicador" : indicador,
		# Every row carries the same indicator code; take the first so that
		# single-row tables work too.
		"codigo_indicador" : dataframe.iloc[0]["Indicator Code"],
		"anio_inicio" : min_,
		"anio_final" : max_
	}

	if not first:
		dataframe = dataframe[["Country Code", indicador]]
		dataframe = dataframe.rename(columns={"Country Code" : "Country_Code"})
	else:
		dataframe = dataframe[['Country Name', "Country Code", indicador]]
		dataframe = dataframe.rename(columns={'Country Name' : 'Country_Name', "Country Code" : "Country_Code"})

	return dataframe

In [5]:
# Shared API client instance; extraer_indicadores reads it as a global.
wbg = WBGData()

In [6]:
def extraer_indicadores():
	"""Download every indicator in ``codigos_indicadores`` and join the per-country means.

	Each indicator is fetched in wide format through the global ``wbg`` client,
	reduced to one mean per country with ``trata_csv`` and inner-joined on
	"Country_Code". Rows without a country code are dropped before joining.

	Returns:
		A DataFrame with "Country_Code" plus one column per indicator, or an
		empty DataFrame when ``codigos_indicadores`` is empty.
	"""
	max_space = max((len(nombre) for nombre in codigos_indicadores), default=0) + 2
	combinado = None
	for nombre,codigo in codigos_indicadores.items():
		print(f"Extrayendo indicador: {nombre:>{max_space}}", end="\r")
		dataset = wbg.get_indicator(codigo, pandas_format_long=False)
		df_indicador = trata_csv(dataset, nombre, skiprows=4)
		df_indicador = df_indicador.dropna(subset=["Country_Code"])
		# Seed the result with the first indicator explicitly: catching KeyError
		# for this would also hide a genuinely missing key in a later merge and
		# silently throw away everything accumulated so far.
		if combinado is None:
			combinado = df_indicador.copy()
		else:
			combinado = combinado.merge(df_indicador, how="inner", on="Country_Code")
	if combinado is None:
		combinado = pd.DataFrame()
	else:
		# Finish the "\r" progress line so the final message does not overwrite it.
		print()
	print("Completado!!")
	return combinado

# Run the download (one network round per indicator) and display the combined table.
df = extraer_indicadores()
df

Completado!!ndicador:               surface_area_media


Unnamed: 0,Country_Code,Access_to_electricity_media,AFFVA_media,Control_of_Corruption_media,Expenditure_on_education_media,GDP_per_capita_media,population_media,surface_area_media
0,ABW,99.472445,0.103728,1.139387,5.325764,21735.275013,7.790650e+04,1.800000e+02
1,AFE,31.756021,13.177775,,4.410762,822.978192,3.579699e+08,1.512053e+07
2,AFG,57.091304,27.952003,-1.420090,2.956045,416.667609,1.878244e+07,6.528600e+05
3,AFW,41.742255,22.446226,,2.843056,881.552220,2.438662e+08,9.166270e+06
4,AGO,36.286957,7.883305,-1.233098,3.034522,2133.920034,1.518896e+07,1.246700e+06
...,...,...,...,...,...,...,...,...
256,XKX,100.000000,7.986133,-0.444194,,4005.794796,1.664839e+06,
257,YEM,57.514358,18.149629,-1.245655,6.097335,903.833244,1.612413e+07,5.279700e+05
258,ZAF,80.067227,4.320056,0.130031,5.182884,3602.799975,3.894634e+07,1.219090e+06
259,ZMB,25.470565,12.357125,-0.564180,3.718272,698.678735,9.235383e+06,7.526100e+05


In [7]:
# datos de paises

def tratar_columnas(col):
	"""Normalise a column label: whitespace becomes "_" and repeated "_" collapse to one."""
	normalizado = re.sub(r"\s", "_", col)

	# Apply the collapse until nothing changes any more (fixed point).
	anterior = None
	while anterior != normalizado:
		anterior = normalizado
		normalizado = normalizado.replace("__", "_")

	return normalizado

# Load the country table, normalise its column labels and keep only the
# identifying / descriptive columns used later on.
df_datos = (
	pd.read_csv("data/WBG_data/WDICountry.csv")
	.rename(columns=tratar_columnas)
	[[
		'Country_Code',
		'Short_Name',
		'Table_Name',
		'Long_Name',
		'2-alpha_code',
		'Currency_Unit',
		'Region',
		'WB-2_code',
	]]
)

df_datos

Unnamed: 0,Country_Code,Short_Name,Table_Name,Long_Name,2-alpha_code,Currency_Unit,Region,WB-2_code
0,ABW,Aruba,Aruba,Aruba,AW,Aruban florin,Latin America & Caribbean,AW
1,AFE,Africa Eastern and Southern,Africa Eastern and Southern,Africa Eastern and Southern,ZH,,,ZH
2,AFG,Afghanistan,Afghanistan,Islamic State of Afghanistan,AF,Afghan afghani,South Asia,AF
3,AFW,Africa Western and Central,Africa Western and Central,Africa Western and Central,ZI,,,ZI
4,AGO,Angola,Angola,People's Republic of Angola,AO,Angolan kwanza,Sub-Saharan Africa,AO
...,...,...,...,...,...,...,...,...
260,XKX,Kosovo,Kosovo,Republic of Kosovo,XK,Euro,Europe & Central Asia,XK
261,YEM,Yemen,"Yemen, Rep.",Republic of Yemen,YE,Yemeni rial,Middle East & North Africa,RY
262,ZAF,South Africa,South Africa,Republic of South Africa,ZA,South African rand,Sub-Saharan Africa,ZA
263,ZMB,Zambia,Zambia,Republic of Zambia,ZM,New Zambian kwacha,Sub-Saharan Africa,ZM


In [8]:
# Attach the indicator means to the country table.
# Note: this cell rebinds `df`, so re-running it without re-running the
# extraction cell merges an already merged frame.
df = df_datos.merge(df, how="outer", on="Country_Code")
# Force the column to float: ".." (and any other non-numeric token) becomes NaN
# instead of raising inside a row-by-row float() conversion.
df["Control_of_Corruption_media"] = pd.to_numeric(df["Control_of_Corruption_media"], errors="coerce")
# Keep only rows that exist in the country table.
df = df[df["Short_Name"].notna()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 265 entries, 0 to 264
Data columns (total 15 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Country_Code                    265 non-null    object 
 1   Short_Name                      265 non-null    object 
 2   Table_Name                      265 non-null    object 
 3   Long_Name                       265 non-null    object 
 4   2-alpha_code                    263 non-null    object 
 5   Currency_Unit                   217 non-null    object 
 6   Region                          217 non-null    object 
 7   WB-2_code                       264 non-null    object 
 8   Access_to_electricity_media     260 non-null    float64
 9   AFFVA_media                     249 non-null    float64
 10  Control_of_Corruption_media     205 non-null    float64
 11  Expenditure_on_education_media  243 non-null    float64
 12  GDP_per_capita_media            258 

In [9]:
# Write the merged country/indicator table as UTF-8 CSV text, without the index column.
# NOTE(review): the extension is ".cvs", which looks like a typo for ".csv" — confirm
# whether anything reads this exact filename before renaming it.
df.to_csv("data/wbg_data.cvs", encoding="utf-8", index=False)

In [10]:
# Show the per-indicator summary (code and first/last year) collected by trata_csv.
metadata

Unnamed: 0,Indicador,codigo_indicador,anio_inicio,anio_final
0,Access_to_electricity_media,EG.ELC.ACCS.ZS,1990,2022
1,AFFVA_media,NV.AGR.TOTL.ZS,1960,2023
2,Control_of_Corruption_media,CC.EST,1996,2022
3,Expenditure_on_education_media,SE.XPD.TOTL.GD.ZS,1970,2023
4,GDP_per_capita_media,NY.GDP.PCAP.CD,1960,2023
5,population_media,SP.POP.TOTL,1960,2023
6,surface_area_media,AG.SRF.TOTL.K2,1961,2021


In [11]:
# Names of the indicator columns, in the order they were processed.
metadata["Indicador"].tolist()

['Access_to_electricity_media',
 'AFFVA_media',
 'Control_of_Corruption_media',
 'Expenditure_on_education_media',
 'GDP_per_capita_media',
 'population_media',
 'surface_area_media']