In [1]:
import pandas as pd

### World Bank

**Country Page:** https://data.worldbank.org/country/vietnam

Data Catalog: https://api.worldbank.org/v2/en/country/VNM?downloadformat=csv

In [2]:
import requests
import zipfile
import io
import pandas as pd

# URL of the zip file
url = 'https://api.worldbank.org/v2/en/country/VNM?downloadformat=csv'

# Send an HTTP request and get the zip file content
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Extract the CSV file starting with 'API' from the zip file
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
        csv_filename = None
        for file_name in zip_ref.namelist():
            if file_name.startswith('API') and file_name.endswith('.csv'):
                csv_filename = file_name
                break
        
        if csv_filename:
            # Extract the CSV file
            zip_ref.extract(csv_filename)
            csv_file = csv_filename
        else:
            print('No CSV file starting with "API" found in the zip file.')
            exit(1)
else:
    print('Failed to download the zip file.')
    exit(1)

In [3]:
# Read the CSV file into a pandas DataFrame, ignore first two columns
raw_data = pd.read_csv(csv_file, header=2, index_col=False)
raw_data = raw_data.drop(raw_data.columns[[0, 1]], axis=1)  # drop country name & code
raw_data = raw_data.dropna(axis=1, how='all') # drop any columns that only contain missing values
raw_data.head(5)

Unnamed: 0,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,"Intentional homicides, male (per 100,000 male)",VC.IHR.PSRC.MA.P5,,,,,,,,,...,,,,,,,,,,
1,"Internally displaced persons, new displacement...",VC.IDP.NWCV,,,,,,,,,...,,,,,,,,,,
2,"Voice and Accountability: Percentile Rank, Low...",VA.PER.RNK.LOWER,,,,,,,,,...,5.164319,5.418719,5.418719,5.91133,5.91133,5.31401,6.763285,5.31401,7.729469,
3,Travel services (% of commercial service exports),TX.VAL.TRVL.ZS.WT,,,,,,,,,...,,,,,,,,,,
4,Commercial service exports (current US$),TX.VAL.SERV.CD.WT,,,,,,,,,...,10711000000.0,10970000000.0,11250000000.0,12500000000.0,13070000000.0,14790500000.0,16637000000.0,7600000000.0,3673000000.0,


### Extract Series

In [5]:
def filter_series(dataframe, keywords):
    """
    Filter a DataFrame based on a list of keywords present in the 'Indicator Name' column.

    Parameters:
    - dataframe (pd.DataFrame): The DataFrame to be filtered.
    - keywords (list): A list of keywords to search for in a case-insensitive manner.

    Returns:
    - pd.DataFrame: A filtered DataFrame containing rows where the 'Indicator Name' column
      contains all the specified keywords.

    Example:
    >>> keywords = ['population', 'total']
    >>> filtered_data = filter_series(raw_data, keywords)
    >>> print(filtered_data)
       Indicator Name Indicator Code      1960      1961      1962      1963      1964
    0  Population, total   SP.POP.TOTL  50396429  51882769  53461661  55094115  56774465
    """
    # Convert keywords to lowercase for case-insensitive search
    keywords_lower = [keyword.lower() for keyword in keywords]
    
    # Initialize a mask with True for all rows
    mask = pd.Series(True, index=dataframe.index)
    
    # Apply the mask for each keyword
    for keyword in keywords_lower:
        mask &= dataframe['Indicator Name'].str.lower().str.contains(keyword)
    
    # Filter the DataFrame using the combined mask
    filtered_data = dataframe[mask]
    
    return filtered_data

In [9]:
def extract_series(row_index):
    '''
    pass in index of row to extract from `raw_data` and return a df in long format
    '''
    # extract row index 1223
    df_wide = raw_data.iloc[[row_index]].reset_index(drop=True)

    print(df_wide.iloc[0, 0], '\n') # print the first value of the first column
    print(df_wide)

    # drop NaN columns
    df_wide = df_wide.dropna(axis=1, how='all')

    # Convert the DataFrame from wide to long format
    df_long = pd.melt(df_wide, id_vars=['Indicator Name', 'Indicator Code'], var_name='date', value_name='value')

    # create new df with only `date` and `value` columns
    df = df_long[['date', 'value']]

    # convert `date` column to datetime (using .loc to avoid copy warning)
    df.loc[:, 'date'] = pd.to_datetime(df['date'], format='%Y').dt.strftime("%Y-%m-%d")

    print(df.info())

    return df

##### Population

In [8]:
filter_series(raw_data, ['Population, total'])

Unnamed: 0,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
981,"Population, total",SP.POP.TOTL,32718461.0,33621982.0,34533889.0,35526727.0,36509166.0,37466077.0,38388210.0,39282564.0,...,90267739.0,91235504.0,92191398.0,93126529.0,94033048.0,94914330.0,95776716.0,96648685.0,97468029.0,98186856.0


In [12]:
df = extract_series(981)

df.loc[:, 'value'] = df['value'].astype(int)

Population, total 

      Indicator Name Indicator Code        1960        1961        1962  \
0  Population, total    SP.POP.TOTL  32718461.0  33621982.0  34533889.0   

         1963        1964        1965        1966        1967  ...  \
0  35526727.0  36509166.0  37466077.0  38388210.0  39282564.0  ...   

         2013        2014        2015        2016        2017        2018  \
0  90267739.0  91235504.0  92191398.0  93126529.0  94033048.0  94914330.0   

         2019        2020        2021        2022  
0  95776716.0  96648685.0  97468029.0  98186856.0  

[1 rows x 65 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    63 non-null     object 
 1   value   63 non-null     float64
dtypes: float64(1), object(1)
memory usage: 1.1+ KB
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'date'] = pd.to_datetime(df['date'], format='%Y').dt.strftime("%Y-%m-%d")
  df.loc[:, 'value'] = df['value'].astype(int)


In [19]:
# export to csv and json
df.to_csv('data/popu.csv', index=False)
df.to_json('data/popu.json', orient='records')

---

##### Unemployment

In [15]:
filter_series(raw_data, ['Unemployment, total'])

Unnamed: 0,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
579,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,,,,,,,,,...,1.32,1.26,1.85,1.85,1.87,1.16,1.68,2.1,2.38,1.923
1310,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.NE.ZS,,,,,,,,,...,1.32,1.26,1.85,1.85,1.87,1.16,1.68,2.1,2.38,1.54


In [17]:
df = extract_series(579)

Unemployment, total (% of total labor force) (modeled ILO estimate) 

                                      Indicator Name  Indicator Code  1960  \
0  Unemployment, total (% of total labor force) (...  SL.UEM.TOTL.ZS   NaN   

   1961  1962  1963  1964  1965  1966  1967  ...  2013  2014  2015  2016  \
0   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...  1.32  1.26  1.85  1.85   

   2017  2018  2019  2020  2021   2022  
0  1.87  1.16  1.68   2.1  2.38  1.923  

[1 rows x 65 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    32 non-null     object 
 1   value   32 non-null     float64
dtypes: float64(1), object(1)
memory usage: 640.0+ bytes
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'date'] = pd.to_datetime(df['date'], format='%Y').dt.strftime("%Y-%m-%d")


In [20]:
# export to csv and json
df.to_csv('data/unem.csv', index=False)
df.to_json('data/unem.json', orient='records')

---

##### Employment

In [22]:
filter_series(raw_data, ['employment', 'population ratio', '15\+', 'total'])

Unnamed: 0,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
857,"Employment to population ratio, 15+, total (%)...",SL.EMP.TOTL.SP.NE.ZS,,,,,,,,,...,76.01,76.08,75.77,75.14,74.7,74.74,74.36,71.74,71.16,71.96
1197,"Employment to population ratio, 15+, total (%)...",SL.EMP.TOTL.SP.ZS,,,,,,,,,...,76.003,76.069,75.772,75.144,74.706,74.733,74.359,71.741,71.165,72.012


In [24]:
df = extract_series(1197)

Employment to population ratio, 15+, total (%) (modeled ILO estimate) 

                                      Indicator Name     Indicator Code  1960  \
0  Employment to population ratio, 15+, total (%)...  SL.EMP.TOTL.SP.ZS   NaN   

   1961  1962  1963  1964  1965  1966  1967  ...    2013    2014    2015  \
0   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...  76.003  76.069  75.772   

     2016    2017    2018    2019    2020    2021    2022  
0  75.144  74.706  74.733  74.359  71.741  71.165  72.012  

[1 rows x 65 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    32 non-null     object 
 1   value   32 non-null     float64
dtypes: float64(1), object(1)
memory usage: 640.0+ bytes
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'date'] = pd.to_datetime(df['date'], format='%Y').dt.strftime("%Y-%m-%d")


In [26]:
# export to csv and json
df.to_csv('data/empl.csv', index=False)
df.to_json('data/empl.json', orient='records')

---

##### Participation

In [31]:
filter_series(raw_data, ['Participation rate', 'total', '15\+', 'ilo'])

Unnamed: 0,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
587,"Labor force participation rate, total (% of to...",SL.TLF.CACT.ZS,,,,,,,,,...,77.02,77.04,77.2,76.56,76.13,75.61,75.63,73.28,72.9,73.424


In [32]:
df = extract_series(587)

Labor force participation rate, total (% of total population ages 15+) (modeled ILO estimate) 

                                      Indicator Name  Indicator Code  1960  \
0  Labor force participation rate, total (% of to...  SL.TLF.CACT.ZS   NaN   

   1961  1962  1963  1964  1965  1966  1967  ...   2013   2014  2015   2016  \
0   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...  77.02  77.04  77.2  76.56   

    2017   2018   2019   2020  2021    2022  
0  76.13  75.61  75.63  73.28  72.9  73.424  

[1 rows x 65 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    33 non-null     object 
 1   value   33 non-null     float64
dtypes: float64(1), object(1)
memory usage: 656.0+ bytes
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'date'] = pd.to_datetime(df['date'], format='%Y').dt.strftime("%Y-%m-%d")


In [34]:
# export to csv and json
df.to_csv('data/part.csv', index=False)
df.to_json('data/part.json', orient='records')

---

##### Growth

In [36]:
filter_series(raw_data, ['growth', 'gdp'])

Unnamed: 0,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
1249,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG,,,,,,,,,...,4.423437,5.293391,5.877862,5.61868,5.909238,6.467176,6.392604,1.937355,1.69939,7.228984
1251,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,,,,,,,,,...,5.5535,6.422247,6.987167,6.690009,6.940188,7.464991,7.359281,2.865412,2.561551,8.019798


In [37]:
df = extract_series(1251)

GDP growth (annual %) 

          Indicator Name     Indicator Code  1960  1961  1962  1963  1964  \
0  GDP growth (annual %)  NY.GDP.MKTP.KD.ZG   NaN   NaN   NaN   NaN   NaN   

   1965  1966  1967  ...    2013      2014      2015      2016      2017  \
0   NaN   NaN   NaN  ...  5.5535  6.422247  6.987167  6.690009  6.940188   

       2018      2019      2020      2021      2022  
0  7.464991  7.359281  2.865412  2.561551  8.019798  

[1 rows x 65 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    38 non-null     object 
 1   value   38 non-null     float64
dtypes: float64(1), object(1)
memory usage: 736.0+ bytes
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'date'] = pd.to_datetime(df['date'], format='%Y').dt.strftime("%Y-%m-%d")


In [39]:
# export to csv and json
df.to_csv('data/grow.csv', index=False)
df.to_json('data/grow.json', orient='records')

---

##### Inflation

In [40]:
filter_series(raw_data, ['Inflation, consumer prices'])

Unnamed: 0,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
393,"Inflation, consumer prices (annual %)",FP.CPI.TOTL.ZG,,,,,,,,,...,6.592675,4.084554,0.631201,2.668248,3.520257,3.539628,2.795824,3.220934,1.834716,3.156507


In [41]:
df = extract_series(393)

Inflation, consumer prices (annual %) 

                          Indicator Name  Indicator Code  1960  1961  1962  \
0  Inflation, consumer prices (annual %)  FP.CPI.TOTL.ZG   NaN   NaN   NaN   

   1963  1964  1965  1966  1967  ...      2013      2014      2015      2016  \
0   NaN   NaN   NaN   NaN   NaN  ...  6.592675  4.084554  0.631201  2.668248   

       2017      2018      2019      2020      2021      2022  
0  3.520257  3.539628  2.795824  3.220934  1.834716  3.156507  

[1 rows x 65 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    27 non-null     object 
 1   value   27 non-null     float64
dtypes: float64(1), object(1)
memory usage: 560.0+ bytes
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'date'] = pd.to_datetime(df['date'], format='%Y').dt.strftime("%Y-%m-%d")


In [43]:
# export to csv and json
df.to_csv('data/infl.csv', index=False)
df.to_json('data/infl.json', orient='records')

---

##### Inequality

In [44]:
filter_series(raw_data, ['gini'])

Unnamed: 0,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
1208,Gini index,SI.POV.GINI,,,,,,,,,...,,34.8,,35.3,,35.7,,36.8,,


In [45]:
# Extract 'Gini index' at row 695
df = extract_series(1208)

Gini index 

  Indicator Name Indicator Code  1960  1961  1962  1963  1964  1965  1966  \
0     Gini index    SI.POV.GINI   NaN   NaN   NaN   NaN   NaN   NaN   NaN   

   1967  ...  2013  2014  2015  2016  2017  2018  2019  2020  2021  2022  
0   NaN  ...   NaN  34.8   NaN  35.3   NaN  35.7   NaN  36.8   NaN   NaN  

[1 rows x 65 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    12 non-null     object 
 1   value   12 non-null     float64
dtypes: float64(1), object(1)
memory usage: 320.0+ bytes
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'date'] = pd.to_datetime(df['date'], format='%Y').dt.strftime("%Y-%m-%d")


In [46]:
df

Unnamed: 0,date,value
0,1992-01-01,35.7
1,1997-01-01,35.4
2,2002-01-01,37.0
3,2004-01-01,36.8
4,2006-01-01,35.8
5,2008-01-01,35.6
6,2010-01-01,39.3
7,2012-01-01,35.6
8,2014-01-01,34.8
9,2016-01-01,35.3


In [47]:
# export to csv and json
df.to_csv('data/ineq.csv', index=False)
df.to_json('data/ineq.json', orient='records')