# **World Bank Social Development Dataset:** *ETL*
### Source: [World Bank Social Development](https://data.worldbank.org/topic/social-development?view=chart)

In [121]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import requests
import seaborn as sns

### **(E)xtract the Data:**
**REQUIRED:** *Download the data from [here](https://data.worldbank.org/topic/social-development?view=chart) and save the CSV files in the `data` folder.*

In [122]:
# Load the indicator values and the country metadata from the local `data` folder
data = pd.read_csv('data/API_15_DS2_en_csv_v2_4254232.csv')
region_codes = pd.read_csv('data/Metadata_Country_API_15_DS2_en_csv_v2_4254232.csv')

# Attach the metadata columns to every indicator row.
# NOTE: an inner join silently drops indicator rows whose 'Country Code'
# has no match in the metadata file.
data = data.merge(region_codes, how='inner', on='Country Code')

# Preview the first rows to confirm the load. A bare last expression uses the
# notebook's rich table display instead of print()'s wrapped plain text.
data.head()

  Country Name Country Code  \
0        Aruba          ABW   
1        Aruba          ABW   
2        Aruba          ABW   
3        Aruba          ABW   
4        Aruba          ABW   

                                      Indicator Name     Indicator Code  \
0             Life expectancy at birth, male (years)  SP.DYN.LE00.MA.IN   
1           Life expectancy at birth, female (years)  SP.DYN.LE00.FE.IN   
2  Adolescent fertility rate (births per 1,000 wo...        SP.ADO.TFRT   
3  Refugee population by country or territory of ...     SM.POP.REFG.OR   
4  Refugee population by country or territory of ...        SM.POP.REFG   

       1960      1961    1962     1963     1964     1965  ...     2018  \
0   64.0840   64.4290  64.747  65.0530  65.3560  65.6600  ...  73.6280   
1   67.1290   67.6250  68.065  68.4580  68.8170  69.1650  ...  78.5070   
2  106.2062  102.8116  99.417  94.4542  89.4914  84.5286  ...  21.1736   
3       NaN       NaN     NaN      NaN      NaN      NaN  ...     

### **(T)ransform the Data:**
- Remove **nonessential** columns: `Indicator Code`, `Country Code`, `SpecialNotes`, `TableName`
- Convert individual year columns to single **`Year`** column
- Handle **`na`** values
- Cast values to the correct **types**
- Rename `Country Name` and `Indicator Name` to `Country` and `Indicator` for consistent naming

In [123]:
# Remove nonessential columns.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
data = data.drop(
    columns=['Indicator Code', 'Country Code', 'SpecialNotes', 'TableName'],
    errors='ignore',
)

# Preview the first rows to confirm the columns were removed. A bare last
# expression uses the notebook's rich table display.
data.head()

Unnamed: 0,Country Name,Indicator Name,1960,1961,1962,1963,1964,1965,1966,1967,...,2016,2017,2018,2019,2020,2021,Unnamed: 66,Region,IncomeGroup,Unnamed: 5
0,Aruba,"Life expectancy at birth, male (years)",64.084,64.429,64.747,65.053,65.356,65.66,65.959,66.242,...,73.348,73.488,73.628,73.768,73.908,,,Latin America & Caribbean,High income,
1,Aruba,"Life expectancy at birth, female (years)",67.129,67.625,68.065,68.458,68.817,69.165,69.526,69.917,...,78.237,78.372,78.507,78.641,78.774,,,Latin America & Caribbean,High income,
2,Aruba,"Adolescent fertility rate (births per 1,000 wo...",106.2062,102.8116,99.417,94.4542,89.4914,84.5286,79.5658,74.603,...,23.8416,22.674,21.1736,19.6732,18.1728,,,Latin America & Caribbean,High income,
3,Aruba,Refugee population by country or territory of ...,,,,,,,,,...,,,,,,,,Latin America & Caribbean,High income,
4,Aruba,Refugee population by country or territory of ...,,,,,,,,,...,,,,,,,,Latin America & Caribbean,High income,


In [124]:
# Convert the individual year columns into one Year / Value pair (wide -> long)
id_columns = ['Country Name', 'Indicator Name', 'Region', 'IncomeGroup']

# Year columns are the ones whose header is purely numeric ('1960', '1961', ...).
# Detecting them avoids hard-coding the range covered by one particular download.
year_columns = [column for column in data.columns if str(column).isdigit()]
assert year_columns, "no year columns found - has this cell already been run?"

# melt() stacks the year columns in their original order (all rows for the first
# year, then the next, ...) and gives the result a fresh, unique index instead of
# repeating the wide frame's index once per year.
# Columns that are neither id columns nor year columns are left out of the result.
data = data.melt(
    id_vars=id_columns,
    value_vars=year_columns,
    var_name='Year',
    value_name='Value',
)

# The year labels come from the column headers, so they are strings at this point
data['Year'] = data['Year'].astype('int64')

# Summarize columns, dtypes and non-null counts. info() prints its own report and
# returns None, so wrapping it in print() would add a stray "None" line.
data.info()

In [126]:
# Handle na values

# Region and IncomeGroup: label missing entries explicitly.
# NOTE(review): pandas' CSV reader treats the literal 'N/A' as missing by default,
# so this label turns back into NaN if the frame is written to CSV and re-read
# with default settings.
for column in ('Region', 'IncomeGroup'):
    data[column] = data[column].fillna('N/A')

# Value: leave na values as na

# Check to see if handled. info() prints its own report and returns None,
# so wrapping it in print() would add a stray "None" line.
data.info()

In [127]:
# Make the dtype of every descriptive (text) column explicit in a single pass
text_columns = ['Country Name', 'Indicator Name', 'Region', 'IncomeGroup']
data = data.astype({name: str for name in text_columns})

In [133]:
# Shorten the two descriptive headers so column naming is consistent
column_aliases = {
    'Country Name': 'Country',
    'Indicator Name': 'Indicator',
}
data = data.rename(columns=column_aliases)

# Preview the renamed frame
data.head()