## Import Libraries..

In [1]:
import json
import os
import warnings
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Data Engineering..!!

### As a Data Engineer, your job is to build datasets that Machine Learning Engineers and Data Scientists can turn into business value for your company. Whichever of these roles you work in, you should be familiar with the Extract-Transform-Load (ETL) pipeline.

## Extract-Transform-Load (ETL) pipeline..

### ETL is a process that lets you extract data from various sources, transform it to fit your requirements, and finally load it into a database or data format of your choice.

# Start Working..

## Fetch happiness data from Wikipedia

In [2]:
# Download the World Happiness Report page. The timeout keeps a stalled
# connection from hanging the notebook indefinitely.
html_data = requests.get('https://en.wikipedia.org/wiki/World_Happiness_Report', timeout=30)
print(html_data.status_code)

# Stop here on a 4xx/5xx answer instead of parsing an error page further down.
html_data.raise_for_status()

200


In [42]:
# Parse the downloaded page with BeautifulSoup.
soup = BeautifulSoup(html_data.text, 'html.parser')

# Collect every <table> element carrying the "wikitable" class.
wiki_tables = soup.find_all('table', {'class': 'wikitable'})
if not wiki_tables:
    # Give a readable error instead of a bare IndexError on the lookup below.
    raise ValueError("No table with class 'wikitable' found in the downloaded page")

# The first matching table is the one converted to a dataframe.
table = wiki_tables[0]

# pd.read_html expects a path, URL or file-like object; passing the markup as
# a plain string is deprecated (pandas >= 2.1), so wrap it in StringIO.
table_data = pd.read_html(StringIO(str(table)))
happy_df = pd.DataFrame(table_data[0])

In [43]:
# Number of "wikitable" tables found on the page (only the first one is used).
len(wiki_tables)

5

In [44]:
# Preview the first rows of the scraped happiness table.
happy_df.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.809,1.285,1.5,0.961,0.662,0.16,0.478
1,2,Denmark,7.646,1.327,1.503,0.979,0.665,0.243,0.495
2,3,Switzerland,7.56,1.391,1.472,1.041,0.629,0.269,0.408
3,4,Iceland,7.504,1.327,1.548,1.001,0.662,0.362,0.145
4,5,Norway,7.488,1.424,1.495,1.008,0.67,0.288,0.434


In [45]:
# (rows, columns) of the scraped happiness table.
happy_df.shape

(153, 9)

## Fetch population data from RapidAPI..

In [46]:
from tqdm import tqdm

In [61]:
# Population endpoint (host world-population.p.rapidapi.com) used for the per-country lookups.
url = 'https://world-population.p.rapidapi.com/population'

In [62]:
# Read the RapidAPI key from the RAPIDAPI_KEY environment variable instead of
# committing it to the notebook. The key that used to be hardcoded here has been
# exposed in the file history and should be rotated.
rapidapi_key = os.environ.get('RAPIDAPI_KEY', '')
if not rapidapi_key:
    # Warn early so a failing lookup later can be traced back to the missing key.
    warnings.warn("RAPIDAPI_KEY is not set; requests to the population API are expected to fail.")

# Request headers sent with every call to the population API.
headers = {
    'x-rapidapi-host': 'world-population.p.rapidapi.com',
    'x-rapidapi-key': rapidapi_key
  }

In [63]:
# Add a 'Population' column filled with NaN to the existing dataframe.
# Note: this mutates happy_df in place.
happy_df['Population'] = np.nan

In [65]:
# Look up the population of every country in the table, one request per country.
for country in tqdm(happy_df['Country or region'].to_list()):
    # Query parameters for the API.
    query_str = {'country_name': country}

    # The timeout stops a single stalled request from blocking the whole loop.
    response = requests.get(url, headers=headers, params=query_str, timeout=30)

    # Best effort: a reply that is not valid JSON (e.g. an HTML error page) is
    # skipped and the row is left untouched.
    try:
        resp_dict = response.json()
    except ValueError:
        continue

    # Store the value only when the API reports success and actually sends a
    # population figure; a reply without "ok" no longer raises KeyError.
    if isinstance(resp_dict, dict) and resp_dict.get('ok'):
        population = (resp_dict.get('body') or {}).get('population')
        if population is not None:
            happy_df.loc[happy_df['Country or region'] == country, 'Population'] = population

100%|████████████████████████████████████████████████████████████████████████████████| 153/153 [03:11<00:00,  1.23s/it]


In [66]:
# Preview the table again, now including the 'Population' column.
happy_df.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Population
0,1,Finland,7.809,1.285,1.5,0.961,0.662,0.16,0.478,5540720.0
1,2,Denmark,7.646,1.327,1.503,0.979,0.665,0.243,0.495,5792202.0
2,3,Switzerland,7.56,1.391,1.472,1.041,0.629,0.269,0.408,8654622.0
3,4,Iceland,7.504,1.327,1.548,1.001,0.662,0.362,0.145,341243.0
4,5,Norway,7.488,1.424,1.495,1.008,0.67,0.288,0.434,5421241.0


In [67]:
# (rows, columns) after adding the 'Population' column.
happy_df.shape

(153, 10)

## Fetch average age data from the worlddata.info website

In [68]:
# Download the average-age page. The timeout keeps a stalled connection from
# hanging the notebook indefinitely.
html_data = requests.get("https://www.worlddata.info/average-age.php", timeout=30)
print(html_data.status_code)

# Stop here on a 4xx/5xx answer instead of parsing an error page further down.
html_data.raise_for_status()

200


In [69]:
# Parse the downloaded average-age page with BeautifulSoup.
soup = BeautifulSoup(html_data.text, 'html.parser')

# Collect the tables with class "std100 hover". The name wiki_tables is reused
# from the Wikipedia step, but these tables come from worlddata.info.
wiki_tables = soup.find_all('table', {'class': 'std100 hover'})
if not wiki_tables:
    # Give a readable error instead of a bare IndexError on the lookup below.
    raise ValueError("No table with class 'std100 hover' found in the downloaded page")

# The first matching table is the one converted to a dataframe.
age_table = wiki_tables[0]

# pd.read_html expects a path, URL or file-like object; passing the markup as
# a plain string is deprecated (pandas >= 2.1), so wrap it in StringIO.
age_data = pd.read_html(StringIO(str(age_table)))
avg_age_df = pd.DataFrame(age_data[0])

In [70]:
# Preview the first rows of the scraped average-age table.
avg_age_df.head()

Unnamed: 0,Country,Median agein years,Population under20 years old,Life expectancyin years
0,Japan,45.9,17.1 %,84.5
1,Germany,45.5,18.8 %,81.0
2,Italy,44.3,17.9 %,83.3
3,Bulgaria,43.0,19.1 %,75.1
4,Greece,42.8,18.9 %,82.0


In [71]:
# (rows, columns) of the scraped average-age table.
avg_age_df.shape

(117, 4)

In [72]:
# Combine both tables on the country name: index each frame by its country
# column, left-join the age data onto the happiness data, then turn the index
# back into a regular column.
final_df = (
    happy_df
    .set_index('Country or region')
    .join(avg_age_df.set_index('Country'), how='left')
    .reset_index()
)

In [73]:
# Preview the joined table; rows without a match in the age table show NaN
# in the age-related columns.
final_df.head()

Unnamed: 0,Country or region,Overall rank,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Population,Median agein years,Population under20 years old,Life expectancyin years
0,Finland,1,7.809,1.285,1.5,0.961,0.662,0.16,0.478,5540720.0,42.3,21.4 %,81.9
1,Denmark,2,7.646,1.327,1.503,0.979,0.665,0.243,0.495,5792202.0,41.1,22.3 %,81.3
2,Switzerland,3,7.56,1.391,1.472,1.041,0.629,0.269,0.408,8654622.0,42.0,19.9 %,83.8
3,Iceland,4,7.504,1.327,1.548,1.001,0.662,0.362,0.145,341243.0,,,
4,Norway,5,7.488,1.424,1.495,1.008,0.67,0.288,0.434,5421241.0,,,


In [74]:
# (rows, columns) of the joined table.
final_df.shape

(153, 13)

In [84]:
# Derive a 'GDP' column as the per-capita figure multiplied by the population.
# NOTE(review): the scraped 'GDP per capita' values are small (about 1.3 in the
# preview), so they may be a score component rather than a monetary amount —
# confirm before interpreting this product as an actual GDP.
final_df['GDP'] = final_df['GDP per capita'].mul(final_df['Population'])

# Remove % sign of Population under 20 years old column and convert it to be of type float
def transform(column_val):
    """Convert a percentage string such as '17.1 %' to the float 17.1.

    Parameters
    ----------
    column_val : object
        A single cell value. Strings may carry a '%' sign with or without a
        space in front of it.

    Returns
    -------
    float or object
        The numeric value when ``column_val`` is a string that parses as a
        number once the '%' sign is removed; otherwise ``column_val`` is
        returned unchanged (e.g. NaN for missing rows, or unparsable text).
    """
    # Non-string cells (NaN, None, numbers) have nothing to strip.
    if not isinstance(column_val, str):
        return column_val
    try:
        # float() tolerates surrounding whitespace, so only the sign is removed.
        return float(column_val.replace('%', ''))
    except ValueError:
        # Catch only the parse failure instead of a bare except, which would
        # also swallow KeyboardInterrupt and hide unrelated bugs.
        return column_val
    
# Append the cleaned numeric percentage column and drop the raw text column
# it was derived from.
final_df = (
    final_df
    .assign(**{
        "Population under 20 years old in %": final_df["Population under20 years old"].apply(transform)
    })
    .drop(columns=["Population under20 years old"])
)

In [85]:
# Preview the table after adding 'GDP' and the numeric under-20 percentage column.
final_df.head()

Unnamed: 0,Country or region,Overall rank,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Population,Median agein years,Life expectancyin years,GDP,Population under 20 years old in %
0,Finland,1,7.809,1.285,1.5,0.961,0.662,0.16,0.478,5540720.0,42.3,81.9,7119825.0,21.4
1,Denmark,2,7.646,1.327,1.503,0.979,0.665,0.243,0.495,5792202.0,41.1,81.3,7686252.0,22.3
2,Switzerland,3,7.56,1.391,1.472,1.041,0.629,0.269,0.408,8654622.0,42.0,83.8,12038580.0,19.9
3,Iceland,4,7.504,1.327,1.548,1.001,0.662,0.362,0.145,341243.0,,,452829.5,
4,Norway,5,7.488,1.424,1.495,1.008,0.67,0.288,0.434,5421241.0,,,7719847.0,


In [86]:
# (rows, columns) of the transformed table.
final_df.shape

(153, 14)

## Finally, load our final dataset..

In [87]:
def load_dataset(dataset, path="final_dataset.csv"):
    """Write the final dataset to a CSV file (the "load" step of the pipeline).

    Parameters
    ----------
    dataset : pandas.DataFrame
        The frame to persist.
    path : str or path-like, optional
        Destination file. Defaults to "final_dataset.csv" in the current
        working directory, which is what this function always wrote before
        the parameter existed.

    Returns
    -------
    None
    """
    # index=False keeps the positional row index out of the file.
    dataset.to_csv(path, index=False)

In [88]:
# Persist the final table to final_dataset.csv.
load_dataset(final_df)

# THE END..!!