### Employment

To-dos:
- Rename columns
- Drop first row
- Drop last two columns
- Check whether the 'Total' values for labour force participation and unemployment rate already account for the male and female values.
- If so, drop the female + male rows.
- Check empty values.
- Reset index.

In [1]:
import pandas as pd
import numpy as np
import pycountry

# Load the raw labour-force / unemployment CSV. The file's first line is a
# table title, so pandas takes it as the header and the real column labels
# (Region/Country/Area, Year, Series, ...) arrive as data row 0.
df3 = pd.read_csv(
    '../data/raw/SYB66_329_202310_Labour_unem.csv',
    encoding='iso-8859-1',
)
display(df3)

Unnamed: 0,T17,Labour force participation and unemployment,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,Region/Country/Area,,Year,Series,Value,Footnotes,Source
1,1,"Total, all countries or areas",2005,Labour force participation - Total,62.9,Estimate.,"International Labour Organization (ILO), Genev..."
2,1,"Total, all countries or areas",2005,Unemployment rate - Total,6.3,Estimate.,"International Labour Organization (ILO), Genev..."
3,1,"Total, all countries or areas",2005,Labour force participation - Male,76.1,Estimate.,"International Labour Organization (ILO), Genev..."
4,1,"Total, all countries or areas",2005,Unemployment rate - Male,6.2,Estimate.,"International Labour Organization (ILO), Genev..."
...,...,...,...,...,...,...,...
5482,97,European Union (EU),2023,Unemployment rate - Total,6.3,Estimate.,"International Labour Organization (ILO), Genev..."
5483,97,European Union (EU),2023,Labour force participation - Male,63.9,Estimate.,"International Labour Organization (ILO), Genev..."
5484,97,European Union (EU),2023,Unemployment rate - Male,6.0,Estimate.,"International Labour Organization (ILO), Genev..."
5485,97,European Union (EU),2023,Labour force participation - Female,51.7,Estimate.,"International Labour Organization (ILO), Genev..."


In [2]:
# Inspect the raw headers: only the first two carry text from the title row,
# the remaining ones are pandas' 'Unnamed: N' placeholders.
print(df3.columns)

Index(['T17', 'Labour force participation and unemployment', 'Unnamed: 2',
       'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6'],
      dtype='object')


### Rows and Columns
- Drop row with index 0
- Drop last two columns
- Rename columns

In [3]:
# Row 0 holds the real column labels rather than data, so it is removed.
# errors='ignore' keeps the cell re-runnable after the row is already gone.
df3 = df3.drop(index=0, errors='ignore')

# 'Unnamed: 5' / 'Unnamed: 6' are the last two columns (Footnotes and Source
# according to the label row). errors='ignore' is used here as well so that
# re-running the cell does not raise KeyError once the columns are dropped,
# matching the row drop above.
df3 = df3.drop(columns=['Unnamed: 5', 'Unnamed: 6'], errors='ignore')

display(df3)
print(df3.index)

Unnamed: 0,T17,Labour force participation and unemployment,Unnamed: 2,Unnamed: 3,Unnamed: 4
1,1,"Total, all countries or areas",2005,Labour force participation - Total,62.9
2,1,"Total, all countries or areas",2005,Unemployment rate - Total,6.3
3,1,"Total, all countries or areas",2005,Labour force participation - Male,76.1
4,1,"Total, all countries or areas",2005,Unemployment rate - Male,6.2
5,1,"Total, all countries or areas",2005,Labour force participation - Female,49.8
...,...,...,...,...,...
5482,97,European Union (EU),2023,Unemployment rate - Total,6.3
5483,97,European Union (EU),2023,Labour force participation - Male,63.9
5484,97,European Union (EU),2023,Unemployment rate - Male,6.0
5485,97,European Union (EU),2023,Labour force participation - Female,51.7


RangeIndex(start=1, stop=5487, step=1)


### Rename columns

In [4]:
# Swap the placeholder headers inherited from the raw file for descriptive
# snake_case names.
df3 = df3.rename(
    {
        'T17': 'country_code',
        'Labour force participation and unemployment': 'participation_area',
        'Unnamed: 2': 'year',
        'Unnamed: 3': 'statistic_type',
        'Unnamed: 4': 'statistic_value',
    },
    axis='columns',
)

### Remove rows for gender-specific statistics (the 'Total' series for each year already covers both sexes)

In [5]:
# Series labels for the male/female breakdowns; only the '- Total' series are kept.
rows_to_drop = [
    'Labour force participation - Female',
    'Labour force participation - Male',
    'Unemployment rate - Female',
    'Unemployment rate - Male',
]

# Boolean-mask selection returns a frame derived from the original; take an
# explicit copy so that later column assignments on df3 operate on an
# independent frame and do not trigger SettingWithCopyWarning.
df3 = df3[~df3['statistic_type'].isin(rows_to_drop)].copy()

### Country codes
- Find unique values
- Use `pycountry` to find the equivalent names
- Add 'area' column with the names
- Identify the rows whose 'area' is 'Unknown' by comparing them with 'participation_area'
- Create replacement dictionary, implement it

In [6]:
# List the distinct area labels; the column mixes regional aggregates
# (e.g. 'Africa', 'Western Europe') with individual countries.
df3['participation_area'].unique()

array(['Total, all countries or areas', 'Africa', 'Northern Africa',
       'Sub-Saharan Africa', 'Eastern Africa', 'Middle Africa',
       'Southern Africa', 'Western Africa', 'Americas',
       'Northern America', 'Latin America & the Caribbean', 'Caribbean',
       'Central America', 'South America', 'Central Asia', 'Eastern Asia',
       'South-eastern Asia', 'Southern Asia', 'Western Asia', 'Caucasus',
       'Eastern Europe', 'Northern Europe', 'Southern Europe',
       'Western Europe', 'Oceania', 'Afghanistan', 'Albania', 'Algeria',
       'American Samoa', 'Angola', 'Argentina', 'Armenia', 'Aruba',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin',
       'Bermuda', 'Bhutan', 'Bolivia (Plurin. State of)',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi',
       'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Cayman I

### Parse country codes, check for the area names with `pycountry`, update the dataframe

In [7]:
def transform_country_codes(code):
    """Left-pad a one- or two-character code with zeros to three characters.

    Codes of any other length (including the empty string) are returned
    unchanged.
    """
    # Prefix needed for each short length; other lengths have no entry.
    padding = {1: '00', 2: '0'}.get(len(code))
    return code if padding is None else padding + code

def get_country_name(country_code):
    """Return the pycountry name for a code, or "Unknown" when nothing matches.

    The code is looked up first against the `numeric` field and, if that
    yields nothing, against the `alpha_2` field. A KeyError raised during
    the lookup is also reported as "Unknown".
    """
    try:
        match = (
            pycountry.countries.get(numeric=country_code)  # check for country code
            or pycountry.countries.get(alpha_2=country_code)
        )
        return "Unknown" if not match else match.name
    except KeyError:
        return "Unknown"

# Apply the helpers to the dataset: pad every code to three characters first,
# then resolve a name from the padded code.
df3['country_code'] = df3['country_code'].map(transform_country_codes)
df3['area'] = df3['country_code'].map(get_country_name)

df3.head(100)

Unnamed: 0,country_code,participation_area,year,statistic_type,statistic_value,area
1,001,"Total, all countries or areas",2005,Labour force participation - Total,62.9,Unknown
2,001,"Total, all countries or areas",2005,Unemployment rate - Total,6.3,Unknown
7,001,"Total, all countries or areas",2010,Labour force participation - Total,62.0,Unknown
8,001,"Total, all countries or areas",2010,Unemployment rate - Total,6.3,Unknown
13,001,"Total, all countries or areas",2015,Labour force participation - Total,60.7,Unknown
...,...,...,...,...,...,...
284,029,Caribbean,2023,Unemployment rate - Total,7.8,Unknown
289,013,Central America,2005,Labour force participation - Total,60.7,Unknown
290,013,Central America,2005,Unemployment rate - Total,4.0,Unknown
295,013,Central America,2010,Labour force participation - Total,60.7,Unknown


In [None]:
### Replace 'Unknown' values according to the dictionary

In [8]:
# Hand-written names keyed by zero-padded code, for codes the pycountry lookup
# left as 'Unknown' (mostly regional aggregates such as '001' or '029').
# NOTE(review): names were presumably taken from 'participation_area' — confirm.
replace_dict = {
    "001": "World",
    "002": "Africa",
    "005": "South America",
    "009": "Oceania",
    "011": "Western Africa",
    "013": "Central America",
    "014": "Eastern Africa",
    "015": "Northern Africa",
    "017": "Middle Africa",
    "018": "Southern Africa",
    "019": "Americas",
    "021": "Northern America",
    "029": "Caribbean",
    "030": "Eastern Asia",
    "034": "Southern Asia",
    "035": "South-eastern Asia",
    "039": "Southern Europe",
    "097": "European Union",
    "134": "Caucasus",
    "143": "Central Asia",
    "145": "Western Asia",
    "151": "Eastern Europe",
    "154": "Northern Europe",
    "155": "Western Europe",
    "202": "Sub-Saharan Africa",
    "412": "Kosovo",
    "419": "Latin America and the Caribbean",
    "530": "Netherlands Antilles [former]",
    "830": "Channel Islands"
}

# Rows whose code is a key of replace_dict take the dictionary name; map()
# yields NaN for every other code, and fillna() restores the existing 'area'
# value for those rows.
df3.loc[:,'area'] = df3['country_code'].map(replace_dict).fillna(df3['area'])
display(df3)  

Unnamed: 0,country_code,participation_area,year,statistic_type,statistic_value,area
1,001,"Total, all countries or areas",2005,Labour force participation - Total,62.9,World
2,001,"Total, all countries or areas",2005,Unemployment rate - Total,6.3,World
7,001,"Total, all countries or areas",2010,Labour force participation - Total,62.0,World
8,001,"Total, all countries or areas",2010,Unemployment rate - Total,6.3,World
13,001,"Total, all countries or areas",2015,Labour force participation - Total,60.7,World
...,...,...,...,...,...,...
5470,097,European Union (EU),2010,Unemployment rate - Total,9.8,European Union
5475,097,European Union (EU),2015,Labour force participation - Total,56.8,European Union
5476,097,European Union (EU),2015,Unemployment rate - Total,10.0,European Union
5481,097,European Union (EU),2023,Labour force participation - Total,57.6,European Union
