### Employment

To-dos:
- Rename columns
- Drop first row
- Drop last two columns
- Check whether the 'Total' series for labour force participation and unemployment rate already cover the male and female values.
- If so, drop the female and male rows.
- Check empty values.
- Reset index.

In [1]:
import pandas as pd
import numpy as np
import pycountry

# Read the raw labour-force / unemployment CSV (ISO-8859-1 encoded) and show it.
df3 = pd.read_csv(
    '../data/raw/SYB66_329_202310_Labour_unem.csv',
    encoding='iso-8859-1',
)
display(df3)

Unnamed: 0,T17,Labour force participation and unemployment,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,Region/Country/Area,,Year,Series,Value,Footnotes,Source
1,1,"Total, all countries or areas",2005,Labour force participation - Total,62.9,Estimate.,"International Labour Organization (ILO), Genev..."
2,1,"Total, all countries or areas",2005,Unemployment rate - Total,6.3,Estimate.,"International Labour Organization (ILO), Genev..."
3,1,"Total, all countries or areas",2005,Labour force participation - Male,76.1,Estimate.,"International Labour Organization (ILO), Genev..."
4,1,"Total, all countries or areas",2005,Unemployment rate - Male,6.2,Estimate.,"International Labour Organization (ILO), Genev..."
...,...,...,...,...,...,...,...
5482,97,European Union (EU),2023,Unemployment rate - Total,6.3,Estimate.,"International Labour Organization (ILO), Genev..."
5483,97,European Union (EU),2023,Labour force participation - Male,63.9,Estimate.,"International Labour Organization (ILO), Genev..."
5484,97,European Union (EU),2023,Unemployment rate - Male,6.0,Estimate.,"International Labour Organization (ILO), Genev..."
5485,97,European Union (EU),2023,Labour force participation - Female,51.7,Estimate.,"International Labour Organization (ILO), Genev..."


In [2]:
# Inspect the raw column labels before they are dropped/renamed below.
print(df3.columns)

Index(['T17', 'Labour force participation and unemployment', 'Unnamed: 2',
       'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6'],
      dtype='object')


### Rows and Columns
- Drop row with index 0
- Drop last two columns
- Rename columns

In [3]:
# Row 0 carries the real header labels (Region/Country/Area, Year, Series, ...)
# instead of data, so it is removed. errors='ignore' makes this a no-op when
# the row is already gone.
df3 = df3.drop(index=0, errors='ignore')

# 'Unnamed: 5' / 'Unnamed: 6' hold the Footnotes and Source text, which is not
# used further. errors='ignore' mirrors the row drop above so that re-running
# this cell does not raise KeyError once the columns have been removed.
df3 = df3.drop(columns=['Unnamed: 5', 'Unnamed: 6'], errors='ignore')

display(df3)
print(df3.index)

Unnamed: 0,T17,Labour force participation and unemployment,Unnamed: 2,Unnamed: 3,Unnamed: 4
1,1,"Total, all countries or areas",2005,Labour force participation - Total,62.9
2,1,"Total, all countries or areas",2005,Unemployment rate - Total,6.3
3,1,"Total, all countries or areas",2005,Labour force participation - Male,76.1
4,1,"Total, all countries or areas",2005,Unemployment rate - Male,6.2
5,1,"Total, all countries or areas",2005,Labour force participation - Female,49.8
...,...,...,...,...,...
5482,97,European Union (EU),2023,Unemployment rate - Total,6.3
5483,97,European Union (EU),2023,Labour force participation - Male,63.9
5484,97,European Union (EU),2023,Unemployment rate - Male,6.0
5485,97,European Union (EU),2023,Labour force participation - Female,51.7


RangeIndex(start=1, stop=5487, step=1)


In [4]:
# Swap the placeholder labels from the raw file for descriptive snake_case names.
df3 = df3.rename(
    {
        'T17': 'country_code',
        'Labour force participation and unemployment': 'participation_area',
        'Unnamed: 2': 'year',
        'Unnamed: 3': 'statistic_type',
        'Unnamed: 4': 'statistic_value',
    },
    axis='columns',
)

### Remove rows for gender-specific statistics (the 'Total' series for each year already summarizes them)

In [5]:
# Series labels for the gender-specific statistics that should be removed;
# whatever other series exist (the '... - Total' ones) are kept.
rows_to_drop = [
    'Labour force participation - Female',
    'Labour force participation - Male',
    'Unemployment rate - Female',
    'Unemployment rate - Male',
]

# .copy() turns the filtered result into an independent frame. Without it,
# assigning new columns to df3 in later cells operates on a slice of the
# previous frame and pandas emits SettingWithCopyWarning.
df3 = df3[~df3['statistic_type'].isin(rows_to_drop)].copy()

### Country codes
- Find unique values
- Use `pycountry` to find the equivalent names
- Add 'area' column with the names
- Identify the codes whose name resolves to 'Unknown' and compare them with 'participation_area'
- Create a replacement dictionary and apply it

In [6]:
# List the distinct codes (unpadded strings at this point, e.g. '1', '15')
# to see what has to be resolved to names.
df3['country_code'].unique()

array(['1', '2', '15', '202', '14', '17', '18', '11', '19', '21', '419',
       '29', '13', '5', '143', '30', '35', '34', '145', '134', '151',
       '154', '39', '155', '9', '4', '8', '12', '16', '24', '32', '51',
       '533', '36', '40', '31', '44', '48', '50', '52', '112', '56', '84',
       '204', '60', '64', '68', '70', '72', '76', '96', '100', '854',
       '108', '132', '116', '120', '124', '136', '140', '148', '830',
       '152', '156', '344', '446', '170', '174', '178', '184', '188',
       '384', '191', '192', '531', '196', '203', '408', '180', '208',
       '262', '214', '218', '818', '222', '226', '232', '233', '748',
       '231', '238', '234', '242', '246', '250', '254', '258', '266',
       '270', '268', '276', '288', '300', '304', '312', '316', '320',
       '324', '624', '328', '332', '340', '348', '352', '356', '360',
       '364', '368', '372', '833', '376', '380', '388', '392', '400',
       '398', '404', '296', '412', '414', '417', '418', '428', '422',
       '42

In [7]:
# Take the distinct codes straight from the data (in order of first
# appearance) instead of pasting the array printed by the previous cell, so
# the list cannot drift out of sync with the CSV.
country_codes = df3['country_code'].unique().tolist()

def transform_country_codes(code):
    """Left-pad a numeric area code with zeros to three characters.

    Args:
        code: The code as a string (e.g. '4', '97', '840') or as an integer.
            Surrounding whitespace is ignored.

    Returns:
        The code as a string of at least three characters ('4' -> '004',
        '97' -> '097', '840' -> '840'). An empty string is returned
        unchanged.
    """
    # str() lets integer codes through (len() on an int would raise
    # TypeError); strip() stops stray whitespace from counting as digits.
    text = str(code).strip()
    # Keep empty input empty rather than inventing a '000' code.
    return text.zfill(3) if text else text

# Zero-pad every code in the list
country_codes = list(map(transform_country_codes, country_codes))

def get_country_name(country_code):
    """Resolve a country code to its pycountry name, or "Unknown".

    The code is looked up first as a numeric code and, if that finds
    nothing, as an alpha-2 code.

    Args:
        country_code: Code to resolve, e.g. "840". Numeric codes are expected
            in the zero-padded form produced by transform_country_codes.

    Returns:
        The ``name`` of the matching pycountry record, or the string
        "Unknown" when neither lookup finds a record or the lookup fails.
    """
    try:
        country = pycountry.countries.get(numeric=country_code)
        if not country:
            country = pycountry.countries.get(alpha_2=country_code)
    except LookupError:
        # LookupError is the parent of KeyError, so the KeyError raised by
        # older pycountry releases for a missing code is still handled.
        # NOTE(review): recent pycountry releases raise a plain LookupError
        # for non-string values (e.g. NaN or int codes) -- confirm against
        # the installed version.
        return "Unknown"
    return country.name if country else "Unknown"

# Resolve each padded code to a display name
country_names = list(map(get_country_name, country_codes))

# Report every code next to the name it resolved to
for code, name in zip(country_codes, country_names):
    print(f"Country Code: {code} -> Country Name: {name}")


Country Code: 001 -> Country Name: Unknown
Country Code: 002 -> Country Name: Unknown
Country Code: 015 -> Country Name: Unknown
Country Code: 202 -> Country Name: Unknown
Country Code: 014 -> Country Name: Unknown
Country Code: 017 -> Country Name: Unknown
Country Code: 018 -> Country Name: Unknown
Country Code: 011 -> Country Name: Unknown
Country Code: 019 -> Country Name: Unknown
Country Code: 021 -> Country Name: Unknown
Country Code: 419 -> Country Name: Unknown
Country Code: 029 -> Country Name: Unknown
Country Code: 013 -> Country Name: Unknown
Country Code: 005 -> Country Name: Unknown
Country Code: 143 -> Country Name: Unknown
Country Code: 030 -> Country Name: Unknown
Country Code: 035 -> Country Name: Unknown
Country Code: 034 -> Country Name: Unknown
Country Code: 145 -> Country Name: Unknown
Country Code: 134 -> Country Name: Unknown
Country Code: 151 -> Country Name: Unknown
Country Code: 154 -> Country Name: Unknown
Country Code: 039 -> Country Name: Unknown
Country Cod

In [8]:
# Run both helpers on the real column: pad the codes first, then resolve the
# padded codes to names in a new 'area' column.
padded_codes = df3['country_code'].apply(transform_country_codes)
df3['country_code'] = padded_codes
df3['area'] = padded_codes.apply(get_country_name)

df3.tail(100)

Unnamed: 0,country_code,participation_area,year,statistic_type,statistic_value,area
5187,834,United Rep. of Tanzania,2015,Labour force participation - Total,84.0,"Tanzania, United Republic of"
5188,834,United Rep. of Tanzania,2015,Unemployment rate - Total,2.1,"Tanzania, United Republic of"
5193,834,United Rep. of Tanzania,2023,Labour force participation - Total,82.9,"Tanzania, United Republic of"
5194,834,United Rep. of Tanzania,2023,Unemployment rate - Total,2.9,"Tanzania, United Republic of"
5199,840,United States of America,2005,Labour force participation - Total,64.8,United States
...,...,...,...,...,...,...
5470,097,European Union (EU),2010,Unemployment rate - Total,9.8,Unknown
5475,097,European Union (EU),2015,Labour force participation - Total,56.8,Unknown
5476,097,European Union (EU),2015,Unemployment rate - Total,10.0,Unknown
5481,097,European Union (EU),2023,Labour force participation - Total,57.6,Unknown


In [9]:
# Rows whose code could not be resolved to a name
unknown_countries_df = df3.loc[df3['area'] == 'Unknown']

# Preview the unresolved rows, then list the distinct unresolved codes
display(unknown_countries_df.head(13))
unknown_countries_df['country_code'].unique()

Unnamed: 0,country_code,participation_area,year,statistic_type,statistic_value,area
1,1,"Total, all countries or areas",2005,Labour force participation - Total,62.9,Unknown
2,1,"Total, all countries or areas",2005,Unemployment rate - Total,6.3,Unknown
7,1,"Total, all countries or areas",2010,Labour force participation - Total,62.0,Unknown
8,1,"Total, all countries or areas",2010,Unemployment rate - Total,6.3,Unknown
13,1,"Total, all countries or areas",2015,Labour force participation - Total,60.7,Unknown
14,1,"Total, all countries or areas",2015,Unemployment rate - Total,6.0,Unknown
19,1,"Total, all countries or areas",2023,Labour force participation - Total,59.7,Unknown
20,1,"Total, all countries or areas",2023,Unemployment rate - Total,5.8,Unknown
25,2,Africa,2005,Labour force participation - Total,64.4,Unknown
26,2,Africa,2005,Unemployment rate - Total,6.6,Unknown


array(['001', '002', '015', '202', '014', '017', '018', '011', '019',
       '021', '419', '029', '013', '005', '143', '030', '035', '034',
       '145', '134', '151', '154', '039', '155', '009', '830', '412',
       '530', '097'], dtype=object)

In [10]:
# For each unresolved code, collect the 'participation_area' labels the
# source data attaches to it.
grouped = (
    unknown_countries_df
    .groupby('country_code')['participation_area']
    .apply(list)
    .reset_index()
)

# Display the result
print(grouped)

   country_code                                 participation_area
0           001  [Total, all countries or areas, Total, all cou...
1           002  [Africa, Africa, Africa, Africa, Africa, Afric...
2           005  [South America, South America, South America, ...
3           009  [Oceania, Oceania, Oceania, Oceania, Oceania, ...
4           011  [Western Africa, Western Africa, Western Afric...
5           013  [Central America, Central America, Central Ame...
6           014  [Eastern Africa, Eastern Africa, Eastern Afric...
7           015  [Northern Africa, Northern Africa, Northern Af...
8           017  [Middle Africa, Middle Africa, Middle Africa, ...
9           018  [Southern Africa, Southern Africa, Southern Af...
10          019  [Americas, Americas, Americas, Americas, Ameri...
11          021  [Northern America, Northern America, Northern ...
12          029  [Caribbean, Caribbean, Caribbean, Caribbean, C...
13          030  [Eastern Asia, Eastern Asia, Eastern Asia, Ea

In [11]:
# Manual names for the zero-padded codes that resolved to "Unknown" above
# (world/regional aggregates plus a few areas), keyed by code and sorted by
# code.
# NOTE(review): presumably used to overwrite the "Unknown" entries in the
# 'area' column in a later cell -- confirm against the code that consumes it.
replace_dict = {
    "001": "Total, all countries or areas",
    "002": "Africa",
    "005": "South America",
    "009": "Oceania",
    "011": "Western Africa",
    "013": "Central America",
    "014": "Eastern Africa",
    "015": "Northern Africa",
    "017": "Middle Africa",
    "018": "Southern Africa",
    "019": "Americas",
    "021": "Northern America",
    "029": "Caribbean",
    "030": "Eastern Asia",
    "034": "Southern Asia",
    "035": "South-eastern Asia",
    "039": "Southern Europe",
    "097": "European Union (EU)",
    "134": "Caucasus",
    "143": "Central Asia",
    "145": "Western Asia",
    "151": "Eastern Europe",
    "154": "Northern Europe",
    "155": "Western Europe",
    "202": "Sub-Saharan Africa",
    "412": "Kosovo",
    "419": "Latin America & the Caribbean",
    "530": "Netherlands Antilles [former]",
    "830": "Channel Islands"
}
