### Employment

To-dos:
- Rename columns
- Drop first row
- Drop last two columns
- Check whether the 'Total' rows for labour force participation and unemployment rate already account for the male and female values.
- If so, drop the female + male rows.
- Check empty values.
- Reset index.

In [None]:
import pandas as pd
import numpy as np
import pycountry

# Read the raw labour/unemployment CSV; the file is decoded as ISO-8859-1.
employment_df = pd.read_csv(
    '../data/raw/SYB66_329_202310_Labour_unem.csv',
    encoding='iso-8859-1',
)
display(employment_df)

In [None]:
# List the column labels pandas produced when reading the CSV.
print(employment_df.columns)

### Rows and Columns
- Drop row with index 0
- Drop last two columns
- Rename columns

In [None]:
# Remove the row labelled 0 (presumably a secondary header row - confirm against
# the raw CSV) and the two trailing 'Unnamed' columns.
# errors='ignore' on both drops keeps this cell safe to re-run: once the row and
# columns are gone, a second execution is a no-op instead of raising KeyError.
employment_df = employment_df.drop(index=0, errors='ignore')
employment_df = employment_df.drop(columns=['Unnamed: 5', 'Unnamed: 6'], errors='ignore')
display(employment_df)
print(employment_df.index)

### Rename columns

In [None]:
# Replace the labels assigned on load with descriptive snake_case names.
employment_df = employment_df.rename(
    {
        'T17': 'country_code',
        'Labour force participation and unemployment': 'participation_area',
        'Unnamed: 2': 'year',
        'Unnamed: 3': 'statistic_type',
        'Unnamed: 4': 'statistic_value',
    },
    axis='columns',
)

### Remove gender-specific rows (the 'Total' row for each year already covers both genders)

In [None]:
# Remove the four gender-specific statistic types; every other row is kept.
rows_to_drop = [
    'Labour force participation - Female',
    'Labour force participation - Male',
    'Unemployment rate - Female',
    'Unemployment rate - Male',
]
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells cannot trigger pandas' SettingWithCopyWarning.
employment_df = employment_df[~employment_df['statistic_type'].isin(rows_to_drop)].copy()

### Country codes
- Find unique values
- Use `pycountry` to find the equivalent names
- Add 'area' column with the names
- Identify the codes that resolve to 'Unknown' by comparing them with 'participation_area'
- Create a replacement dictionary and apply it

In [None]:
# Print a heading, then let the cell's last expression show the distinct
# values found in the participation_area column.
print('\nUnique values in participation_area column:\n')
employment_df['participation_area'].unique()

### Parse country codes, check for the area names with `pycountry`, update the dataframe

In [None]:
def transform_country_codes(code):
    """Left-pad a country/area code with zeros to three characters.

    Parameters
    ----------
    code : str, int, float or None
        The raw code. Strings are handled exactly as before. Integers and
        whole-number floats (what pandas yields when the column is parsed
        numerically) are converted to text first.

    Returns
    -------
    str, or the input unchanged
        The code padded to three characters when it is one or two characters
        long. Empty strings and codes of three or more characters are
        returned as-is. Missing values (None / NaN) are returned untouched
        instead of raising TypeError from ``len()``.
    """
    # Missing values carry no code to pad; hand them back untouched.
    if code is None or (isinstance(code, float) and code != code):
        return code

    if not isinstance(code, str):
        # A whole-number float such as 4.0 would otherwise stringify as '4.0'.
        if isinstance(code, float) and code.is_integer():
            code = int(code)
        code = str(code)

    # Only 1- and 2-character codes are padded; '' and longer codes pass through.
    if 0 < len(code) < 3:
        return code.rjust(3, '0')
    return code

def get_country_name(country_code):
    """Resolve a country code to its pycountry name.

    The code is looked up as a numeric code first and, if that yields
    nothing, as an alpha-2 code. Returns the matched entry's ``name``, or
    the string 'Unknown' when neither lookup matches or a KeyError is raised.
    """
    try:
        # Numeric lookup takes precedence; alpha-2 is only tried as a fallback.
        match = (
            pycountry.countries.get(numeric=country_code)
            or pycountry.countries.get(alpha_2=country_code)
        )
        if match:
            return match.name
        return 'Unknown'
    except KeyError:
        return 'Unknown'

# Normalise every code to three characters, then derive the matching area name
# from the normalised code.
employment_df['country_code'] = employment_df['country_code'].map(transform_country_codes)
employment_df['area'] = employment_df['country_code'].map(get_country_name)

employment_df.head(100)

### Replace 'Unknown' values according to the dictionary

In [None]:
# Hand-maintained names for the codes listed below, keyed by the zero-padded
# three-character code.
replace_dict = {
    '001': 'World',
    '002': 'Africa',
    '005': 'South America',
    '009': 'Oceania',
    '011': 'Western Africa',
    '013': 'Central America',
    '014': 'Eastern Africa',
    '015': 'Northern Africa',
    '017': 'Middle Africa',
    '018': 'Southern Africa',
    '019': 'Americas',
    '021': 'Northern America',
    '029': 'Caribbean',
    '030': 'Eastern Asia',
    '034': 'Southern Asia',
    '035': 'South-eastern Asia',
    '039': 'Southern Europe',
    '097': 'European Union',
    '134': 'Caucasus',
    '143': 'Central Asia',
    '145': 'Western Asia',
    '151': 'Eastern Europe',
    '154': 'Northern Europe',
    '155': 'Western Europe',
    '202': 'Sub-Saharan Africa',
    '412': 'Kosovo',
    '419': 'Latin America and the Caribbean',
    '530': 'Netherlands Antilles [former]',
    '830': 'Channel Islands'
}

# Codes present in the dictionary take the dictionary name; every other row
# keeps whatever 'area' already holds.
override_names = employment_df['country_code'].map(replace_dict)
employment_df.loc[:, 'area'] = override_names.fillna(employment_df['area'])
# display(employment_df)

In [None]:
# Print the distinct values now held in 'area', e.g. to spot leftover
# 'Unknown' entries.
print('\nUnique values in area column:\n')
print(employment_df['area'].unique())

### Convert the following values in 'area' for better readability and standardization purposes

In [None]:
# Map long-form area names to shorter common names.
# Where a name can appear in two spellings (the comma-inverted form that
# pycountry returns and the UN-style form), both spellings are listed so the
# replacement works whichever source produced the value.
replace_dict_area = {
    'Bolivia, Plurinational State of': 'Bolivia',
    'Brunei Darussalam': 'Brunei',
    'Congo, The Democratic Republic of the': 'Democratic Republic of the Congo',
    "Côte d'Ivoire": 'Ivory Coast',
    'Falkland Islands (Malvinas)': 'Falkland Islands',
    'Iran (Islamic Republic of)': 'Iran',
    # pycountry's name for Iran uses the comma-inverted form; without this key
    # the pycountry-derived 'area' value for Iran was never shortened.
    'Iran, Islamic Republic of': 'Iran',
    "Korea, Democratic People's Republic of": 'North Korea',
    'Korea, Republic of': 'South Korea',
    "Lao People's Democratic Republic": 'Laos',
    'Moldova, Republic of': 'Moldova',
    'Netherlands Antilles [former]': 'Netherlands Antilles',
    'Republic of Korea': 'South Korea',
    'Republic of Moldova': 'Moldova',
    'Russian Federation': 'Russia',
    'Timor-Leste': 'East Timor',
    'Türkiye': 'Turkey',
    'Venezuela, Bolivarian Republic of': 'Venezuela'
}

# Exact-match replacement: values not listed above are left unchanged.
employment_df['area'] = employment_df['area'].replace(replace_dict_area)

In [None]:
# Strip the ' - Total' suffix from the two statistic labels listed here;
# any other label is left as it is.
replace_dict_statistic = {
    'Labour force participation - Total': 'Labour force participation',
    'Unemployment rate - Total': 'Unemployment rate',
}

employment_df['statistic_type'] = employment_df['statistic_type'].replace(
    to_replace=replace_dict_statistic
)

### Last but not least
- Drop 'participation_area' column
- Reorder columns
- Reset index

In [None]:
# Final tidy-up: drop the helper column, fix the column order, renumber the rows.
custom_column_order = ['country_code', 'area', 'year', 'statistic_type', 'statistic_value']

employment_df = (
    employment_df
    # errors='ignore' keeps the cell re-runnable once the column is already gone.
    .drop(columns=['participation_area'], errors='ignore')
    .loc[:, custom_column_order]
    # Produces a new frame with a fresh 0..n-1 index instead of mutating in place.
    .reset_index(drop=True)
)
display(employment_df)