In [2]:
import pandas as pd
import numpy as np
from functools import reduce
import difflib
from datetime import datetime
from concurrent.futures import ProcessPoolExecutor
import warnings; warnings.filterwarnings("ignore")

## Functions

In [100]:
def merge_country_observations_old(df, name1_list, name2_list):
    now = datetime.now()
    print(name1_list, name2_list)
    numeric_columns = df.select_dtypes(include=['number']).columns
    for column in numeric_columns:
        for year in df['year'].unique():
            mean_values = df[(df['country'].isin(name2_list)) & (df['year'] == year)][column].mean()
            for country in name1_list:
                df.loc[(df['country'] == country) & (df['year'] == year), column] = df.loc[(df['country'] == country) & (df['year'] == year), column].fillna(mean_values)
    df = df.drop(df[df['country'].isin(name2_list)].index)
    then = datetime.now()
    print('Elapsed time:', (then - now).total_seconds() / 60, 'minutes')
    return df

def merge_country_observations(df, name1_list, name2_list):
    now = datetime.now()
    print(name1_list, name2_list)
    numeric_columns = df.select_dtypes(include=['number']).columns
    mean_values_by_year = df[df['country'].isin(name2_list)].groupby('year')[numeric_columns].mean()
    for country in name1_list:
        for column in numeric_columns:
            df.loc[df['country'] == country, column] = df.loc[df['country'] == country].apply(
                lambda row: row[column] 
                if pd.notna(row[column]) 
                else mean_values_by_year.loc[row['year'], column] 
                if row['year'] in mean_values_by_year.index else np.nan,  # Handle missing years
                axis=1)
    df = df[~df['country'].isin(name2_list)]
    then = datetime.now()
    print('Elapsed time:', (then - now).total_seconds(), 'seconds')
    return df

    
def standardize_dataframe(df, target, exclude=None):
    exclude = exclude or []
    for col in df.columns:
        if col != target and col not in exclude and pd.api.types.is_numeric_dtype(df[col]):
            m, s = df[col].mean(), df[col].std()
            df[col] = (df[col] - m) / s if s != 0 else 0
    return df

# Load, merge and clean

## Load

### Corruption perceptions index (only 2022 tho)
ISO3

In [2]:
cpi = pd.read_excel("IQD/CPI.xlsx", header=2)
cpi.columns = (
    cpi.columns.str.strip()      
    .str.lower()             
    .str.replace(' ', '_')           
    .str.replace('[^a-z0-9_]', ''))
cpi = cpi.rename(columns={'country_/_territory':'country'})
cpi['year'] = 2022
cpi

Unnamed: 0,country,iso3,region,cpi_score_2022,rank,standard_error,number_of_sources,lower_ci,upper_ci,african_development_bank_cpia,...,freedom_house_nations_in_transit,global_insights_country_risk_ratings,imd_world_competitiveness_yearbook,perc_asia_risk_guide,prs_international_country_risk_guide,varieties_of_democracy_project,world_bank_cpia,world_economic_forum_eos,world_justice_project_rule_of_law_index,year
0,Afghanistan,AFG,AP,24,150,6.299999,4,13.66800,34.33200,,...,,10,,,,43.0,,,16.0,2022
1,Albania,ALB,ECA,36,101,1.316355,8,33.84118,38.15882,,...,42.0,35,,,32.0,27.0,,42.0,29.0,2022
2,Algeria,DZA,MENA,33,116,1.673809,6,30.25495,35.74504,,...,,35,,,24.0,29.0,,,36.0,2022
3,Angola,AGO,SSA,33,116,2.807703,7,28.39537,37.60463,,...,,47,,,32.0,46.0,,22.0,28.0,2022
4,Argentina,ARG,AME,38,94,2.334808,7,34.17091,41.82909,,...,,35,24.0,,32.0,47.0,,,44.0,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,Venezuela,VEN,AME,14,177,1.296945,8,11.87301,16.12699,,...,,10,24.0,,15.0,9.0,,13.0,9.0,2022
176,Vietnam,VNM,AP,42,77,1.497007,8,39.54491,44.45509,,...,,47,,39.0,41.0,44.0,,55.0,36.0,2022
177,Yemen,YEM,MENA,16,176,1.975074,7,12.76088,19.23912,,...,,10,,,15.0,11.0,10.0,30.0,,2022
178,Zambia,ZMB,SSA,33,116,1.495884,9,30.54675,35.45325,34.0,...,,35,,,32.0,49.0,27.0,30.0,36.0,2022


### Economic freedom in the world
ISO2, ISO3

In [3]:
efw = pd.read_excel("IQD/EFW.xlsx", header=4, usecols=lambda col: col != 'Unnamed: 0')
efw.columns = (
    efw.columns.str.strip()      
    .str.lower()             
    .str.replace(' ', '_')           
    .str.replace('[^a-z0-9_]', ''))
efw = efw.rename(columns={'countries':'country'})
efw

Unnamed: 0,year,iso_code_2,iso_code_3,country,economic_freedom_summary_index,rank,quartile,1a_government_consumption,data,1b__transfers_and_subsidies,...,5civ_tax_compliance,5c__business_regulations,5di__market_openness,5dii_business_permits,5diii_distorton_of_the_business_environment,5d_freedom_to_enter_markets_and_compete,5__regulation,area_5_rank,world_bank_region,"world_bank_current_income_classification,_1990-present"
0,2021,AL,ALB,Albania,7.60,31.0,1.0,7.758824,13.620000,7.133515,...,3.587625,5.312221,6.810619,5.621940,6.250000,6.227520,6.972552,51.0,Europe & Central Asia,UM
1,2021,DZ,DZA,Algeria,4.82,157.0,4.0,3.467647,28.210000,7.817129,...,4.764764,3.735188,4.412430,8.771111,3.134281,5.439274,4.836754,150.0,Middle East & North Africa,LM
2,2021,AO,AGO,Angola,5.38,149.0,4.0,7.179412,15.590000,9.702997,...,4.641462,4.285256,3.099164,7.916416,1.250000,4.088527,4.730618,153.0,Sub-Saharan Africa,LM
3,2021,AR,ARG,Argentina,4.77,158.0,4.0,5.691176,20.650000,5.536785,...,4.504147,4.862998,6.144822,5.726521,2.922359,4.931234,5.299677,143.0,Latin America & the Caribbean,UM
4,2021,AM,ARM,Armenia,7.58,33.0,1.0,6.473529,17.990000,7.316076,...,6.020369,5.556860,5.590883,9.302574,5.000000,6.631152,6.939219,52.0,Europe & Central Asia,UM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4615,1970,VE,VEN,"Venezuela, RB",7.29,14.0,1.0,6.602003,17.553191,9.827430,...,,6.335000,,,,,6.545577,35.0,Latin America & the Caribbean,
4616,1970,VN,VNM,Vietnam,,,,,,,...,,2.550000,,,,,,,East Asia & Pacific,
4617,1970,YE,YEM,"Yemen, Rep.",,,,,,,...,,5.017500,,,,,,,Middle East & North Africa,
4618,1970,ZM,ZMB,Zambia,5.26,61.0,3.0,3.448131,28.276353,9.105430,...,,6.067500,,,,,6.697414,32.0,Sub-Saharan Africa,


### Freedom in the world

In [4]:
# Load
fiw1 = pd.read_excel("IQD/FIW.xlsx", sheet_name='FIW06-23').rename(columns={'country/territory':'country'})
fiw1.columns = (
    fiw1.columns.str.strip()      
    .str.lower()             
    .str.replace(' ', '_')           
    .str.replace('[^a-z0-9_]', ''))
fiw1 = fiw1.rename(columns={'country/territory':'country'})

fiw2 = pd.read_excel("IQD/FIW.xlsx", sheet_name='FIW03-05')
fiw2.columns = (
    fiw2.columns.str.strip()      
    .str.lower()             
    .str.replace(' ', '_')           
    .str.replace('[^a-z0-9_]', ''))
fiw2 = fiw2.rename(columns={'country/territory':'country'})

In [5]:
# Adjust fiw2 format
years = ['03', '04', '05']
rows = []
for index, row in fiw2.iterrows():
    for year in years:
        # Create a new row for each year
        new_row = {
            'country': row['country'],
            'region': np.nan,  
            'c/t?': row['c/t?'],
            'edition': int('20' + year),
            'status': np.nan,  
            'pr_rating': np.nan,
            'cl_rating': np.nan,
            'a': np.nan,  
            'b': np.nan,  
            'c': np.nan, 
            'add_q': np.nan,  
            'add_a': np.nan,  
            'pr': row[f'fiw{year}_pr'],
            'd': np.nan,  
            'e': np.nan,  
            'f': np.nan,  
            'g': np.nan,  
            'cl': row[f'fiw{year}_cl'],
            'total': row[f'fiw{year}_total'],}
        rows.append(new_row)
fiw2 = pd.DataFrame(rows)

In [6]:
# concat
fiw = pd.concat([fiw1, fiw2], ignore_index=True)
fiw = fiw.rename(columns={'edition':'year'})
fiw = fiw.sort_values(by=['country', 'year']).reset_index(drop=True)
fiw

Unnamed: 0,country,region,c/t?,year,status,pr_rating,cl_rating,a,b,c,add_q,add_a,pr,d,e,f,g,cl,total
0,Abkhazia,,t,2003,,,,,,,,,9,,,,,20,29
1,Abkhazia,,t,2004,,,,,,,,,9,,,,,21,30
2,Abkhazia,,t,2005,,,,,,,,,9,,,,,21,30
3,Abkhazia,Eurasia,t,2006,PF,5.0,5.0,5.0,5.0,3.0,0.0,0.0,13,8.0,4.0,4.0,5.0,21,34
4,Abkhazia,Eurasia,t,2007,PF,5.0,5.0,5.0,5.0,3.0,0.0,0.0,13,8.0,4.0,4.0,5.0,21,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4375,Zimbabwe,Africa,c,2019,PF,5.0,5.0,3.0,6.0,3.0,0.0,,12,8.0,4.0,2.0,5.0,19,31
4376,Zimbabwe,Africa,c,2020,PF,5.0,5.0,3.0,6.0,3.0,0.0,,12,7.0,3.0,2.0,5.0,17,29
4377,Zimbabwe,Africa,c,2021,NF,6.0,5.0,3.0,5.0,3.0,0.0,,11,7.0,3.0,2.0,5.0,17,28
4378,Zimbabwe,Africa,c,2022,NF,6.0,5.0,3.0,5.0,3.0,0.0,,11,7.0,3.0,2.0,5.0,17,28


### Index of economic freedom
ISO Code

In [7]:
ief = pd.read_csv("IQD/IEF.csv", usecols=lambda col: col != 'Id')
ief.columns = (
    ief.columns.str.strip()      
    .str.lower()             
    .str.replace(' ', '_')           
    .str.replace('[^a-z0-9_]', ''))
ief = ief.rename(columns={'name':'country',
                          'index_year':'year'})
ief = ief.sort_values(by=['country', 'year']).reset_index(drop=True)
ief

Unnamed: 0,country,short_name,iso_code,year,overall_score,property_rights,judicial_effectiveness,government_integrity,tax_burden,government_spending,fiscal_health,business_freedom,labor_freedom,monetary_freedom,trade_freedom,investment_freedom,financial_freedom
0,Afghanistan,Afghanistan,AF,1995,,,,,,,,,,,,,
1,Afghanistan,Afghanistan,AF,1996,,,,,,,,,,,,,
2,Afghanistan,Afghanistan,AF,1997,,,,,,,,,,,,,
3,Afghanistan,Afghanistan,AF,1998,,,,,,,,,,,,,
4,Afghanistan,Afghanistan,AF,1999,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5334,Zimbabwe,Zimbabwe,ZW,2019,40.4,29.7,24.8,15.8,62.3,74.5,23.7,33.4,43.3,72.4,70.0,25.0,10.0
5335,Zimbabwe,Zimbabwe,ZW,2020,43.1,33.7,28.3,18.9,62.3,88.0,36.2,39.1,43.1,62.8,70.0,25.0,10.0
5336,Zimbabwe,Zimbabwe,ZW,2021,39.5,32.6,32.0,21.3,63.4,89.4,58.8,41.5,44.1,0.0,56.0,25.0,10.0
5337,Zimbabwe,Zimbabwe,ZW,2022,33.1,20.8,16.3,20.6,66.1,87.2,79.8,36.9,34.9,0.1,0.1,25.0,10.0


### Polity 5D

In [8]:
p5d = pd.read_excel("IQD/P5D.xls")
p5d = p5d[p5d.year>=1960].reset_index(drop=True)
p5d

Unnamed: 0,p5,cyear,ccode,scode,country,year,flag,fragment,democ,autoc,...,interim,bmonth,bday,byear,bprec,post,change,d5,sf,regtrans
0,0,7001960,700,AFG,Afghanistan,1960,0,,0,10,...,,,,,,,,,,
1,0,7001961,700,AFG,Afghanistan,1961,0,,0,10,...,,,,,,,,,,
2,0,7001962,700,AFG,Afghanistan,1962,0,,0,10,...,,,,,,,,,,
3,0,7001963,700,AFG,Afghanistan,1963,0,,0,10,...,,,,,,,,,,
4,0,7001964,700,AFG,Afghanistan,1964,0,,0,7,...,,9.0,10.0,1964.0,1.0,-7.0,3.0,1.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8795,1,5522014,552,ZIM,Zimbabwe,2014,0,0.0,5,1,...,,,,,,,,,,
8796,1,5522015,552,ZIM,Zimbabwe,2015,0,0.0,5,1,...,,,,,,,,,,
8797,1,5522016,552,ZIM,Zimbabwe,2016,0,0.0,5,1,...,,,,,,,,,,
8798,1,5522017,552,ZIM,Zimbabwe,2017,0,0.0,5,1,...,,,,,,,,,,


### Political terror scale

In [9]:
pts = pd.read_excel("IQD/PTS.xlsx")
pts.columns = (
    pts.columns.str.strip()      
    .str.lower()             
    .str.replace(' ', '_')           
    .str.replace('[^a-z0-9_]', ''))
pts

Unnamed: 0,country,country_old,year,cow_code_a,cow_code_n,wordbank_code_a,un_code_n,region,pts_a,pts_h,pts_s,na_status_a,na_status_h,na_status_s
0,Afghanistan,Afghanistan,1976,AFG,700.0,AFG,4.0,sa,2.0,,2.0,0,88,0
1,Afghanistan,Afghanistan,1977,AFG,700.0,AFG,4.0,sa,4.0,,2.0,0,88,0
2,Afghanistan,Afghanistan,1978,AFG,700.0,AFG,4.0,sa,4.0,,3.0,0,88,0
3,Afghanistan,Afghanistan,1979,AFG,700.0,AFG,4.0,sa,5.0,,5.0,0,88,0
4,Afghanistan,Afghanistan,1980,AFG,700.0,AFG,4.0,sa,5.0,,5.0,0,88,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10002,Zimbabwe,Zimbabwe,2018,ZIM,552.0,ZWE,716.0,ssa,,3.0,3.0,88,0,0
10003,Zimbabwe,Zimbabwe,2019,ZIM,552.0,ZWE,716.0,ssa,3.0,4.0,3.0,0,0,0
10004,Zimbabwe,Zimbabwe,2020,ZIM,552.0,ZWE,716.0,ssa,3.0,4.0,3.0,0,0,0
10005,Zimbabwe,Zimbabwe,2021,ZIM,552.0,ZWE,716.0,ssa,3.0,3.0,3.0,0,0,0


### World governance indicators

In [10]:
def prep_wgi(sheet_name):
    # Load
    df = pd.read_excel("IQD/WGI.xlsx", sheet_name=sheet_name, header=[13, 14])
    
    # Prep
    df.columns = [' '.join(map(str, col)).strip() for col in df.columns.values]
    df = df.rename(columns={'Unnamed: 0_level_0 Country/Territory':'country',
                            'Unnamed: 1_level_0 Code':'code'})
    df_long = pd.melt(df, id_vars=['country', 'code'], var_name='Year_Metric', value_name='Value')
    df_long['Year'] = df_long['Year_Metric'].str.extract('(\d+)')[0]  # Extract year part
    df_long['Metric'] = df_long['Year_Metric'].str.split(' ').str[1]  # Extract metric name
    df_long = df_long.drop(columns=['Year_Metric'])
    df_long = df_long[['country', 'code', 'Year', 'Metric', 'Value']]
    
    # Pivot
    df_wide = df_long.pivot_table(
        index=['country', 'code', 'Year'],  # Columns to keep, not to spread
        columns='Metric',  # Column to spread
        values='Value',  # Fill values
        aggfunc='first'  # Function to aggregate values, 'first' because each combination is unique
    )
    df_wide.reset_index(inplace=True)
    df_wide.columns.name = None
    df_wide.rename(columns={col: f"{sheet_name}_{col}" for col in df_wide.columns if col not in ['country', 'code', 'Year']}, inplace=True)
    return df_wide

In [11]:
# Load
wgi1 = prep_wgi(sheet_name="VoiceandAccountability")
wgi2 = prep_wgi(sheet_name="Political StabilityNoViolence")
wgi3 = prep_wgi(sheet_name="GovernmentEffectiveness")
wgi4 = prep_wgi(sheet_name="RegulatoryQuality")
wgi5 = prep_wgi(sheet_name="RuleofLaw")
wgi6 = prep_wgi(sheet_name="ControlofCorruption")

# Merge
dataframes = [wgi1, wgi2, wgi3, wgi4, wgi5, wgi6]
wgi = reduce(lambda left, right: pd.merge(left, right, on=['country', 'code', 'Year'], how='left'), dataframes)

In [12]:
# Final
wgi.columns = (
    wgi.columns.str.strip()      
    .str.lower()             
    .str.replace(' ', '_')           
    .str.replace('[^a-z0-9_]', ''))
wgi = wgi.rename(columns={'Year':'year'})
wgi = wgi.sort_values(by=['country', 'year']).reset_index(drop=True)
wgi

Unnamed: 0,country,code,year,voiceandaccountability_estimate,voiceandaccountability_lower,voiceandaccountability_numsrc,voiceandaccountability_rank,voiceandaccountability_stderr,voiceandaccountability_upper,political_stabilitynoviolence_estimate,...,ruleoflaw_numsrc,ruleoflaw_rank,ruleoflaw_stderr,ruleoflaw_upper,controlofcorruption_estimate,controlofcorruption_lower,controlofcorruption_numsrc,controlofcorruption_rank,controlofcorruption_stderr,controlofcorruption_upper
0,Afghanistan,AFG,1996,-1.908540,0.000000,4.0,1.000000,0.261457,9.500000,-2.417310,...,3.0,1.507538,0.350509,12.562814,-1.291705,0.000000,2.0,4.301075,0.340507,27.419355
1,Afghanistan,AFG,1998,-2.039301,0.000000,4.0,0.497512,0.256090,4.975124,-2.427355,...,3.0,2.000000,0.327277,11.500000,-1.176012,0.000000,2.0,8.021390,0.324013,33.689838
2,Afghanistan,AFG,2000,-2.031417,0.000000,4.0,0.995025,0.254043,5.472637,-2.438969,...,3.0,1.492537,0.291758,9.452736,-1.271724,0.000000,2.0,4.787234,0.346906,30.851065
3,Afghanistan,AFG,2002,-1.433421,2.985075,6.0,9.452736,0.189453,14.925373,-2.035034,...,4.0,1.990050,0.296793,11.442786,-1.251137,0.000000,2.0,4.761905,0.352838,32.804234
4,Afghanistan,AFG,2003,-1.177571,6.965174,5.0,14.427860,0.207045,24.378109,-2.198372,...,4.0,2.985075,0.300880,16.417910,-1.344180,0.000000,3.0,4.761905,0.270215,19.047619
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4970,Zimbabwe,ZWE,2018,-1.136798,12.135922,15.0,16.990292,0.123724,21.844660,-0.721038,...,15.0,8.095238,0.138192,14.285714,-1.246001,6.190476,15.0,10.000000,0.125007,13.809524
4971,Zimbabwe,ZWE,2019,-1.163669,12.077294,14.0,16.425121,0.117815,21.256039,-0.943286,...,15.0,7.142857,0.140278,13.809524,-1.271190,4.761905,15.0,10.000000,0.133336,14.285714
4972,Zimbabwe,ZWE,2020,-1.113408,13.043478,13.0,17.874395,0.120230,23.188406,-1.052728,...,13.0,7.619048,0.139408,12.857142,-1.287992,4.285714,13.0,10.000000,0.140076,16.190475
4973,Zimbabwe,ZWE,2021,-1.135830,12.560387,13.0,18.357489,0.120300,22.705315,-0.954426,...,13.0,8.571428,0.146504,15.714286,-1.253550,4.285714,12.0,10.000000,0.151591,17.619047


### World bank data

In [13]:
# Load
wb = pd.read_csv("nasdaq/WB.csv")

# Pre-process
wb = wb[wb.year.notnull()]
wb['year'] = wb['year'].astype(int)
wb = wb.rename(columns={"series_id": "indicator"})
wb['country_name'] = wb['country_name'].replace('Czechia', 'Czech Republic')
wb['country_name'] = wb['country_name'].replace("Korea, Dem. Peopleâ\x80\x99s Rep.", "Korea, Dem. People's Rep.")

# Pivot
wb = wb.pivot_table(index=['country_code', 'country_name', 'year'], columns='indicator', values='value').reset_index()

# Other fixes
wb.columns.name = None
wb_prep = wb.rename(columns={'country_name':'country'})

# Drop cols with more than 80 percent NaNs
threshold = 0.2 * len(wb_prep)
wb = wb_prep.dropna(axis=1, thresh=threshold)
print("We have dropped", len(wb_prep.columns)-len(wb.columns),"columns with more than 80 percent NaN observations -", len(wb.columns), "remain.")

We have dropped 529 columns with more than 80 percent NaN observations - 257 remain.


## Merge

### CPI + EFW

In [14]:
set(cpi.columns).intersection(efw.columns)

{'country', 'rank', 'year'}

In [15]:
cpi = cpi.rename(columns={'rank':'cpi_rank'})
efw = efw.rename(columns={'rank':'efw_rank'})

In [16]:
country_mapping = {
    'Bahamas': 'Bahamas, The',
    'Cote d\'Ivoire': "Côte d'Ivoire",
    'Guinea Bissau': 'Guinea-Bissau',
    'Korea, North': 'North Korea',
    'Korea, South': 'South Korea',
    'United States of America': 'United States',
    'Russia': 'Russian Federation',
    'Sao Tome and Principe': 'São Tomé and Príncipe',
    'Gambia': 'Gambia, The',
    'Hong Kong': 'Hong Kong SAR, China',
    'Iran': 'Iran, Islamic Rep.',
    'Egypt': 'Egypt, Arab Rep.',
    'Laos': 'Lao PDR',
    'Venezuela': 'Venezuela, RB',
    'Syria': 'Syrian Arab Republic',
    'Turkey': 'Türkiye',
    'Yemen': 'Yemen, Rep.',
    'Congo': 'Congo, Rep.',
    'Democratic Republic of the Congo': 'Congo, Dem. Rep.',
    'Korea, Rep.': 'South Korea',
    'Kyrgyz Republic': 'Kyrgyzstan',
    'Slovak Republic': 'Slovakia'
}

cpi['country'] = cpi['country'].replace(country_mapping)
efw['country'] = efw['country'].replace(country_mapping)

df1 = pd.merge(efw, cpi, on=['country', 'year'], how='outer')

def check_countries(column, min_length=4):
    similar_pairs = []
    for i in range(len(column)):
        for j in range(i + 1, len(column)):
            if column[i] != column[j]:  # Ensure that identical countries are excluded
                seq = difflib.SequenceMatcher(None, column[i], column[j])
                match = seq.find_longest_match(0, len(column[i]), 0, len(column[j]))
                if match.size >= min_length:
                    similar_pairs.append((column[i], column[j], column[i][match.a: match.a + match.size]))
    return similar_pairs

check_countries(df1['country'].unique())

[('Albania', 'Lithuania', 'ania'),
 ('Albania', 'Mauritania', 'ania'),
 ('Albania', 'Romania', 'ania'),
 ('Albania', 'Tanzania', 'ania'),
 ('Algeria', 'Liberia', 'eria'),
 ('Algeria', 'Nigeria', 'geria'),
 ('Angola', 'Mongolia', 'ngol'),
 ('Armenia', 'Slovenia', 'enia'),
 ('Armenia', 'Turkmenistan', 'meni'),
 ('Australia', 'Austria', 'Austr'),
 ('Australia', 'Central African Republic', 'tral'),
 ('Australia', 'Somalia', 'alia'),
 ('Bahamas, The', 'Gambia, The', ', The'),
 ('Bahrain', 'Ukraine', 'rain'),
 ('Belarus', 'Brunei Darussalam', 'arus'),
 ('Bosnia and Herzegovina', 'Trinidad and Tobago', ' and '),
 ('Bosnia and Herzegovina', 'Saint Vincent and the Grenadines', ' and '),
 ('Bosnia and Herzegovina', 'São Tomé and Príncipe', ' and '),
 ('Canada', 'Grenada', 'nada'),
 ('Central African Republic', 'Congo, Dem. Rep.', ' Rep'),
 ('Central African Republic', 'Congo, Rep.', ' Rep'),
 ('Central African Republic', 'Dominican Republic', 'ican Republic'),
 ('Central African Republic', 'Egyp

In [17]:
df1[df1.select_dtypes(exclude=['number']).columns.tolist()].head(1)

Unnamed: 0,iso_code_2,iso_code_3,country,data.3,data.4,5bv_cost_of_worker_dismissal,world_bank_region,"world_bank_current_income_classification,_1990-present",iso3,region
0,AL,ALB,Albania,23,34,6.299741,Europe & Central Asia,UM,,


In [18]:
df1.select_dtypes(exclude=['number']).columns.tolist()

['iso_code_2',
 'iso_code_3',
 'country',
 'data.3',
 'data.4',
 '5bv_cost_of_worker_dismissal',
 'world_bank_region',
 'world_bank_current_income_classification,_1990-present',
 'iso3',
 'region']

In [19]:
columns_to_convert = ['data.3', 'data.4', '5bv_cost_of_worker_dismissal']
df1[columns_to_convert] = df1[columns_to_convert].apply(pd.to_numeric, errors='coerce')

In [20]:
df1 = df1.drop(columns=['iso_code_2',
                        'iso_code_3',
                        # 'world_bank_region',
                        # 'region',
                        'world_bank_current_income_classification,_1990-present',
                        'iso3'])

### + FIW

In [21]:
fiw = fiw.drop(columns='region')
set(df1.columns).intersection(fiw.columns)

{'country', 'year'}

In [22]:
country_mapping = {
    'Bahamas, The': 'Bahamas',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Guinea-Bissau': 'Guinea Bissau',
    'South Korea': 'Korea, Rep.',
    'North Korea': 'Korea, Dem. Rep.',
    'United States': 'United States of America',
    'Russia': 'Russian Federation',
    'Sao Tome and Principe': 'São Tomé and Príncipe',
    'The Gambia': 'Gambia, The',
    'Hong Kong SAR, China': 'Hong Kong',
    'Iran, Islamic Rep.': 'Iran',
    'Egypt, Arab Rep.': 'Egypt',
    'Lao PDR': 'Laos',
    'Venezuela, RB': 'Venezuela',
    'Syrian Arab Republic': 'Syria',
    'Türkiye': 'Turkey',
    'Yemen, Rep.': 'Yemen',
    'Congo, Rep.': 'Congo (Brazzaville)',
    'Congo, Dem. Rep.': 'Congo (Kinshasa)',
    'Brunei Darussalam': 'Brunei',
    'Northern Cyprus': 'Cyprus',
    'Czechia': 'Czech Republic',
    'Saint Vincent and the Grenadines': 'St. Vincent and the Grenadines',
    'Saint Lucia': 'St. Lucia'
}

df1['country'] = df1['country'].replace(country_mapping)
fiw['country'] = fiw['country'].replace(country_mapping)

df2 = pd.merge(df1, fiw, on=['country', 'year'], how='outer')

def check_countries(column, min_length=5):
    similar_pairs = []
    for i in range(len(column)):
        for j in range(i + 1, len(column)):
            if column[i] != column[j]:  # Ensure that identical countries are excluded
                seq = difflib.SequenceMatcher(None, column[i], column[j])
                match = seq.find_longest_match(0, len(column[i]), 0, len(column[j]))
                if match.size >= min_length:
                    similar_pairs.append((column[i], column[j], column[i][match.a: match.a + match.size]))
    return similar_pairs

check_countries(df2['country'].unique())

[('Algeria', 'Nigeria', 'geria'),
 ('Australia', 'Austria', 'Austr'),
 ('Austria', 'Transnistria', 'stria'),
 ('Bosnia and Herzegovina', 'Trinidad and Tobago', ' and '),
 ('Bosnia and Herzegovina', 'St. Vincent and the Grenadines', ' and '),
 ('Bosnia and Herzegovina', 'São Tomé and Príncipe', ' and '),
 ('Bosnia and Herzegovina', 'Antigua and Barbuda', 'a and '),
 ('Bosnia and Herzegovina', 'Serbia and Montenegro', 'ia and '),
 ('Bosnia and Herzegovina', 'St. Kitts and Nevis', ' and '),
 ('Central African Republic', 'Czech Republic', ' Republic'),
 ('Central African Republic', 'Dominican Republic', 'ican Republic'),
 ('Central African Republic', 'South Africa', ' Africa'),
 ('Congo (Kinshasa)', 'Congo (Brazzaville)', 'Congo ('),
 ("Cote d'Ivoire", 'Cote d’Ivoire', 'Cote d'),
 ('Czech Republic', 'Dominican Republic', ' Republic'),
 ('Dominican Republic', 'Dominica', 'Dominica'),
 ('Gambia, The', 'Zambia', 'ambia'),
 ('Guinea', 'Guinea Bissau', 'Guinea'),
 ('Guinea', 'Papua New Guinea',

In [23]:
df2[df2.select_dtypes(exclude=['number']).columns.tolist()].head(1)

Unnamed: 0,country,world_bank_region,region,c/t?,status,pr,cl,total
0,Albania,Europe & Central Asia,,c,PF,27,39,66


In [24]:
columns_to_convert = ['pr', 'cl', 'total']
df2[columns_to_convert] = df2[columns_to_convert].apply(pd.to_numeric, errors='coerce')

### + IEF

In [25]:
set(df2.columns).intersection(ief.columns)

{'country', 'year'}

In [26]:
ief['country'] = ief['country'].str.strip()

country_mapping = {
    'Bahamas': 'The Bahamas',
    'Cabo Verde': 'Cape Verde',
    'Cote d\'Ivoire': "Côte d'Ivoire",
    "Côte d’Ivoire": "Côte d'Ivoire",
    'Guinea Bissau': 'Guinea-Bissau',
    'Korea, Rep.': 'South Korea',
    'Korea, Dem. Rep.': 'North Korea',
    'United States of America': 'United States',
    'Russia': 'Russian Federation',
    'São Tomé and Príncipe': 'São Tomé and Príncipe',
    'The Gambia': 'Gambia, The',
    'Republic of Congo': 'Congo (Brazzaville)',
    'Democratic Republic of Congo': 'Congo (Kinshasa)',
    'Swaziland': 'Eswatini',
    'Macedonia': 'North Macedonia',
    'Burma': 'Myanmar',
    'Bangladesh ': 'Bangladesh',
    'Brunei': 'Brunei Darussalam',
    'Congo (Brazzaville)': 'Republic of Congo',
    'Republic of Congo ': 'Republic of Congo',
    'Costa Rica ': 'Costa Rica',
    "Côte d'Ivoire": 'Cote d’Ivoire',
    "Côte d'Ivoire ": 'Cote d’Ivoire',
    'El Salvador ': 'El Salvador',
    'Guatemala ': 'Guatemala',
    'Kyrgyzstan': 'Kyrgyz Republic',
    'Netherlands': 'The Netherlands',
    'Philippines': 'The Philippines',
    'Slovakia': 'Slovak Republic',
    'St. Lucia': 'Saint Lucia',
    'St. Vincent and the Grenadines': 'Saint Vincent and the Grenadines',
    'Israeli Occupied Territories': 'Israeli-Occupied Territories'
}

df2['country'] = df2['country'].replace(country_mapping)
ief['country'] = ief['country'].replace(country_mapping)

df3 = pd.merge(df2, ief, on=['country', 'year'], how='outer')

def check_countries(column, min_length=5):
    similar_pairs = []
    for i in range(len(column)):
        for j in range(i + 1, len(column)):
            if column[i] != column[j]:  # Ensure that identical countries are excluded
                seq = difflib.SequenceMatcher(None, column[i], column[j])
                match = seq.find_longest_match(0, len(column[i]), 0, len(column[j]))
                if match.size >= min_length:
                    similar_pairs.append((column[i], column[j], column[i][match.a: match.a + match.size]))
    return similar_pairs

check_countries(df3['country'].unique())

[('Algeria', 'Nigeria', 'geria'),
 ('Australia', 'Austria', 'Austr'),
 ('Austria', 'Transnistria', 'stria'),
 ('Bosnia and Herzegovina', 'Trinidad and Tobago', ' and '),
 ('Bosnia and Herzegovina', 'Saint Vincent and the Grenadines', ' and '),
 ('Bosnia and Herzegovina', 'São Tomé and Príncipe', ' and '),
 ('Bosnia and Herzegovina', 'Antigua and Barbuda', 'a and '),
 ('Bosnia and Herzegovina', 'Serbia and Montenegro', 'ia and '),
 ('Bosnia and Herzegovina', 'St. Kitts and Nevis', ' and '),
 ('Central African Republic', 'Republic of Congo', 'Republic'),
 ('Central African Republic', 'Czech Republic', ' Republic'),
 ('Central African Republic', 'Dominican Republic', 'ican Republic'),
 ('Central African Republic', 'Kyrgyz Republic', ' Republic'),
 ('Central African Republic', 'Slovak Republic', ' Republic'),
 ('Central African Republic', 'South Africa', ' Africa'),
 ('Congo (Kinshasa)', 'Republic of Congo', 'Congo'),
 ('Congo (Kinshasa)', 'Congo (Brazzaville)', 'Congo ('),
 ('Republic of 

In [27]:
df3[df3.select_dtypes(exclude=['number']).columns.tolist()].head(1)

Unnamed: 0,country,world_bank_region,region,c/t?,status,short_name,iso_code
0,Albania,Europe & Central Asia,,c,PF,Albania,AL


In [28]:
df3 = df3.drop(columns=['iso_code', 'short_name'])

### + P5D

In [31]:
set(df3.columns).intersection(p5d.columns)

{'country', 'year'}

In [32]:
country_mapping = {
    'The Bahamas': 'Bahamas',
    'Cote d’Ivoire': "Cote D'Ivoire",
    "Côte d’Ivoire": "Cote D'Ivoire",
    'Republic of Congo': 'Congo Brazzaville',
    'Congo (Kinshasa)': 'Congo Kinshasa',
    'Kyrgyz Republic': 'Kyrgyzstan',
    'South Korea': 'Korea South',
    'North Korea': 'Korea North',
    'The Netherlands': 'Netherlands',
    'The Philippines': 'Philippines',
    'São Tomé and Príncipe': 'Sao Tome and Principe',
    'Timor-Leste': 'Timor Leste',
    'Eswatini': 'Swaziland',
    'USSR': 'Russia',
    'Myanmar': 'Myanmar (Burma)',
    'Vietnam North': 'Vietnam',
    'Vietnam South': 'Vietnam',
    'South Vietnam': 'Vietnam',
    'Yemen North': 'Yemen',
    'Sudan-North': 'Sudan',
    'Bosnia': 'Bosnia and Herzegovina',
    'Congo-Brazzaville': 'Congo Brazzaville',
    'Macedonia': 'North Macedonia',
    'Germany West': 'Germany',
    'Gambia, The': 'Gambia',
    'Yemen South': 'Yemen',
    'Russian Federation': 'Russia'
}

df3['country'] = df3['country'].replace(country_mapping)
p5d['country'] = p5d['country'].replace(country_mapping)

df4 = pd.merge(df3, p5d, on=['country', 'year'], how='outer')

def check_countries(column, min_length=5):
    similar_pairs = []
    for i in range(len(column)):
        for j in range(i + 1, len(column)):
            if column[i] != column[j]:  # Ensure that identical countries are excluded
                seq = difflib.SequenceMatcher(None, column[i], column[j])
                match = seq.find_longest_match(0, len(column[i]), 0, len(column[j]))
                if match.size >= min_length:
                    similar_pairs.append((column[i], column[j], column[i][match.a: match.a + match.size]))
    return similar_pairs

check_countries(df4['country'].unique())

[('Algeria', 'Nigeria', 'geria'),
 ('Australia', 'Austria', 'Austr'),
 ('Austria', 'Transnistria', 'stria'),
 ('Bosnia and Herzegovina', 'Trinidad and Tobago', ' and '),
 ('Bosnia and Herzegovina', 'Saint Vincent and the Grenadines', ' and '),
 ('Bosnia and Herzegovina', 'Sao Tome and Principe', ' and '),
 ('Bosnia and Herzegovina', 'Antigua and Barbuda', 'a and '),
 ('Bosnia and Herzegovina', 'Serbia and Montenegro', 'ia and '),
 ('Bosnia and Herzegovina', 'St. Kitts and Nevis', ' and '),
 ('Central African Republic', 'Czech Republic', ' Republic'),
 ('Central African Republic', 'Dominican Republic', 'ican Republic'),
 ('Central African Republic', 'Slovak Republic', ' Republic'),
 ('Central African Republic', 'South Africa', ' Africa'),
 ('Congo Kinshasa', 'Congo Brazzaville', 'Congo '),
 ('Congo Kinshasa', 'Congo (Brazzaville)', 'Congo '),
 ('Congo Brazzaville', 'Congo (Brazzaville)', 'Brazzaville'),
 ("Côte d'Ivoire", "Cote D'Ivoire", "'Ivoire"),
 ('Czech Republic', 'Dominican Repub

In [37]:
df4[df4.select_dtypes(exclude=['number']).columns.tolist()].head(1)

Unnamed: 0,country,world_bank_region,region,c/t?,status,scode
0,Albania,Europe & Central Asia,,c,PF,


In [38]:
df4 = df4.drop(columns='scode')

### + PTS

In [39]:
set(df4.columns).intersection(pts.columns)

{'country', 'region', 'year'}

In [40]:
pts = pts.drop(columns='region')

In [45]:
country_mapping = {
    'Bolivia, Plurinational State of': 'Bolivia',
    "Cote d'Ivoire": "Cote D'Ivoire",
    'Republic of Congo': 'Congo Brazzaville',
    'Congo, the Democratic Republic of the': 'Congo Kinshasa',
    'Iran, Islamic Republic of': 'Iran',
    "Korea, Democratic People's Republic of": 'Korea North',
    'Korea, Republic of': 'Korea South',
    "Lao People's Democratic Republic": 'Laos',
    'Micronesia, Federated States of': 'Micronesia',
    'Moldova, Republic of': 'Moldova',
    'Myanmar': 'Myanmar (Burma)',
    'Sao Tome and Principe': 'Sao Tome and Principe',
    'Syrian Arab Republic': 'Syria',
    'Taiwan, Province of China': 'Taiwan',
    'Tanzania, United Republic of': 'Tanzania',
    'Venezuela, Bolivarian Republic of': 'Venezuela',
    'Viet Nam': 'Vietnam',
    'Western Sahara': 'Western Sahara',
    'Yemen Arab Republic': 'Yemen',
    "Yemen People's Republic": 'Yemen',
    'Yugoslavia, Federal Republic of': 'Yugoslavia',
    'Yugoslavia, Socialist Federal Republic of': 'Yugoslavia',
    'Swaziland': 'Eswatini',
    'Gambia': 'Gambia, The',
    'German Democratic Republic': 'Germany East',
    'German East': 'Germany East',
    'Ivory Coast': "Cote D'Ivoire",
    'UAE': 'United Arab Emirates',
    'Gambia, The': 'Gambia'
}

df4['country'] = df4['country'].replace(country_mapping)
pts['country'] = pts['country'].replace(country_mapping)

df5 = pd.merge(df4, pts, on=['country', 'year'], how='outer')

def check_countries(column, min_length=5):
    similar_pairs = []
    for i in range(len(column)):
        for j in range(i + 1, len(column)):
            if column[i] != column[j]:  # Ensure that identical countries are excluded
                seq = difflib.SequenceMatcher(None, column[i], column[j])
                match = seq.find_longest_match(0, len(column[i]), 0, len(column[j]))
                if match.size >= min_length:
                    similar_pairs.append((column[i], column[j], column[i][match.a: match.a + match.size]))
    return similar_pairs

check_countries(df5['country'].unique())

[('Algeria', 'Nigeria', 'geria'),
 ('Australia', 'Austria', 'Austr'),
 ('Austria', 'Transnistria', 'stria'),
 ('Bosnia and Herzegovina', 'Trinidad and Tobago', ' and '),
 ('Bosnia and Herzegovina', 'Saint Vincent and the Grenadines', ' and '),
 ('Bosnia and Herzegovina', 'Sao Tome and Principe', ' and '),
 ('Bosnia and Herzegovina', 'Antigua and Barbuda', 'a and '),
 ('Bosnia and Herzegovina', 'Serbia and Montenegro', 'ia and '),
 ('Bosnia and Herzegovina', 'St. Kitts and Nevis', ' and '),
 ('Bosnia and Herzegovina', 'Saint Kitts and Nevis', ' and '),
 ('Cape Verde', 'Cabo Verde', ' Verde'),
 ('Central African Republic', 'Czech Republic', ' Republic'),
 ('Central African Republic', 'Dominican Republic', 'ican Republic'),
 ('Central African Republic', 'Slovak Republic', ' Republic'),
 ('Central African Republic', 'South Africa', ' Africa'),
 ('Central African Republic', 'German Federal Republic', ' Republic'),
 ('Congo Kinshasa', 'Congo Brazzaville', 'Congo '),
 ('Congo Kinshasa', 'Cong

In [46]:
df5[df5.select_dtypes(exclude=['number']).columns.tolist()].head(1)

Unnamed: 0,country,world_bank_region,region,c/t?,status,country_old,cow_code_a,wordbank_code_a
0,Albania,Europe & Central Asia,,c,PF,Albania,ALB,ALB


In [47]:
df5 = df5.drop(columns=['cow_code_a', 'country_old', 'wordbank_code_a'])

### + WGI

In [49]:
set(df5.columns).intersection(wgi.columns)

{'country', 'year'}

In [50]:
wgi['year'] = wgi['year'].astype(int)

In [51]:
country_mapping = {
    'Bahamas': 'Bahamas, The',
    "Cote D'Ivoire": "Côte d'Ivoire",
    'Cape Verde': 'Cabo Verde',
    'Congo Kinshasa': 'Congo, Dem. Rep.',
    'Congo Brazzaville': 'Congo, Rep.',
    'Korea South': 'Korea, Rep.',
    'Korea North': 'Korea, Dem. Rep.',
    'Laos': 'Lao PDR',
    'Micronesia': 'Micronesia, Fed. Sts.',
    'Myanmar (Burma)': 'Myanmar',
    'Sao Tome and Principe': 'São Tomé and Principe',
    'Swaziland': 'Eswatini',
    'Ivory Coast': "Côte d'Ivoire",
    'UAE': 'United Arab Emirates',
    'Taiwan': 'Taiwan, China',
    'Russia': 'Russian Federation',
    'Yemen': 'Yemen, Rep.',
    'Venezuela': 'Venezuela, RB',
    'Türkiye': 'Turkey',
    'Egypt': 'Egypt, Arab Rep.',
    'Gambia': 'Gambia, The',
    'Hong Kong': 'Hong Kong SAR, China',
    'Kyrgyzstan': 'Kyrgyz Republic',
    'Syria': 'Syrian Arab Republic',
    'Saint Lucia': 'St. Lucia',
    'Timor Leste': 'Timor-Leste'
}

df5['country'] = df5['country'].replace(country_mapping)
wgi['country'] = wgi['country'].replace(country_mapping)

df6 = pd.merge(df5, wgi, on=['country', 'year'], how='outer')

def check_countries(column, min_length=5):
    similar_pairs = []
    for i in range(len(column)):
        for j in range(i + 1, len(column)):
            if column[i] != column[j]:  # Ensure that identical countries are excluded
                seq = difflib.SequenceMatcher(None, column[i], column[j])
                match = seq.find_longest_match(0, len(column[i]), 0, len(column[j]))
                if match.size >= min_length:
                    similar_pairs.append((column[i], column[j], column[i][match.a: match.a + match.size]))
    return similar_pairs

check_countries(df6['country'].unique())

[('Algeria', 'Nigeria', 'geria'),
 ('Australia', 'Austria', 'Austr'),
 ('Austria', 'Transnistria', 'stria'),
 ('Bahamas, The', 'Gambia, The', ', The'),
 ('Bosnia and Herzegovina', 'Trinidad and Tobago', ' and '),
 ('Bosnia and Herzegovina', 'Saint Vincent and the Grenadines', ' and '),
 ('Bosnia and Herzegovina', 'São Tomé and Principe', ' and '),
 ('Bosnia and Herzegovina', 'Antigua and Barbuda', 'a and '),
 ('Bosnia and Herzegovina', 'Serbia and Montenegro', 'ia and '),
 ('Bosnia and Herzegovina', 'St. Kitts and Nevis', ' and '),
 ('Bosnia and Herzegovina', 'Saint Kitts and Nevis', ' and '),
 ('Bosnia and Herzegovina', 'St. Vincent and the Grenadines', ' and '),
 ('Bosnia and Herzegovina', 'West Bank and Gaza', ' and '),
 ('Central African Republic', 'Czech Republic', ' Republic'),
 ('Central African Republic', 'Dominican Republic', 'ican Republic'),
 ('Central African Republic', 'Kyrgyz Republic', ' Republic'),
 ('Central African Republic', 'Slovak Republic', ' Republic'),
 ('Centra

In [52]:
df6[df6.select_dtypes(exclude=['number']).columns.tolist()].head(1)

Unnamed: 0,country,world_bank_region,region,c/t?,status,code
0,Albania,Europe & Central Asia,,c,PF,ALB


In [53]:
df6 = df6.drop(columns='code')

### + WB

In [54]:
set(df6.columns).intersection(wb.columns)

{'country', 'year'}

In [55]:
country_groups = [
    '(UN) Africa', '(UN) Asia', '(UN) Europe', '(UN) Latin America and the Caribbean', 
    '(UN) North America', '(SDG) Central/Southern Asia', '(SDG) Eastern/South Eastern Asia', 
    '(SDG) Europe', '(SDG) Latin America & the Caribbean', '(SDG) Northern America', 
    '(SDG) Sub-Saharan Africa', '(SDG) Western Asia/Northern Africa',
    'Africa Eastern and Southern', 'Africa Western and Central',
    'Association of Southeast Asian Nations (ASEAN-5)',
    'Central Europe and the Baltics', 'Caribbean small states',
    'East Asia & Pacific (excluding high income)', 'East Asia & Pacific',
    'Europe & Central Asia (excluding high income)', 'Europe & Central Asia',
    'Emerging and Developing Europe', 'Euro area',
    'Europe & Central Asia (IDA & IBRD countries)',
    'Latin America & Caribbean (excluding high income)', 'Latin America & Caribbean',
    'Least developed countries: UN classification', 'Middle East & North Africa',
    'Middle East & Central Asia', 'Pacific island small states',
    'South Asia (IDA & IBRD)', 'Small states',
    'Sub-Saharan Africa (excluding high income)',
    'Sub-Saharan Africa (IDA & IBRD countries)',
    'Upper middle income', 'IBRD only', 'IDA & IBRD total', 'IDA total', 'IDA blend',
    'OECD members', 'OECD Member Countries',
    'Other Advanced Economies', 'Other small states',
    'Heavily indebted poor countries (HIPC)', 'High income', 'Low income', 'Lower middle income',
    'Low & middle income', 'Post-demographic dividend', 'Pre-demographic dividend',
    'Fragile and conflict affected situations', 'East Asia & Pacific (IDA & IBRD countries)',
    'Latin America & the Caribbean (IDA & IBRD countries)',
    'Middle East & North Africa (IDA & IBRD countries)', '(WHO) Africa Region',
    '(WHO) America Region', '(WHO) South-East Asia Region',
    '(WHO) Eastern Mediterranean Region', '(WHO) European Region',
    '(WHO) Western Pacific Region', 'Unknown or unspecified', 'Arab World',
    'Developed Asia', 'Early-demographic dividend', 'FAO (Food and Agriculture Organization)',
    'IDA only', 'Late-demographic dividend', 'Middle income',
    'Middle East & North Africa (excluding high income)',
    'World Bank Group', 'Western Europe', 'World',
    'European Union', 'Euro Area', 'Sub-Saharan Africa', 'North America',
    'South Asia']

wb = wb[~wb['country'].isin(country_groups)].reset_index(drop=True)

In [56]:
country_mapping = {
    'Bahamas, The': 'Bahamas',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Cabo Verde': 'Cape Verde',
    'Congo, Dem. Rep.': "Congo, Dem. People's Rep.",
    'Congo, Rep.': 'Congo, Rep.',
    'Korea, Rep.': 'Korea, South',
    'Korea, Dem. Rep.': "Korea, Dem. People's Rep.",
    'Lao PDR': 'Laos',
    'Micronesia, Fed. Sts.': 'Micronesia',
    'Myanmar': 'Burma',
    'São Tomé and Principe': 'Sao Tome and Principe',
    'Eswatini': 'Swaziland',
    'Taiwan, China': 'Taiwan',
    'Venezuela, RB': 'Venezuela',
    'Turkiye': 'Turkey',
    'Vietnam': 'Viet Nam',
    'Yemen, Rep.': 'Yemen',
    'Macao SAR, China': 'Macau',
    'West Bank and Gaza': 'Palestine',
    'Virgin Islands (U.S.)': 'United States Virgin Islands',
    'Saint Vincent and the Grenadines': 'St. Vincent and the Grenadines',
    'Netherlands Antilles (former)': 'Netherlands Antilles',
    'Réunion': 'Reunion',
    'Palestinian Authority Administered Territories': 'Palestine',
    'Palestinian Authority-Administered Territories': 'Palestine'
}

df6['country'] = df6['country'].replace(country_mapping)
wb['country'] = wb['country'].replace(country_mapping)

df7 = pd.merge(df6, wb, on=['country', 'year'], how='outer')

def check_countries(column, min_length=5):
    similar_pairs = []
    for i in range(len(column)):
        for j in range(i + 1, len(column)):
            if column[i] != column[j]:  # Ensure that identical countries are excluded
                seq = difflib.SequenceMatcher(None, column[i], column[j])
                match = seq.find_longest_match(0, len(column[i]), 0, len(column[j]))
                if match.size >= min_length:
                    similar_pairs.append((column[i], column[j], column[i][match.a: match.a + match.size]))
    return similar_pairs

check_countries(df7['country'].unique())

[('Algeria', 'Nigeria', 'geria'),
 ('Australia', 'Austria', 'Austr'),
 ('Austria', 'Transnistria', 'stria'),
 ('Bosnia and Herzegovina', 'Trinidad and Tobago', ' and '),
 ('Bosnia and Herzegovina', 'St. Vincent and the Grenadines', ' and '),
 ('Bosnia and Herzegovina', 'Sao Tome and Principe', ' and '),
 ('Bosnia and Herzegovina', 'Antigua and Barbuda', 'a and '),
 ('Bosnia and Herzegovina', 'Serbia and Montenegro', 'ia and '),
 ('Bosnia and Herzegovina', 'St. Kitts and Nevis', ' and '),
 ('Bosnia and Herzegovina', 'Saint Kitts and Nevis', ' and '),
 ('Bosnia and Herzegovina', 'Turks and Caicos Islands', ' and '),
 ('Central African Republic', 'Czech Republic', ' Republic'),
 ('Central African Republic', 'Dominican Republic', 'ican Republic'),
 ('Central African Republic', 'Kyrgyz Republic', ' Republic'),
 ('Central African Republic', 'Slovak Republic', ' Republic'),
 ('Central African Republic', 'South Africa', ' Africa'),
 ('Central African Republic', 'Syrian Arab Republic', ' Republ

In [57]:
df7[df7.select_dtypes(exclude=['number']).columns.tolist()].head(1)

Unnamed: 0,country,world_bank_region,region,c/t?,status,country_code
0,Albania,Europe & Central Asia,,c,PF,ALB


In [58]:
df7 = df7.drop(columns='country_code')

In [59]:
merged = df7.copy()
merged.to_pickle("smerged.dat")

## Cleaning

In [3]:
# Load
merged = pd.read_pickle("saved/merged.dat")

# Sort
merged = merged.sort_values(by=['country', 'year']).reset_index(drop=True)

In [4]:
# Fill NaNs and unify
merged = merge_country_observations(merged, ['Iran'], ['Iran, Islamic Rep.'])
merged = merge_country_observations(merged, ['Slovak Republic'], ['Slovakia'])
merged = merge_country_observations(merged, ['Congo, Rep.'], ['Congo (Brazzaville)', 'Congo'])
merged = merge_country_observations(merged, ['Channel Islands'], ['Jersey, Channel Islands'])

merged = merge_country_observations(merged, ['Czech Republic', 'Slovakia'], ['Czechoslovakia'])
merged = merge_country_observations(merged, ['Germany'], ['Germany East', 'German Federal Republic'])
merged = merge_country_observations(merged, ['Serbia', 'Montenegro'], ['Serbia and Montenegro'])

merged = merge_country_observations(merged, ['Israel'], ['Israel in pre-1967 borders'])
merged = merge_country_observations(merged, ['Palestine'], ['Palestine, State of'])
merged = merge_country_observations(merged, ['Gaza Strip'], ['Gaza (Hamas)'])
merged = merge_country_observations(merged, ['West Bank'], ['Israeli-Occupied Territories', 'Israel in Occupied Territories'])
merged = merge_country_observations(merged, ['Palestine'], ['West Bank', 'Gaza Strip'])

merged = merge_country_observations(merged, ['Croatia', 'Serbia', 'Albania',
                                             'Kosovo', 'Bosnia and Herzegovina',
                                             'Montenegro', 'Slovenia',
                                             'North Macedonia'], ['Yugoslavia'])
merged = merge_country_observations(merged, ['Russian Federation', 'Estonia',
                                             'Lithuania', 'Latvia', 'Ukraine',
                                             'Belarus', 'Kazakhstan', 'Turkmenistan',
                                             'Tajikistan', 'Kyrgyz Republic', 'Azerbaijan',
                                             'Azerbaijan', 'Armenia', 'Uzbekistan',
                                             'Moldova'], ['Soviet Union'])

['Iran'] ['Iran, Islamic Rep.']
Elapsed time: 1.719426 seconds
['Slovak Republic'] ['Slovakia']
Elapsed time: 1.569821 seconds
['Congo, Rep.'] ['Congo (Brazzaville)', 'Congo']
Elapsed time: 1.544134 seconds
['Channel Islands'] ['Jersey, Channel Islands']
Elapsed time: 1.522149 seconds
['Czech Republic', 'Slovakia'] ['Czechoslovakia']
Elapsed time: 2.455398 seconds
['Germany'] ['Germany East', 'German Federal Republic']
Elapsed time: 1.773499 seconds
['Serbia', 'Montenegro'] ['Serbia and Montenegro']
Elapsed time: 2.874475 seconds
['Israel'] ['Israel in pre-1967 borders']
Elapsed time: 1.51792 seconds
['Palestine'] ['Palestine, State of']
Elapsed time: 1.640733 seconds
['Gaza Strip'] ['Gaza (Hamas)']
Elapsed time: 1.137153 seconds
['West Bank'] ['Israeli-Occupied Territories', 'Israel in Occupied Territories']
Elapsed time: 1.131995 seconds
['Palestine'] ['West Bank', 'Gaza Strip']
Elapsed time: 1.508181 seconds
['Croatia', 'Serbia', 'Albania', 'Kosovo', 'Bosnia and Herzegovina', 'Monte

KeyboardInterrupt: 

In [52]:
# Check and drop countries over 90% NaNs
merged['nan_percentage'] = merged.isnull().mean(axis=1)
result = merged.groupby('country').apply(lambda x: (x['nan_percentage'] > 0.9).mean())
high_nan_countries = result[result > 0.9].index
print(high_nan_countries)
merged = merged[~merged['country'].isin(high_nan_countries)]

Index(['Abkhazia', 'Anguilla', 'Chechnya', 'Cook Islands', 'Crimea',
       'Eastern Donbas', 'European Union', 'French Guiana', 'Indian Kashmir',
       'Martinique', 'Nagorno-Karabakh', 'Netherlands Antilles', 'Niue',
       'Pakistani Kashmir', 'Reunion', 'Russia-Occupied Areas (Ukraine)',
       'Saint Kitts and Nevis', 'Somaliland', 'South Ossetia',
       'St. Martin (French part)', 'Tibet', 'Transnistria', 'Western Sahara'],
      dtype='object', name='country')


In [53]:
# Check and drop cols over 95% NaNs
nan_percentage = merged.isna().mean()
cols_to_remove = nan_percentage[nan_percentage > 0.95].index.tolist()
print(cols_to_remove)
merged = merged.drop(columns=cols_to_remove)

['region', 'cpi_score_2022', 'cpi_rank', 'standard_error', 'number_of_sources', 'lower_ci', 'upper_ci', 'african_development_bank_cpia', 'bertelsmann_foundation_sustainable_governance_index', 'bertelsmann_foundation_transformation_index', 'economist_intelligence_unit_country_ratings', 'freedom_house_nations_in_transit', 'global_insights_country_risk_ratings', 'imd_world_competitiveness_yearbook', 'perc_asia_risk_guide', 'prs_international_country_risk_guide', 'varieties_of_democracy_project', 'world_bank_cpia', 'world_economic_forum_eos', 'world_justice_project_rule_of_law_index', 'interim', 'sf']


In [54]:
# Find region cols and rename
print([col for col in merged.columns if 'region' in col.lower()])
merged = merged.rename(columns={'world_bank_region':'region'})

['world_bank_region']


In [55]:
# Fill NaNs in region col
merged['region'] = merged.groupby('country')['region'].transform(lambda x: x.fillna(x.mode().iloc[0] if not x.mode().empty else x))

In [56]:
# Manually fill rest
country_region_map = {
    'Afghanistan': 'Middle East & North Africa',
    'American Samoa': 'East Asia & Pacific',
    'Andorra': 'Europe & Central Asia',
    'Antigua and Barbuda': 'Latin America & the Caribbean',
    'Aruba': 'Latin America & the Caribbean',
    'Bermuda': 'Latin America & the Caribbean',
    'British Virgin Islands': 'Latin America & the Caribbean',
    'Cayman Islands': 'Latin America & the Caribbean',
    'Congo': 'Sub-Saharan Africa',
    'Cuba': 'Latin America & the Caribbean',
    'Curacao': 'Latin America & the Caribbean',
    'Dominica': 'Latin America & the Caribbean',
    'Equatorial Guinea': 'Sub-Saharan Africa',
    'Eritrea': 'Sub-Saharan Africa',
    'Faroe Islands': 'Europe & Central Asia',
    'French Polynesia': 'East Asia & Pacific',
    'Gibraltar': 'Europe & Central Asia',
    'Greenland': 'Europe & Central Asia',
    'Grenada': 'Latin America & the Caribbean',
    'Guam': 'East Asia & Pacific',
    'Isle of Man': 'Europe & Central Asia',
    'Channel Islands': 'Europe & Central Asia',
    'Kiribati': 'East Asia & Pacific',
    'Korea, Dem. People\'s Rep.': 'East Asia & Pacific',
    'Kosovo': 'Europe & Central Asia',
    'Liechtenstein': 'Europe & Central Asia',
    'Macau': 'East Asia & Pacific',
    'Maldives': 'South Asia',
    'Marshall Islands': 'East Asia & Pacific',
    'Micronesia': 'East Asia & Pacific',
    'Monaco': 'Europe & Central Asia',
    'Nauru': 'East Asia & Pacific',
    'New Caledonia': 'East Asia & Pacific',
    'Northern Mariana Islands': 'East Asia & Pacific',
    'Palau': 'East Asia & Pacific',
    'Palestine': 'Middle East & North Africa',
    'Puerto Rico': 'Latin America & the Caribbean',
    'Samoa': 'East Asia & Pacific',
    'San Marino': 'Europe & Central Asia',
    'Sao Tome and Principe': 'Sub-Saharan Africa',
    'Sint Maarten (Dutch part)': 'Latin America & the Caribbean',
    'Slovakia': 'Europe & Central Asia',
    'Solomon Islands': 'East Asia & Pacific',
    'South Sudan': 'Sub-Saharan Africa',
    'St. Kitts and Nevis': 'Latin America & the Caribbean',
    'St. Lucia': 'Latin America & the Caribbean',
    'St. Vincent and the Grenadines': 'Latin America & the Caribbean',
    'Tonga': 'East Asia & Pacific',
    'Turkmenistan': 'Europe & Central Asia',
    'Turks and Caicos Islands': 'Latin America & the Caribbean',
    'Tuvalu': 'East Asia & Pacific',
    'United States Virgin Islands': 'Latin America & the Caribbean',
    'Uzbekistan': 'Europe & Central Asia',
    'Vanuatu': 'East Asia & Pacific'}

merged['region'] = merged['country'].map(country_region_map).fillna(merged['region'])

In [57]:
c = merged[['country', 'region']].drop_duplicates()
nan_max_per_country = merged.groupby('country')['nan_percentage'].mean().reset_index()
c = c.merge(nan_max_per_country, on='country', how='left')
min_year = merged.groupby('country')['year'].min().reset_index()
c = c.merge(min_year, on='country', how='left')
c.sort_values(by='nan_percentage')

Unnamed: 0,country,region,nan_percentage,year
52,Cyprus,Europe & Central Asia,0.451226,1960
127,Mexico,Latin America & the Caribbean,0.457917,1960
154,Peru,Latin America & the Caribbean,0.458423,1960
43,Colombia,Latin America & the Caribbean,0.458626,1960
133,Morocco,Middle East & North Africa,0.459031,1960
...,...,...,...,...
51,Curacao,Latin America & the Caribbean,0.907402,1960
40,Channel Islands,Europe & Central Asia,0.916678,1960
172,Sint Maarten (Dutch part),Latin America & the Caribbean,0.924063,1960
145,Northern Mariana Islands,East Asia & Pacific,0.924132,1960


In [58]:
merged.to_pickle("cleaned.dat")

# Fill NULLs, standardise and target

## Fill NULLs

In [61]:
cleaned = pd.read_pickle("saved/cleaned.dat")

In [60]:
def fill_all_data(df, numeric_cols, country_col='country', region_col='region', year_col='year'):
    def forward_fill_country(group):
        print(group.country.unique())
        for col in numeric_cols:
            if col + '_fill' not in group.columns:
                group[col + '_fill'] = np.nan
            group[col + '_fill'] = group[col + '_fill'].where(group[col].isna(), group[year_col])
            group[col + '_fill'] = group[col + '_fill'].ffill()
            group[col] = group[col].ffill()
        return group

    def backward_fill_country(group):
        print(group.country.unique())
        for col in numeric_cols:
            mask_still_na = group[col].isna()
            next_valid_year = group[year_col].where(group[col].notna()).bfill()
            fill_idx = mask_still_na & group[col + '_fill'].isna()
            group.loc[fill_idx, col + '_fill'] = next_valid_year
            group[col] = group[col].bfill()
        return group

    print("\n-------------------------- FFILL --------------------------")
    df = df.groupby(country_col, group_keys=False).apply(forward_fill_country)
    print("\n-------------------------- BFILL --------------------------")
    df = df.groupby(country_col, group_keys=False).apply(backward_fill_country)

    print("\n-------------------------- orig REGION FILL --------------------------")
    for col in numeric_cols:
        orig_df = df[df[col + '_fill']==df['year']]
        df['_region_year_mean'] = orig_df.groupby([region_col, year_col])[col].transform('mean')
        mask_still_na = df[col].isna()
        df.loc[mask_still_na, col] = df.loc[mask_still_na, '_region_year_mean']
        just_filled = mask_still_na & df[col].notna()
        df.loc[just_filled, col + '_fill'] = -df.loc[just_filled, year_col]
        df.drop(columns='_region_year_mean', inplace=True)

    print("\n-------------------------- orig WORLD FILL --------------------------")
    for col in numeric_cols:
        orig_df = df[df[col + '_fill']==df['year']]
        df['_world_year_mean'] = orig_df.groupby(year_col)[col].transform('mean')
        mask_still_na = df[col].isna()
        df.loc[mask_still_na, col] = df.loc[mask_still_na, '_world_year_mean']
        just_filled = mask_still_na & df[col].notna()
        df.loc[just_filled, col + '_fill'] = 0
        df.drop(columns='_world_year_mean', inplace=True)
        
    print("\n-------------------------- rest REGION FILL --------------------------")
    for col in numeric_cols:
        df['_region_year_mean'] = df.groupby([region_col, year_col])[col].transform('mean')
        mask_still_na = df[col].isna()
        df.loc[mask_still_na, col] = df.loc[mask_still_na, '_region_year_mean']
        just_filled = mask_still_na & df[col].notna()
        df.loc[just_filled, col + '_fill'] = -df.loc[just_filled, year_col]
        df.drop(columns='_region_year_mean', inplace=True)

    print("\n-------------------------- rest WORLD FILL --------------------------")
    for col in numeric_cols:
        df['_world_year_mean'] = df.groupby(year_col)[col].transform('mean')
        mask_still_na = df[col].isna()
        df.loc[mask_still_na, col] = df.loc[mask_still_na, '_world_year_mean']
        just_filled = mask_still_na & df[col].notna()
        df.loc[just_filled, col + '_fill'] = 0
        df.drop(columns='_world_year_mean', inplace=True)

    return df

In [62]:
numeric_cols = [
    col for col in cleaned.columns 
    if col not in ['country','year', 'nan_percentage'] 
    and pd.api.types.is_numeric_dtype(merged[col])
]

filled = fill_all_data(
    cleaned,
    numeric_cols=numeric_cols,
    country_col='country',
    region_col='region',
    year_col='year'
)


-------------------------- FFILL --------------------------
['Afghanistan']
['Albania']
['Algeria']
['American Samoa']
['Andorra']
['Angola']
['Antigua and Barbuda']
['Argentina']
['Armenia']
['Aruba']
['Australia']
['Austria']
['Azerbaijan']
['Bahamas']
['Bahrain']
['Bangladesh']
['Barbados']
['Belarus']
['Belgium']
['Belize']
['Benin']
['Bermuda']
['Bhutan']
['Bolivia']
['Bosnia and Herzegovina']
['Botswana']
['Brazil']
['British Virgin Islands']
['Brunei Darussalam']
['Bulgaria']
['Burkina Faso']
['Burma']
['Burundi']
['Cambodia']
['Cameroon']
['Canada']
['Cape Verde']
['Cayman Islands']
['Central African Republic']
['Chad']
['Channel Islands']
['Chile']
['China']
['Colombia']
['Comoros']
["Congo, Dem. People's Rep."]
['Congo, Rep.']
['Costa Rica']
["Cote d'Ivoire"]
['Croatia']
['Cuba']
['Curacao']
['Cyprus']
['Czech Republic']
['Denmark']
['Djibouti']
['Dominica']
['Dominican Republic']
['Ecuador']
['Egypt, Arab Rep.']
['El Salvador']
['Equatorial Guinea']
['Eritrea']
['Estonia']


In [64]:
filled.to_pickle("saved/filled.dat")

## Target
Maybe the target should not have been filled beforehand \
Can always just exclude those obs where target_fill != year

In [189]:
def calculate_gdp5(df, country, base_year):
    base_year = float(base_year)
    gdp_base = df.loc[(df['country'] == country) & (df['year'].astype(float) == base_year), 'NY.GDP.PCAP.CD']
    gdp_later = df.loc[(df['country'] == country) & (df['year'].astype(float) == base_year + 5), 'NY.GDP.PCAP.CD']
    if not gdp_base.empty and not gdp_later.empty:
        return ((gdp_later.values[0] - gdp_base.values[0]) / gdp_base.values[0]) * 100
    return None

df = pd.read_pickle("saved/filled.dat")
df['GDP5'] = df.apply(lambda row: calculate_gdp5(df, row['country'], row['year']) if float(row['year']) <= 2017 else None, axis=1)
df = df[df['GDP5'].notnull()]
df['GDP5'] = (df['GDP5'] / 5).round(0)

bins = [-float('inf'), 1, 6, 12, float('inf')]
labels = [0, 1, 2, 3]
df['GDP5_cat'] = pd.cut(df['GDP5'], bins=bins, labels=labels)

bins = [-float('inf'), 1, float('inf')]
labels = [0, 1]
df['GDP5_2cat'] = pd.cut(df['GDP5'], bins=bins, labels=labels)

df.to_pickle("saved/targeted.dat")

In [147]:
df[['country', 'year', 'NY.GDP.PCAP.CD', 'NY.GDP.PCAP.CD_fill', 'GDP5', 'GDP5_2cat']]

Unnamed: 0,country,year,NY.GDP.PCAP.CD,NY.GDP.PCAP.CD_fill,GDP5,GDP5_2cat
21,Afghanistan,1960,60.950364,1962.0,15.0,1
22,Afghanistan,1961,60.950364,1962.0,27.0,1
23,Afghanistan,1962,60.950364,1962.0,35.0,1
24,Afghanistan,1963,82.021738,1963.0,13.0,1
25,Afghanistan,1964,85.511073,1964.0,11.0,1
...,...,...,...,...,...,...
15808,Zimbabwe,2013,1408.367810,2013.0,12.0,1
15809,Zimbabwe,2014,1407.034291,2014.0,0.0,0
15810,Zimbabwe,2015,1410.329173,2015.0,-1.0,0
15811,Zimbabwe,2016,1421.787791,2016.0,5.0,1


## Standardise

In [212]:
df = pd.read_pickle("saved/targeted.dat")

In [213]:
geo = pd.read_pickle("dpWB.dat")[['country_name', 'NW_point_lon',
       'NW_point_lat', 'NE_point_lon', 'NE_point_lat', 'SE_point_lon',
       'SE_point_lat', 'SW_point_lon', 'SW_point_lat']]
geo = geo.rename(columns={'country_name':'country'})
geo = geo[geo.NW_point_lon.notnull()]

df = pd.merge(df, geo, on='country', how='left')

In [214]:
df['year_str'] = df['year'].astype(str)
df = df.sort_values(by=['country', 'year_str']).reset_index(drop=True)

dfst = standardize_dataframe(df, target='GDP5_cat', exclude=['year_str', 'GDP5', 'GDP5_2cat'])

dfst.fillna(0, inplace=True)
dfst.to_pickle("saved/ready.dat")

# Model setup

## Train and test set

In [3]:
dfst = pd.read_pickle("saved/ready.dat")

In [4]:
dfst.columns

Index(['year', 'country', 'economic_freedom_summary_index', 'efw_rank',
       'quartile', '1a_government_consumption', 'data',
       '1b__transfers_and_subsidies', 'data.1', '1c__government_investment',
       ...
       'GDP5_2cat', 'NW_point_lon', 'NW_point_lat', 'NE_point_lon',
       'NE_point_lat', 'SE_point_lon', 'SE_point_lat', 'SW_point_lon',
       'SW_point_lat', 'year_str'],
      dtype='object', length=890)

In [None]:
from sklearn.model_selection import train_test_split

dfst = pd.read_pickle("saved/ready.dat")
past_observation_count = 5
target_variable = 'GDP5_cat'
columns_to_exclude = ['GDP5_2cat', 'GDP5_cat', 'year_str', 'country', 'GDP5', 'region', 'c/t?', 'status']

target_array = []
explanatory_array = []

grouped = dfst.groupby('country')

for _, group_df in grouped:
    target_values = group_df[target_variable].values
    explanatory_values = group_df.drop(columns=columns_to_exclude).values
    for i in range(past_observation_count, len(target_values)):
        target_array.append(target_values[i])
        explanatory_array.append(explanatory_values[i - past_observation_count:i])

X = np.array(explanatory_array, dtype=np.float32)
y = np.array(target_array, dtype=np.int32)

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1, random_state=36)

def relu(x):
    return np.clip(x,0,np.inf)
def softmax(x):
    return [np.exp(x[i])/np.sum(np.exp(x)) for i in range(len(x))]

## Run model
Is it correct that I have 10-90 validation split and 5-95 train-test split

In [None]:
from keras.layers import LSTM
from tensorflow.keras import datasets, layers, models
from tensorflow import keras

model = keras.Sequential([
    keras.layers.LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True),
    keras.layers.LSTM(50, return_sequences=False),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(17, activation='softmax')
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=300, batch_size=100, validation_split=0.1, callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)])
test_loss, test_acc = model.evaluate(X_test, y_test)
print('\nTest accuracy:', test_acc)

2025-02-09 18:03:38.552487: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-09 18:03:38.918876: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-09 18:03:39.439476: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/300


2025-02-09 18:04:01.210472: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 8913509640 exceeds 10% of free system memory.


[1m5054/5054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 6ms/step - accuracy: 0.6992 - loss: 0.7166 - val_accuracy: 0.8665 - val_loss: 0.3221
Epoch 2/300
[1m5054/5054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 6ms/step - accuracy: 0.8850 - loss: 0.2845 - val_accuracy: 0.9122 - val_loss: 0.2136
Epoch 3/300
[1m5054/5054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 6ms/step - accuracy: 0.9190 - loss: 0.2046 - val_accuracy: 0.9353 - val_loss: 0.1735
Epoch 4/300
[1m5054/5054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 6ms/step - accuracy: 0.9363 - loss: 0.1668 - val_accuracy: 0.9410 - val_loss: 0.1531
Epoch 5/300
[1m5054/5054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 6ms/step - accuracy: 0.9445 - loss: 0.1478 - val_accuracy: 0.9492 - val_loss: 0.1319
Epoch 6/300
[1m5054/5054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 6ms/step - accuracy: 0.9498 - loss: 0.1339 - val_accuracy: 0.9525 - val_loss: 0.1288
Epoch 7/300
[1m