In [2]:
import pandas as pd
import numpy as np
from functools import reduce
import difflib
from datetime import datetime
from concurrent.futures import ProcessPoolExecutor
import warnings; warnings.filterwarnings("ignore")

In [8]:
p5d = pd.read_excel("IQD/P5D.xls")
p5d = p5d[p5d.year>=1960].reset_index(drop=True)
p5d

Unnamed: 0,p5,cyear,ccode,scode,country,year,flag,fragment,democ,autoc,...,interim,bmonth,bday,byear,bprec,post,change,d5,sf,regtrans
0,0,7001960,700,AFG,Afghanistan,1960,0,,0,10,...,,,,,,,,,,
1,0,7001961,700,AFG,Afghanistan,1961,0,,0,10,...,,,,,,,,,,
2,0,7001962,700,AFG,Afghanistan,1962,0,,0,10,...,,,,,,,,,,
3,0,7001963,700,AFG,Afghanistan,1963,0,,0,10,...,,,,,,,,,,
4,0,7001964,700,AFG,Afghanistan,1964,0,,0,7,...,,9.0,10.0,1964.0,1.0,-7.0,3.0,1.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8795,1,5522014,552,ZIM,Zimbabwe,2014,0,0.0,5,1,...,,,,,,,,,,
8796,1,5522015,552,ZIM,Zimbabwe,2015,0,0.0,5,1,...,,,,,,,,,,
8797,1,5522016,552,ZIM,Zimbabwe,2016,0,0.0,5,1,...,,,,,,,,,,
8798,1,5522017,552,ZIM,Zimbabwe,2017,0,0.0,5,1,...,,,,,,,,,,


### CPI + EFW

In [14]:
set(cpi.columns).intersection(efw.columns)

{'country', 'rank', 'year'}

In [15]:
cpi = cpi.rename(columns={'rank':'cpi_rank'})
efw = efw.rename(columns={'rank':'efw_rank'})

In [16]:
country_mapping = {
    'Bahamas': 'Bahamas, The',
    'Cote d\'Ivoire': "Côte d'Ivoire",
    'Guinea Bissau': 'Guinea-Bissau',
    'Korea, North': 'North Korea',
    'Korea, South': 'South Korea',
    'United States of America': 'United States',
    'Russia': 'Russian Federation',
    'Sao Tome and Principe': 'São Tomé and Príncipe',
    'Gambia': 'Gambia, The',
    'Hong Kong': 'Hong Kong SAR, China',
    'Iran': 'Iran, Islamic Rep.',
    'Egypt': 'Egypt, Arab Rep.',
    'Laos': 'Lao PDR',
    'Venezuela': 'Venezuela, RB',
    'Syria': 'Syrian Arab Republic',
    'Turkey': 'Türkiye',
    'Yemen': 'Yemen, Rep.',
    'Congo': 'Congo, Rep.',
    'Democratic Republic of the Congo': 'Congo, Dem. Rep.',
    'Korea, Rep.': 'South Korea',
    'Kyrgyz Republic': 'Kyrgyzstan',
    'Slovak Republic': 'Slovakia'
}

cpi['country'] = cpi['country'].replace(country_mapping)
efw['country'] = efw['country'].replace(country_mapping)

df1 = pd.merge(efw, cpi, on=['country', 'year'], how='outer')

def check_countries(column, min_length=4):
    similar_pairs = []
    for i in range(len(column)):
        for j in range(i + 1, len(column)):
            if column[i] != column[j]:  # Ensure that identical countries are excluded
                seq = difflib.SequenceMatcher(None, column[i], column[j])
                match = seq.find_longest_match(0, len(column[i]), 0, len(column[j]))
                if match.size >= min_length:
                    similar_pairs.append((column[i], column[j], column[i][match.a: match.a + match.size]))
    return similar_pairs

check_countries(df1['country'].unique())

[('Albania', 'Lithuania', 'ania'),
 ('Albania', 'Mauritania', 'ania'),
 ('Albania', 'Romania', 'ania'),
 ('Albania', 'Tanzania', 'ania'),
 ('Algeria', 'Liberia', 'eria'),
 ('Algeria', 'Nigeria', 'geria'),
 ('Angola', 'Mongolia', 'ngol'),
 ('Armenia', 'Slovenia', 'enia'),
 ('Armenia', 'Turkmenistan', 'meni'),
 ('Australia', 'Austria', 'Austr'),
 ('Australia', 'Central African Republic', 'tral'),
 ('Australia', 'Somalia', 'alia'),
 ('Bahamas, The', 'Gambia, The', ', The'),
 ('Bahrain', 'Ukraine', 'rain'),
 ('Belarus', 'Brunei Darussalam', 'arus'),
 ('Bosnia and Herzegovina', 'Trinidad and Tobago', ' and '),
 ('Bosnia and Herzegovina', 'Saint Vincent and the Grenadines', ' and '),
 ('Bosnia and Herzegovina', 'São Tomé and Príncipe', ' and '),
 ('Canada', 'Grenada', 'nada'),
 ('Central African Republic', 'Congo, Dem. Rep.', ' Rep'),
 ('Central African Republic', 'Congo, Rep.', ' Rep'),
 ('Central African Republic', 'Dominican Republic', 'ican Republic'),
 ('Central African Republic', 'Egyp

In [17]:
df1[df1.select_dtypes(exclude=['number']).columns.tolist()].head(1)

Unnamed: 0,iso_code_2,iso_code_3,country,data.3,data.4,5bv_cost_of_worker_dismissal,world_bank_region,"world_bank_current_income_classification,_1990-present",iso3,region
0,AL,ALB,Albania,23,34,6.299741,Europe & Central Asia,UM,,


In [18]:
df1.select_dtypes(exclude=['number']).columns.tolist()

['iso_code_2',
 'iso_code_3',
 'country',
 'data.3',
 'data.4',
 '5bv_cost_of_worker_dismissal',
 'world_bank_region',
 'world_bank_current_income_classification,_1990-present',
 'iso3',
 'region']

In [19]:
columns_to_convert = ['data.3', 'data.4', '5bv_cost_of_worker_dismissal']
df1[columns_to_convert] = df1[columns_to_convert].apply(pd.to_numeric, errors='coerce')

In [20]:
df1 = df1.drop(columns=['iso_code_2',
                        'iso_code_3',
                        # 'world_bank_region',
                        # 'region',
                        'world_bank_current_income_classification,_1990-present',
                        'iso3'])

### + FIW

In [21]:
fiw = fiw.drop(columns='region')
set(df1.columns).intersection(fiw.columns)

{'country', 'year'}

In [22]:
country_mapping = {
    'Bahamas, The': 'Bahamas',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Guinea-Bissau': 'Guinea Bissau',
    'South Korea': 'Korea, Rep.',
    'North Korea': 'Korea, Dem. Rep.',
    'United States': 'United States of America',
    'Russia': 'Russian Federation',
    'Sao Tome and Principe': 'São Tomé and Príncipe',
    'The Gambia': 'Gambia, The',
    'Hong Kong SAR, China': 'Hong Kong',
    'Iran, Islamic Rep.': 'Iran',
    'Egypt, Arab Rep.': 'Egypt',
    'Lao PDR': 'Laos',
    'Venezuela, RB': 'Venezuela',
    'Syrian Arab Republic': 'Syria',
    'Türkiye': 'Turkey',
    'Yemen, Rep.': 'Yemen',
    'Congo, Rep.': 'Congo (Brazzaville)',
    'Congo, Dem. Rep.': 'Congo (Kinshasa)',
    'Brunei Darussalam': 'Brunei',
    'Northern Cyprus': 'Cyprus',
    'Czechia': 'Czech Republic',
    'Saint Vincent and the Grenadines': 'St. Vincent and the Grenadines',
    'Saint Lucia': 'St. Lucia'
}

df1['country'] = df1['country'].replace(country_mapping)
fiw['country'] = fiw['country'].replace(country_mapping)

df2 = pd.merge(df1, fiw, on=['country', 'year'], how='outer')

def check_countries(column, min_length=5):
    similar_pairs = []
    for i in range(len(column)):
        for j in range(i + 1, len(column)):
            if column[i] != column[j]:  # Ensure that identical countries are excluded
                seq = difflib.SequenceMatcher(None, column[i], column[j])
                match = seq.find_longest_match(0, len(column[i]), 0, len(column[j]))
                if match.size >= min_length:
                    similar_pairs.append((column[i], column[j], column[i][match.a: match.a + match.size]))
    return similar_pairs

check_countries(df2['country'].unique())

[('Algeria', 'Nigeria', 'geria'),
 ('Australia', 'Austria', 'Austr'),
 ('Austria', 'Transnistria', 'stria'),
 ('Bosnia and Herzegovina', 'Trinidad and Tobago', ' and '),
 ('Bosnia and Herzegovina', 'St. Vincent and the Grenadines', ' and '),
 ('Bosnia and Herzegovina', 'São Tomé and Príncipe', ' and '),
 ('Bosnia and Herzegovina', 'Antigua and Barbuda', 'a and '),
 ('Bosnia and Herzegovina', 'Serbia and Montenegro', 'ia and '),
 ('Bosnia and Herzegovina', 'St. Kitts and Nevis', ' and '),
 ('Central African Republic', 'Czech Republic', ' Republic'),
 ('Central African Republic', 'Dominican Republic', 'ican Republic'),
 ('Central African Republic', 'South Africa', ' Africa'),
 ('Congo (Kinshasa)', 'Congo (Brazzaville)', 'Congo ('),
 ("Cote d'Ivoire", 'Cote d’Ivoire', 'Cote d'),
 ('Czech Republic', 'Dominican Republic', ' Republic'),
 ('Dominican Republic', 'Dominica', 'Dominica'),
 ('Gambia, The', 'Zambia', 'ambia'),
 ('Guinea', 'Guinea Bissau', 'Guinea'),
 ('Guinea', 'Papua New Guinea',

In [23]:
df2[df2.select_dtypes(exclude=['number']).columns.tolist()].head(1)

Unnamed: 0,country,world_bank_region,region,c/t?,status,pr,cl,total
0,Albania,Europe & Central Asia,,c,PF,27,39,66


In [24]:
columns_to_convert = ['pr', 'cl', 'total']
df2[columns_to_convert] = df2[columns_to_convert].apply(pd.to_numeric, errors='coerce')

### + IEF

In [25]:
set(df2.columns).intersection(ief.columns)

{'country', 'year'}

In [26]:
ief['country'] = ief['country'].str.strip()

country_mapping = {
    'Bahamas': 'The Bahamas',
    'Cabo Verde': 'Cape Verde',
    'Cote d\'Ivoire': "Côte d'Ivoire",
    "Côte d’Ivoire": "Côte d'Ivoire",
    'Guinea Bissau': 'Guinea-Bissau',
    'Korea, Rep.': 'South Korea',
    'Korea, Dem. Rep.': 'North Korea',
    'United States of America': 'United States',
    'Russia': 'Russian Federation',
    'São Tomé and Príncipe': 'São Tomé and Príncipe',
    'The Gambia': 'Gambia, The',
    'Republic of Congo': 'Congo (Brazzaville)',
    'Democratic Republic of Congo': 'Congo (Kinshasa)',
    'Swaziland': 'Eswatini',
    'Macedonia': 'North Macedonia',
    'Burma': 'Myanmar',
    'Bangladesh ': 'Bangladesh',
    'Brunei': 'Brunei Darussalam',
    'Congo (Brazzaville)': 'Republic of Congo',
    'Republic of Congo ': 'Republic of Congo',
    'Costa Rica ': 'Costa Rica',
    "Côte d'Ivoire": 'Cote d’Ivoire',
    "Côte d'Ivoire ": 'Cote d’Ivoire',
    'El Salvador ': 'El Salvador',
    'Guatemala ': 'Guatemala',
    'Kyrgyzstan': 'Kyrgyz Republic',
    'Netherlands': 'The Netherlands',
    'Philippines': 'The Philippines',
    'Slovakia': 'Slovak Republic',
    'St. Lucia': 'Saint Lucia',
    'St. Vincent and the Grenadines': 'Saint Vincent and the Grenadines',
    'Israeli Occupied Territories': 'Israeli-Occupied Territories'
}

df2['country'] = df2['country'].replace(country_mapping)
ief['country'] = ief['country'].replace(country_mapping)

df3 = pd.merge(df2, ief, on=['country', 'year'], how='outer')

def check_countries(column, min_length=5):
    similar_pairs = []
    for i in range(len(column)):
        for j in range(i + 1, len(column)):
            if column[i] != column[j]:  # Ensure that identical countries are excluded
                seq = difflib.SequenceMatcher(None, column[i], column[j])
                match = seq.find_longest_match(0, len(column[i]), 0, len(column[j]))
                if match.size >= min_length:
                    similar_pairs.append((column[i], column[j], column[i][match.a: match.a + match.size]))
    return similar_pairs

check_countries(df3['country'].unique())

[('Algeria', 'Nigeria', 'geria'),
 ('Australia', 'Austria', 'Austr'),
 ('Austria', 'Transnistria', 'stria'),
 ('Bosnia and Herzegovina', 'Trinidad and Tobago', ' and '),
 ('Bosnia and Herzegovina', 'Saint Vincent and the Grenadines', ' and '),
 ('Bosnia and Herzegovina', 'São Tomé and Príncipe', ' and '),
 ('Bosnia and Herzegovina', 'Antigua and Barbuda', 'a and '),
 ('Bosnia and Herzegovina', 'Serbia and Montenegro', 'ia and '),
 ('Bosnia and Herzegovina', 'St. Kitts and Nevis', ' and '),
 ('Central African Republic', 'Republic of Congo', 'Republic'),
 ('Central African Republic', 'Czech Republic', ' Republic'),
 ('Central African Republic', 'Dominican Republic', 'ican Republic'),
 ('Central African Republic', 'Kyrgyz Republic', ' Republic'),
 ('Central African Republic', 'Slovak Republic', ' Republic'),
 ('Central African Republic', 'South Africa', ' Africa'),
 ('Congo (Kinshasa)', 'Republic of Congo', 'Congo'),
 ('Congo (Kinshasa)', 'Congo (Brazzaville)', 'Congo ('),
 ('Republic of 

In [27]:
df3[df3.select_dtypes(exclude=['number']).columns.tolist()].head(1)

Unnamed: 0,country,world_bank_region,region,c/t?,status,short_name,iso_code
0,Albania,Europe & Central Asia,,c,PF,Albania,AL


In [28]:
df3 = df3.drop(columns=['iso_code', 'short_name'])

### + P5D

In [31]:
set(df3.columns).intersection(p5d.columns)

{'country', 'year'}

In [32]:
country_mapping = {
    'The Bahamas': 'Bahamas',
    'Cote d’Ivoire': "Cote D'Ivoire",
    "Côte d’Ivoire": "Cote D'Ivoire",
    'Republic of Congo': 'Congo Brazzaville',
    'Congo (Kinshasa)': 'Congo Kinshasa',
    'Kyrgyz Republic': 'Kyrgyzstan',
    'South Korea': 'Korea South',
    'North Korea': 'Korea North',
    'The Netherlands': 'Netherlands',
    'The Philippines': 'Philippines',
    'São Tomé and Príncipe': 'Sao Tome and Principe',
    'Timor-Leste': 'Timor Leste',
    'Eswatini': 'Swaziland',
    'USSR': 'Russia',
    'Myanmar': 'Myanmar (Burma)',
    'Vietnam North': 'Vietnam',
    'Vietnam South': 'Vietnam',
    'South Vietnam': 'Vietnam',
    'Yemen North': 'Yemen',
    'Sudan-North': 'Sudan',
    'Bosnia': 'Bosnia and Herzegovina',
    'Congo-Brazzaville': 'Congo Brazzaville',
    'Macedonia': 'North Macedonia',
    'Germany West': 'Germany',
    'Gambia, The': 'Gambia',
    'Yemen South': 'Yemen',
    'Russian Federation': 'Russia'
}

df3['country'] = df3['country'].replace(country_mapping)
p5d['country'] = p5d['country'].replace(country_mapping)

df4 = pd.merge(df3, p5d, on=['country', 'year'], how='outer')

def check_countries(column, min_length=5):
    similar_pairs = []
    for i in range(len(column)):
        for j in range(i + 1, len(column)):
            if column[i] != column[j]:  # Ensure that identical countries are excluded
                seq = difflib.SequenceMatcher(None, column[i], column[j])
                match = seq.find_longest_match(0, len(column[i]), 0, len(column[j]))
                if match.size >= min_length:
                    similar_pairs.append((column[i], column[j], column[i][match.a: match.a + match.size]))
    return similar_pairs

check_countries(df4['country'].unique())

[('Algeria', 'Nigeria', 'geria'),
 ('Australia', 'Austria', 'Austr'),
 ('Austria', 'Transnistria', 'stria'),
 ('Bosnia and Herzegovina', 'Trinidad and Tobago', ' and '),
 ('Bosnia and Herzegovina', 'Saint Vincent and the Grenadines', ' and '),
 ('Bosnia and Herzegovina', 'Sao Tome and Principe', ' and '),
 ('Bosnia and Herzegovina', 'Antigua and Barbuda', 'a and '),
 ('Bosnia and Herzegovina', 'Serbia and Montenegro', 'ia and '),
 ('Bosnia and Herzegovina', 'St. Kitts and Nevis', ' and '),
 ('Central African Republic', 'Czech Republic', ' Republic'),
 ('Central African Republic', 'Dominican Republic', 'ican Republic'),
 ('Central African Republic', 'Slovak Republic', ' Republic'),
 ('Central African Republic', 'South Africa', ' Africa'),
 ('Congo Kinshasa', 'Congo Brazzaville', 'Congo '),
 ('Congo Kinshasa', 'Congo (Brazzaville)', 'Congo '),
 ('Congo Brazzaville', 'Congo (Brazzaville)', 'Brazzaville'),
 ("Côte d'Ivoire", "Cote D'Ivoire", "'Ivoire"),
 ('Czech Republic', 'Dominican Repub

In [37]:
df4[df4.select_dtypes(exclude=['number']).columns.tolist()].head(1)

Unnamed: 0,country,world_bank_region,region,c/t?,status,scode
0,Albania,Europe & Central Asia,,c,PF,


In [38]:
df4 = df4.drop(columns='scode')

### + PTS

In [39]:
set(df4.columns).intersection(pts.columns)

{'country', 'region', 'year'}

In [40]:
pts = pts.drop(columns='region')

In [45]:
country_mapping = {
    'Bolivia, Plurinational State of': 'Bolivia',
    "Cote d'Ivoire": "Cote D'Ivoire",
    'Republic of Congo': 'Congo Brazzaville',
    'Congo, the Democratic Republic of the': 'Congo Kinshasa',
    'Iran, Islamic Republic of': 'Iran',
    "Korea, Democratic People's Republic of": 'Korea North',
    'Korea, Republic of': 'Korea South',
    "Lao People's Democratic Republic": 'Laos',
    'Micronesia, Federated States of': 'Micronesia',
    'Moldova, Republic of': 'Moldova',
    'Myanmar': 'Myanmar (Burma)',
    'Sao Tome and Principe': 'Sao Tome and Principe',
    'Syrian Arab Republic': 'Syria',
    'Taiwan, Province of China': 'Taiwan',
    'Tanzania, United Republic of': 'Tanzania',
    'Venezuela, Bolivarian Republic of': 'Venezuela',
    'Viet Nam': 'Vietnam',
    'Western Sahara': 'Western Sahara',
    'Yemen Arab Republic': 'Yemen',
    "Yemen People's Republic": 'Yemen',
    'Yugoslavia, Federal Republic of': 'Yugoslavia',
    'Yugoslavia, Socialist Federal Republic of': 'Yugoslavia',
    'Swaziland': 'Eswatini',
    'Gambia': 'Gambia, The',
    'German Democratic Republic': 'Germany East',
    'German East': 'Germany East',
    'Ivory Coast': "Cote D'Ivoire",
    'UAE': 'United Arab Emirates',
    'Gambia, The': 'Gambia'
}

df4['country'] = df4['country'].replace(country_mapping)
pts['country'] = pts['country'].replace(country_mapping)

df5 = pd.merge(df4, pts, on=['country', 'year'], how='outer')

def check_countries(column, min_length=5):
    similar_pairs = []
    for i in range(len(column)):
        for j in range(i + 1, len(column)):
            if column[i] != column[j]:  # Ensure that identical countries are excluded
                seq = difflib.SequenceMatcher(None, column[i], column[j])
                match = seq.find_longest_match(0, len(column[i]), 0, len(column[j]))
                if match.size >= min_length:
                    similar_pairs.append((column[i], column[j], column[i][match.a: match.a + match.size]))
    return similar_pairs

check_countries(df5['country'].unique())

[('Algeria', 'Nigeria', 'geria'),
 ('Australia', 'Austria', 'Austr'),
 ('Austria', 'Transnistria', 'stria'),
 ('Bosnia and Herzegovina', 'Trinidad and Tobago', ' and '),
 ('Bosnia and Herzegovina', 'Saint Vincent and the Grenadines', ' and '),
 ('Bosnia and Herzegovina', 'Sao Tome and Principe', ' and '),
 ('Bosnia and Herzegovina', 'Antigua and Barbuda', 'a and '),
 ('Bosnia and Herzegovina', 'Serbia and Montenegro', 'ia and '),
 ('Bosnia and Herzegovina', 'St. Kitts and Nevis', ' and '),
 ('Bosnia and Herzegovina', 'Saint Kitts and Nevis', ' and '),
 ('Cape Verde', 'Cabo Verde', ' Verde'),
 ('Central African Republic', 'Czech Republic', ' Republic'),
 ('Central African Republic', 'Dominican Republic', 'ican Republic'),
 ('Central African Republic', 'Slovak Republic', ' Republic'),
 ('Central African Republic', 'South Africa', ' Africa'),
 ('Central African Republic', 'German Federal Republic', ' Republic'),
 ('Congo Kinshasa', 'Congo Brazzaville', 'Congo '),
 ('Congo Kinshasa', 'Cong

In [46]:
df5[df5.select_dtypes(exclude=['number']).columns.tolist()].head(1)

Unnamed: 0,country,world_bank_region,region,c/t?,status,country_old,cow_code_a,wordbank_code_a
0,Albania,Europe & Central Asia,,c,PF,Albania,ALB,ALB


In [47]:
df5 = df5.drop(columns=['cow_code_a', 'country_old', 'wordbank_code_a'])

### + WGI

In [49]:
set(df5.columns).intersection(wgi.columns)

{'country', 'year'}

In [50]:
wgi['year'] = wgi['year'].astype(int)

In [51]:
country_mapping = {
    'Bahamas': 'Bahamas, The',
    "Cote D'Ivoire": "Côte d'Ivoire",
    'Cape Verde': 'Cabo Verde',
    'Congo Kinshasa': 'Congo, Dem. Rep.',
    'Congo Brazzaville': 'Congo, Rep.',
    'Korea South': 'Korea, Rep.',
    'Korea North': 'Korea, Dem. Rep.',
    'Laos': 'Lao PDR',
    'Micronesia': 'Micronesia, Fed. Sts.',
    'Myanmar (Burma)': 'Myanmar',
    'Sao Tome and Principe': 'São Tomé and Principe',
    'Swaziland': 'Eswatini',
    'Ivory Coast': "Côte d'Ivoire",
    'UAE': 'United Arab Emirates',
    'Taiwan': 'Taiwan, China',
    'Russia': 'Russian Federation',
    'Yemen': 'Yemen, Rep.',
    'Venezuela': 'Venezuela, RB',
    'Türkiye': 'Turkey',
    'Egypt': 'Egypt, Arab Rep.',
    'Gambia': 'Gambia, The',
    'Hong Kong': 'Hong Kong SAR, China',
    'Kyrgyzstan': 'Kyrgyz Republic',
    'Syria': 'Syrian Arab Republic',
    'Saint Lucia': 'St. Lucia',
    'Timor Leste': 'Timor-Leste'
}

df5['country'] = df5['country'].replace(country_mapping)
wgi['country'] = wgi['country'].replace(country_mapping)

df6 = pd.merge(df5, wgi, on=['country', 'year'], how='outer')

def check_countries(column, min_length=5):
    similar_pairs = []
    for i in range(len(column)):
        for j in range(i + 1, len(column)):
            if column[i] != column[j]:  # Ensure that identical countries are excluded
                seq = difflib.SequenceMatcher(None, column[i], column[j])
                match = seq.find_longest_match(0, len(column[i]), 0, len(column[j]))
                if match.size >= min_length:
                    similar_pairs.append((column[i], column[j], column[i][match.a: match.a + match.size]))
    return similar_pairs

check_countries(df6['country'].unique())

[('Algeria', 'Nigeria', 'geria'),
 ('Australia', 'Austria', 'Austr'),
 ('Austria', 'Transnistria', 'stria'),
 ('Bahamas, The', 'Gambia, The', ', The'),
 ('Bosnia and Herzegovina', 'Trinidad and Tobago', ' and '),
 ('Bosnia and Herzegovina', 'Saint Vincent and the Grenadines', ' and '),
 ('Bosnia and Herzegovina', 'São Tomé and Principe', ' and '),
 ('Bosnia and Herzegovina', 'Antigua and Barbuda', 'a and '),
 ('Bosnia and Herzegovina', 'Serbia and Montenegro', 'ia and '),
 ('Bosnia and Herzegovina', 'St. Kitts and Nevis', ' and '),
 ('Bosnia and Herzegovina', 'Saint Kitts and Nevis', ' and '),
 ('Bosnia and Herzegovina', 'St. Vincent and the Grenadines', ' and '),
 ('Bosnia and Herzegovina', 'West Bank and Gaza', ' and '),
 ('Central African Republic', 'Czech Republic', ' Republic'),
 ('Central African Republic', 'Dominican Republic', 'ican Republic'),
 ('Central African Republic', 'Kyrgyz Republic', ' Republic'),
 ('Central African Republic', 'Slovak Republic', ' Republic'),
 ('Centra

In [52]:
df6[df6.select_dtypes(exclude=['number']).columns.tolist()].head(1)

Unnamed: 0,country,world_bank_region,region,c/t?,status,code
0,Albania,Europe & Central Asia,,c,PF,ALB


In [53]:
df6 = df6.drop(columns='code')

### + WB

In [54]:
set(df6.columns).intersection(wb.columns)

{'country', 'year'}

In [55]:
country_groups = [
    '(UN) Africa', '(UN) Asia', '(UN) Europe', '(UN) Latin America and the Caribbean', 
    '(UN) North America', '(SDG) Central/Southern Asia', '(SDG) Eastern/South Eastern Asia', 
    '(SDG) Europe', '(SDG) Latin America & the Caribbean', '(SDG) Northern America', 
    '(SDG) Sub-Saharan Africa', '(SDG) Western Asia/Northern Africa',
    'Africa Eastern and Southern', 'Africa Western and Central',
    'Association of Southeast Asian Nations (ASEAN-5)',
    'Central Europe and the Baltics', 'Caribbean small states',
    'East Asia & Pacific (excluding high income)', 'East Asia & Pacific',
    'Europe & Central Asia (excluding high income)', 'Europe & Central Asia',
    'Emerging and Developing Europe', 'Euro area',
    'Europe & Central Asia (IDA & IBRD countries)',
    'Latin America & Caribbean (excluding high income)', 'Latin America & Caribbean',
    'Least developed countries: UN classification', 'Middle East & North Africa',
    'Middle East & Central Asia', 'Pacific island small states',
    'South Asia (IDA & IBRD)', 'Small states',
    'Sub-Saharan Africa (excluding high income)',
    'Sub-Saharan Africa (IDA & IBRD countries)',
    'Upper middle income', 'IBRD only', 'IDA & IBRD total', 'IDA total', 'IDA blend',
    'OECD members', 'OECD Member Countries',
    'Other Advanced Economies', 'Other small states',
    'Heavily indebted poor countries (HIPC)', 'High income', 'Low income', 'Lower middle income',
    'Low & middle income', 'Post-demographic dividend', 'Pre-demographic dividend',
    'Fragile and conflict affected situations', 'East Asia & Pacific (IDA & IBRD countries)',
    'Latin America & the Caribbean (IDA & IBRD countries)',
    'Middle East & North Africa (IDA & IBRD countries)', '(WHO) Africa Region',
    '(WHO) America Region', '(WHO) South-East Asia Region',
    '(WHO) Eastern Mediterranean Region', '(WHO) European Region',
    '(WHO) Western Pacific Region', 'Unknown or unspecified', 'Arab World',
    'Developed Asia', 'Early-demographic dividend', 'FAO (Food and Agriculture Organization)',
    'IDA only', 'Late-demographic dividend', 'Middle income',
    'Middle East & North Africa (excluding high income)',
    'World Bank Group', 'Western Europe', 'World',
    'European Union', 'Euro Area', 'Sub-Saharan Africa', 'North America',
    'South Asia']

wb = wb[~wb['country'].isin(country_groups)].reset_index(drop=True)

In [56]:
country_mapping = {
    'Bahamas, The': 'Bahamas',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Cabo Verde': 'Cape Verde',
    'Congo, Dem. Rep.': "Congo, Dem. People's Rep.",
    'Congo, Rep.': 'Congo, Rep.',
    'Korea, Rep.': 'Korea, South',
    'Korea, Dem. Rep.': "Korea, Dem. People's Rep.",
    'Lao PDR': 'Laos',
    'Micronesia, Fed. Sts.': 'Micronesia',
    'Myanmar': 'Burma',
    'São Tomé and Principe': 'Sao Tome and Principe',
    'Eswatini': 'Swaziland',
    'Taiwan, China': 'Taiwan',
    'Venezuela, RB': 'Venezuela',
    'Turkiye': 'Turkey',
    'Vietnam': 'Viet Nam',
    'Yemen, Rep.': 'Yemen',
    'Macao SAR, China': 'Macau',
    'West Bank and Gaza': 'Palestine',
    'Virgin Islands (U.S.)': 'United States Virgin Islands',
    'Saint Vincent and the Grenadines': 'St. Vincent and the Grenadines',
    'Netherlands Antilles (former)': 'Netherlands Antilles',
    'Réunion': 'Reunion',
    'Palestinian Authority Administered Territories': 'Palestine',
    'Palestinian Authority-Administered Territories': 'Palestine'
}

df6['country'] = df6['country'].replace(country_mapping)
wb['country'] = wb['country'].replace(country_mapping)

df7 = pd.merge(df6, wb, on=['country', 'year'], how='outer')

def check_countries(column, min_length=5):
    similar_pairs = []
    for i in range(len(column)):
        for j in range(i + 1, len(column)):
            if column[i] != column[j]:  # Ensure that identical countries are excluded
                seq = difflib.SequenceMatcher(None, column[i], column[j])
                match = seq.find_longest_match(0, len(column[i]), 0, len(column[j]))
                if match.size >= min_length:
                    similar_pairs.append((column[i], column[j], column[i][match.a: match.a + match.size]))
    return similar_pairs

check_countries(df7['country'].unique())

[('Algeria', 'Nigeria', 'geria'),
 ('Australia', 'Austria', 'Austr'),
 ('Austria', 'Transnistria', 'stria'),
 ('Bosnia and Herzegovina', 'Trinidad and Tobago', ' and '),
 ('Bosnia and Herzegovina', 'St. Vincent and the Grenadines', ' and '),
 ('Bosnia and Herzegovina', 'Sao Tome and Principe', ' and '),
 ('Bosnia and Herzegovina', 'Antigua and Barbuda', 'a and '),
 ('Bosnia and Herzegovina', 'Serbia and Montenegro', 'ia and '),
 ('Bosnia and Herzegovina', 'St. Kitts and Nevis', ' and '),
 ('Bosnia and Herzegovina', 'Saint Kitts and Nevis', ' and '),
 ('Bosnia and Herzegovina', 'Turks and Caicos Islands', ' and '),
 ('Central African Republic', 'Czech Republic', ' Republic'),
 ('Central African Republic', 'Dominican Republic', 'ican Republic'),
 ('Central African Republic', 'Kyrgyz Republic', ' Republic'),
 ('Central African Republic', 'Slovak Republic', ' Republic'),
 ('Central African Republic', 'South Africa', ' Africa'),
 ('Central African Republic', 'Syrian Arab Republic', ' Republ

In [57]:
df7[df7.select_dtypes(exclude=['number']).columns.tolist()].head(1)

Unnamed: 0,country,world_bank_region,region,c/t?,status,country_code
0,Albania,Europe & Central Asia,,c,PF,ALB


In [58]:
df7 = df7.drop(columns='country_code')

In [59]:
merged = df7.copy()
merged.to_pickle("smerged.dat")

### Target

In [None]:
df[['country', 'year', 'NY.GDP.PCAP.CD', 'NY.GDP.PCAP.CD_fill', 'GDP5', 'GDP5_2cat']]

Unnamed: 0,country,year,NY.GDP.PCAP.CD,NY.GDP.PCAP.CD_fill,GDP5,GDP5_2cat
21,Afghanistan,1960,60.950364,1962.0,15.0,1
22,Afghanistan,1961,60.950364,1962.0,27.0,1
23,Afghanistan,1962,60.950364,1962.0,35.0,1
24,Afghanistan,1963,82.021738,1963.0,13.0,1
25,Afghanistan,1964,85.511073,1964.0,11.0,1
...,...,...,...,...,...,...
15808,Zimbabwe,2013,1408.367810,2013.0,12.0,1
15809,Zimbabwe,2014,1407.034291,2014.0,0.0,0
15810,Zimbabwe,2015,1410.329173,2015.0,-1.0,0
15811,Zimbabwe,2016,1421.787791,2016.0,5.0,1


### Train and test set

In [None]:
from sklearn.model_selection import train_test_split

dfst = pd.read_pickle("saved/ready.dat")
past_observation_count = 5
target_variable = 'GDP5_cat'
columns_to_exclude = ['GDP5_2cat', 'GDP5_cat', 'year_str', 'country', 'GDP5', 'region', 'c/t?', 'status']

target_array = []
explanatory_array = []

grouped = dfst.groupby('country')

for _, group_df in grouped:
    target_values = group_df[target_variable].values
    explanatory_values = group_df.drop(columns=columns_to_exclude).values
    for i in range(past_observation_count, len(target_values)):
        target_array.append(target_values[i])
        explanatory_array.append(explanatory_values[i - past_observation_count:i])

X = np.array(explanatory_array, dtype=np.float32)
y = np.array(target_array, dtype=np.int32)

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1, random_state=36)

def relu(x):
    return np.clip(x,0,np.inf)
def softmax(x):
    return [np.exp(x[i])/np.sum(np.exp(x)) for i in range(len(x))]

### Run model
Is it correct that I have 10-90 validation split and 5-95 train-test split

In [None]:
from keras.layers import LSTM
from tensorflow.keras import datasets, layers, models
from tensorflow import keras

model = keras.Sequential([
    keras.layers.LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True),
    keras.layers.LSTM(50, return_sequences=False),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(17, activation='softmax')
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=300, batch_size=100, validation_split=0.1, callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)])
test_loss, test_acc = model.evaluate(X_test, y_test)
print('\nTest accuracy:', test_acc)

2025-02-09 18:03:38.552487: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-09 18:03:38.918876: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-09 18:03:39.439476: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/300


2025-02-09 18:04:01.210472: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 8913509640 exceeds 10% of free system memory.


[1m5054/5054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 6ms/step - accuracy: 0.6992 - loss: 0.7166 - val_accuracy: 0.8665 - val_loss: 0.3221
Epoch 2/300
[1m5054/5054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 6ms/step - accuracy: 0.8850 - loss: 0.2845 - val_accuracy: 0.9122 - val_loss: 0.2136
Epoch 3/300
[1m5054/5054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 6ms/step - accuracy: 0.9190 - loss: 0.2046 - val_accuracy: 0.9353 - val_loss: 0.1735
Epoch 4/300
[1m5054/5054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 6ms/step - accuracy: 0.9363 - loss: 0.1668 - val_accuracy: 0.9410 - val_loss: 0.1531
Epoch 5/300
[1m5054/5054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 6ms/step - accuracy: 0.9445 - loss: 0.1478 - val_accuracy: 0.9492 - val_loss: 0.1319
Epoch 6/300
[1m5054/5054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 6ms/step - accuracy: 0.9498 - loss: 0.1339 - val_accuracy: 0.9525 - val_loss: 0.1288
Epoch 7/300
[1m