In [1]:
import os
import pandas as pd
import unicodedata

folder_path = "/Users/kim/Desktop/repos/Philippines_Visitor/cleanedData"
set_countries = set()

def normalize_country(name):
    # Convert to string, strip whitespace, and normalize Unicode (for invisible chars)
    name = str(name).strip()
    name = unicodedata.normalize("NFKC", name)
    # Replace multiple spaces with one
    name = " ".join(name.split())
    # Convert to title case (so DENMARK, Denmark, denmark → Denmark)
    name = name.title()
    return name

for file_name in os.listdir(folder_path):
    if file_name.endswith(".csv"):
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path)
        df.columns = df.columns.str.strip()
        if 'Country' not in df.columns:
            continue

        countries = df['Country'].dropna().apply(normalize_country)
        set_countries.update(countries)

unique_countries = sorted(set_countries)
print(f"Found {len(unique_countries)} unique countries:")
print(unique_countries)
# Create DataFrame
unique_df = pd.DataFrame(unique_countries, columns=['Country'])

# Save as CSV
output_path = os.path.join(folder_path, "unique_countries.csv")


Found 283 unique countries:
['* Overseas Filipinos', '*Overseas Filipinos', 'Afghanistan', 'Aland Islands', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla', 'Antarctica', 'Antigua And Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia', 'Bonaire, Sint Eustatius And Saba', 'Bosnia', 'Bosnia And Herzegovina', 'Botswana', 'Brazil', 'British Indian Ocean Territory', 'British Virgin Islands', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Cayman Islands', 'Central African Republic', 'Chad', 'Chile', 'China', 'Christmas Island', 'Cocos (Keeling) Islands', 'Colombia', 'Comoros', 'Congo, Democratic Republic Of', 'Congo, Republic Of (Brazzaville)', 'Congo, Republic Of (Cap. Brazzaville)', 'Cook Islands', 'Costa Rica', "Cote D'Ivoire (Ivory Coast)", 'Croatia', 'Cuba', 'Curaca

In [2]:
non_countries = [
    "* Overseas Filipinos",
    "*Overseas Filipinos",
    "Foreign Tourists",
    "Grand Total",
    "Independent States",
    "Oman****",
    "Overseas Filipino*",
    "Overseas Filipinos*",
    "Overseas Filipinos***",
    "Residences",
    "T O T A L",
    "Total (Cis & Russia)",
    "Total Foreign",
    "Total Foreign Tourist",
    "Total Overseas Filipinos *"
]

# Drop rows where Country matches any of the above
unique_df = unique_df[~unique_df['Country'].isin(non_countries)].reset_index(drop=True)

In [3]:
country_replacements = {
    # Congo variants
    "Congo, Democratic Republic Of": "Democratic Republic of the Congo",
    "Congo, Republic Of (Brazzaville)": "Republic of the Congo",
    "Congo, Republic Of (Cap. Brazzaville)": "Republic of the Congo",

    # Eswatini / Swaziland
    'Eswatini (Fmr. "Swaziland")': "Eswatini",

    # Ivory Coast
    "Ivory Coast": "Côte d'Ivoire",
    "Cote D'Ivoire (Ivory Coast)": "Côte d'Ivoire",

    # Russia
    "Russian Federation": "Russia",

    # Macedonia
    "Macedonia (The Former Yugoslav Republic Of)": "North Macedonia",

    # Macau
    "Macau - Sar": "Macau",

    # Hong Kong
    "Hong Kong - Sar": "Hong Kong",
    "Hongkong": "Hong Kong",

    # US Virgin Islands
    "Us Virgin Islands": "United States Virgin Islands",
    "Virgin Islands (U.S.)": "United States Virgin Islands",

    # USA
    "Usa": "United States of America",

    # Saint Barthélemy typo
    "Saint Barthã©Lemy": "Saint Barthélemy",

    '"Bonaire, Sint Eustatius And Saba"': 'Bonaire, Sint Eustatius And Saba'

}

unique_df['Country'] = unique_df['Country'].replace(country_replacements)

In [4]:
unique_df.dtypes

Country    object
dtype: object

In [5]:
unique_countries = sorted(unique_df['Country'].unique())
unique_countries

['Afghanistan',
 'Aland Islands',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua And Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bonaire, Sint Eustatius And Saba',
 'Bosnia',
 'Bosnia And Herzegovina',
 'Botswana',
 'Brazil',
 'British Indian Ocean Territory',
 'British Virgin Islands',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Christmas Island',
 'Cocos (Keeling) Islands',
 'Colombia',
 'Comoros',
 'Cook Islands',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Curacao',
 'Cyprus',
 'Czech Republic',
 "Côte d'Ivoire",
 'Democratic Republic of the Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',

In [None]:
unique_countries_df = pd.DataFrame(unique_countries, columns=['Country'])
unique_countries_df.to_csv('unique_countries.csv', index=False)