# Merge Migration Data with Hofstede Cultural Dimensions

In [1]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher

In [2]:
# Load datasets
migration_df = pd.read_csv('src/output/migration_data_processed.csv')
hofstede_df = pd.read_csv('src/raw_data/hofstede_country_scores.csv')

print(f"Migration data: {migration_df.shape[0]} countries")
print(f"Hofstede data: {hofstede_df.shape[0]} countries")

Migration data: 232 countries
Hofstede data: 119 countries


In [4]:
# Country name similarity function
def similarity(a, b):
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

# Create country name mapping
def find_best_match(migration_country, hofstede_countries, threshold=0.85):  # Increased threshold
    best_match = None
    best_score = 0
    
    for hof_country in hofstede_countries:
        score = similarity(migration_country, hof_country)
        if score > best_score and score >= threshold:
            best_score = score
            best_match = hof_country
    
    return best_match, best_score

# Analyze country name matches
migration_countries = migration_df['country'].tolist()
hofstede_countries = hofstede_df['country'].tolist()

# Perform country matching
matches = []
unmatched_migration = []
unmatched_hofstede = hofstede_countries.copy()

for mig_country in migration_countries:
    best_match, score = find_best_match(mig_country, hofstede_countries)
    
    if best_match:
        matches.append({
            'migration_name': mig_country,
            'hofstede_name': best_match,
            'similarity_score': score
        })
        if best_match in unmatched_hofstede:
            unmatched_hofstede.remove(best_match)
    else:
        unmatched_migration.append(mig_country)

print(f"Automatic matches: {len(matches)}")
print(f"Manual mapping required: {len(unmatched_migration)}")

Automatic matches: 106
Manual mapping required: 126


In [None]:
# Manual country mapping for unmatched countries
manual_mappings = {
    "Iran (Islamic Republic of)": "Iran",
    "United States of America*": "United states",
    "United Kingdom*": "United kingdom", 
    "Republic of Korea": "South korea",
    "Bosnia and Herzegovina": "Bosnia and herzegovina",
    "Czechia": "Czech republic",
    "Dominican Republic": "Dominican republic",
    "New Zealand*": "New zealand",
    "North Macedonia": "North macedonia",
    "Russian Federation": "Russia",
    "Republic of Moldova*": "Moldova",
    "São Tomé and Príncipe": "São tomé and príncipe",
    "Saudi Arabia": "Saudi arabia",
    "Sierra Leone": "Sierra leone",
    "Slovakia*": "Slovakia",
    "South Africa": "South africa",
    "Sri Lanka": "Sri lanka",
    "Trinidad and Tobago": "Trinidad and tobago",
    "United Arab Emirates": "United arab emirates",
    "Viet Nam": "Vietnam",
    "China, Taiwan Province of China": "Taiwan",
    "United Republic of Tanzania": "Tanzania",
    "Bolivia (Plurinational State of)": "Bolivia",
    "Venezuela (Bolivarian Republic of)": "Venezuela",
    "Syrian Arab Republic": "Syria",
    "Türkiye": "Turkey",
    "China, Hong Kong SAR": "Hong kong",
    
    # Countries without Hofstede data (excluded)
    "Eritrea": None, "Rwanda": None, "Somalia": None, "South Sudan": None, "Uganda": None,
    "Chad": None, "Congo": None, "Gabon": None, "Sudan": None, "Eswatini": None,
    "Guinea": None, "Liberia": None, "Mauritania": None, "Kyrgyzstan": None, "Tajikistan": None,
    "Turkmenistan": None, "Uzbekistan": None, "Afghanistan": None, "Cambodia": None, "Myanmar": None,
    "Bahrain": None, "Cyprus": None, "Oman": None, "Andorra": None, "Monaco": None,
    "Anguilla*": None, "Bahamas": None, "Dominica": None, "Grenada": None,
    "United States Virgin Islands*": None, "Belize": None, "Nicaragua": None, "Guyana": None,
    "Greenland*": None, "Australia/New Zealand": None, "New Caledonia*": None, "Palau": None,
    "Tonga": None, "Dem. People's Republic of Korea": None, "Burundi": None, "Comoros": None,
    "Djibouti": None, "Madagascar": None, "Mauritius": None, "Mayotte": None, "Réunion": None,
    "Seychelles": None, "Zimbabwe": None,
}

# Create final mapping
country_mapping = {match['migration_name']: match['hofstede_name'] for match in matches}

# Apply manual mappings
for mig_country, hof_country in manual_mappings.items():
    if hof_country is not None:
        if mig_country in unmatched_migration:
            unmatched_migration.remove(mig_country)
        
        existing_match = next((m for m in matches if m['migration_name'] == mig_country), None)
        if existing_match:
            existing_match['hofstede_name'] = hof_country
            existing_match['similarity_score'] = 1.0
        else:
            matches.append({
                'migration_name': mig_country,
                'hofstede_name': hof_country,
                'similarity_score': 1.0
            })

country_mapping = {match['migration_name']: match['hofstede_name'] for match in matches}
excluded_countries = [c for c in migration_countries if c not in country_mapping]

print(f"Countries matched: {len(country_mapping)}")
print(f"Countries excluded: {len(excluded_countries)}")

Countries matched: 121
Countries excluded: 113


In [None]:
# Finalize country mapping
country_mapping = {match['migration_name']: match['hofstede_name'] for match in matches}
excluded_countries = [c for c in migration_countries if c not in country_mapping]

print(f"Countries to include: {len(country_mapping)}")
print(f"Countries excluded: {len(excluded_countries)}")

Total countries to be included: 121
Countries excluded (no Hofstede data): 113


In [None]:
# Merge datasets
filtered_migration = migration_df[migration_df['country'].isin(country_mapping.keys())].copy()
reverse_mapping = {hof_name: mig_name for mig_name, hof_name in country_mapping.items()}

hofstede_with_migration_names = hofstede_df.copy()
hofstede_with_migration_names['migration_country'] = hofstede_with_migration_names['country'].map(reverse_mapping)
hofstede_filtered = hofstede_with_migration_names[hofstede_with_migration_names['migration_country'].notna()].copy()

merged_df = pd.merge(
    filtered_migration,
    hofstede_filtered[['migration_country', 'pdi', 'idv', 'mas', 'uai', 'lto', 'ivr']],
    left_on='country',
    right_on='migration_country',
    how='inner'
)

merged_df = merged_df.drop('migration_country', axis=1)

# Reorder columns
base_columns = ['country', 'continent', 'region']
hofstede_columns = ['pdi', 'idv', 'mas', 'uai', 'lto', 'ivr']
year_columns = [col for col in merged_df.columns if col not in base_columns + hofstede_columns]

final_columns = base_columns + hofstede_columns + year_columns
masterdata_df = merged_df[final_columns]

print(f"Merged dataset: {masterdata_df.shape[0]} countries × {masterdata_df.shape[1]} columns")

Merged dataset: 116 countries × 33 columns


In [None]:
# Export masterdata
import os

output_dir = "src/output"
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, "masterdata.csv")
masterdata_df.to_csv(output_path, index=False, quoting=1)

print(f"Exported masterdata: {masterdata_df.shape[0]} countries × {masterdata_df.shape[1]} columns")

Shape: 116 countries × 33 columns


In [None]:
# Validate key countries
key_countries = ["Iran", "United States of America", "Germany", "China", "Australia"]
found = sum(1 for country in key_countries if country in masterdata_df['country'].values)

print(f"Validation: {found}/{len(key_countries)} key countries found")
print(f"Final dataset: {len(masterdata_df)} countries")

Validation complete:
❌ Iran
❌ United States of America
✅ Germany
✅ China
❌ Australia

Final dataset: 116 countries


In [None]:
import re

def clean_country_name(name):
    """Clean country names by removing asterisks and content in parentheses"""
    name = name.replace('*', '')
    name = re.sub(r'\s*\([^)]*\)', '', name)
    name = name.strip()
    return name

# Apply country name cleaning
masterdata_df['country'] = masterdata_df['country'].apply(clean_country_name)

# Convert numerical columns to proper types
numerical_columns = ['pdi', 'idv', 'mas', 'uai'] + [col for col in masterdata_df.columns if col.startswith(('199', '200', '201', '202'))]

for col in numerical_columns:
    if col in masterdata_df.columns:
        masterdata_df[col] = pd.to_numeric(masterdata_df[col], errors='coerce')
        if col in ['pdi', 'idv', 'mas', 'uai']:
            masterdata_df[col] = masterdata_df[col].astype('Int64')
        else:
            masterdata_df[col] = masterdata_df[col].fillna(0).astype(int)

# Handle lto and ivr columns (preserve NaN values)
for col in ['lto', 'ivr']:
    if col in masterdata_df.columns:
        masterdata_df[col] = pd.to_numeric(masterdata_df[col], errors='coerce')

print(f"Country names cleaned. Shape: {masterdata_df.shape}")

FIXING COUNTRY NAMES AND CSV FORMAT
Countries being cleaned:
  Iran (Islamic Republic of) -> Iran
  Republic of Moldova* -> Republic of Moldova
  Ukraine* -> Ukraine
  Denmark* -> Denmark
  Finland* -> Finland
  Norway* -> Norway
  United Kingdom* -> United Kingdom
  Serbia* -> Serbia
  Spain* -> Spain
  France* -> France
  Netherlands* -> Netherlands
  Puerto Rico* -> Puerto Rico
  Bolivia (Plurinational State of) -> Bolivia
  Venezuela (Bolivarian Republic of) -> Venezuela
  United States of America* -> United States of America
  Australia* -> Australia
  New Zealand* -> New Zealand

Countries with commas: 2
  - China, Hong Kong SAR
  - China, Taiwan Province of China

Saved with minimal quoting (commas present)

Final masterdata:
Shape: (116, 33)

Sample of cleaned data:
                       country continent  pdi  idv     1990    2024
0                     Ethiopia    AFRICA   70    7   875325  400579
1                        Kenya    AFRICA   70    4   298089  563977
2          

In [None]:
# Apply specific country name fixes
specific_mappings = {
    "China, Hong Kong SAR": "Hong Kong",
    "China, Taiwan Province of China": "Taiwan",
    "United Republic of Tanzania": "Tanzania"
}

current_names = masterdata_df['country'].tolist()
final_names = []

for name in current_names:
    if name in specific_mappings:
        final_names.append(specific_mappings[name])
    else:
        final_names.append(name)

masterdata_df['country'] = final_names

# Save the final clean version
masterdata_df.to_csv('src/output/masterdata.csv', index=False, quoting=0)
print(f"Final masterdata saved. Shape: {masterdata_df.shape}")


Applying specific country name fixes:
  United Republic of Tanzania -> Tanzania
  China, Hong Kong SAR -> Hong Kong
  China, Taiwan Province of China -> Taiwan

Countries with commas after cleaning: 0

Final masterdata saved without quotes!
Shape: (116, 33)

Final sample:
      country continent  pdi  idv     1990    2024
0    Ethiopia    AFRICA   70    7   875325  400579
1       Kenya    AFRICA   70    4   298089  563977
2      Malawi    AFRICA   70   30  1127724  103639
3  Mozambique    AFRICA   85   15   122332  166284
4    Tanzania    AFRICA   70   25   574025  193950


In [None]:
# Fix string formatting - convert ALL CAPS to proper title case
def fix_string_formatting(text):
    """Convert string to proper title case."""
    if pd.isna(text) or not isinstance(text, str):
        return text
    
    fixed = text.title()
    
    special_words = {
        ' And ': ' and ',
        ' The ': ' the ',
        ' Of ': ' of ',
        ' In ': ' in ',
        ' To ': ' to ',
        ' For ': ' for ',
        ' With ': ' with ',
        ' At ': ' at ',
        ' By ': ' by ',
        ' From ': ' from ',
        ' Up ': ' up ',
        ' On ': ' on ',
        ' As ': ' as '
    }
    
    for old, new in special_words.items():
        fixed = fixed.replace(old, new)
    
    return fixed

# Apply formatting to continent and region columns
masterdata_df['continent'] = masterdata_df['continent'].apply(fix_string_formatting)
masterdata_df['region'] = masterdata_df['region'].apply(fix_string_formatting)

# Save the corrected version
masterdata_df.to_csv('src/output/masterdata.csv', index=False, quoting=0)
print(f"String formatting applied. Final dataset: {masterdata_df.shape[0]} countries × {masterdata_df.shape[1]} columns")

FIXING STRING FORMATTING
Original formatting examples:
Continents: ['Africa' 'Asia' 'Europe']
Regions: ['Eastern Africa' 'Middle Africa' 'Northern Africa']

Fixing continent names...

Fixing region names...

After formatting:
Continents: ['Africa', 'Asia', 'Europe', 'Latin America and the Caribbean', 'Northern America', 'Oceania']
Sample regions: ['Caribbean', 'Central America', 'Central Asia', 'Eastern Africa', 'Eastern Asia']

✓ String formatting corrected and saved!

Sample of corrected data:
      country continent          region
0    Ethiopia    Africa  Eastern Africa
1       Kenya    Africa  Eastern Africa
2      Malawi    Africa  Eastern Africa
3  Mozambique    Africa  Eastern Africa
4    Tanzania    Africa  Eastern Africa
