In [1]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler

In [2]:
# Load masterdata
df = pd.read_csv('src/output/masterdata.csv')

# Cultural dimension columns to impute
cultural_dims = ['pdi', 'idv', 'mas', 'uai', 'lto', 'ivr']

print(f"Dataset shape: {df.shape}")
missing_counts = df[cultural_dims].isnull().sum()
print(f"Missing values:\n{missing_counts[missing_counts > 0]}")

Dataset shape: (116, 33)
Missing values:
lto    10
ivr    20
dtype: int64


In [4]:
# Step 1: Regional mean imputation for initial filling
def regional_mean_imputation(data, group_cols, target_cols):
    result = data.copy()
    
    for col in target_cols:
        # Calculate regional means
        regional_means = result.groupby(group_cols)[col].mean()
        
        # Fill missing values with regional means
        for group, mean_val in regional_means.items():
            if pd.notna(mean_val):
                if len(group_cols) == 1:
                    mask = (result[group_cols[0]] == group) & (result[col].isna())
                else:
                    mask = (result[group_cols[0]] == group[0]) & (result[group_cols[1]] == group[1]) & (result[col].isna())
                result.loc[mask, col] = float(mean_val)
    
    return result

# Apply regional mean imputation (continent + region)
df_regional = regional_mean_imputation(df, ['continent', 'region'], cultural_dims)

# If still missing, use continent-level means
df_continental = regional_mean_imputation(df_regional, ['continent'], cultural_dims)

print(f"After regional imputation: {df_continental[cultural_dims].isnull().sum().sum()} missing values")

After regional imputation: 0 missing values


  result.loc[mask, col] = float(mean_val)
  result.loc[mask, col] = float(mean_val)
  result.loc[mask, col] = float(mean_val)
  result.loc[mask, col] = float(mean_val)


In [5]:
# Step 2: Iterative Imputation for refinement
# Prepare features for iterative imputation
year_cols = [col for col in df.columns if col.isdigit() or col.endswith('_male') or col.endswith('_female')]
feature_cols = cultural_dims + year_cols

# Select relevant columns and create feature matrix
imputation_data = df_continental[feature_cols].copy()

# Apply iterative imputer
iterative_imputer = IterativeImputer(
    estimator=None,  # Uses BayesianRidge by default
    max_iter=10,
    random_state=42,
    initial_strategy='mean'
)

# Fit and transform
imputed_values = iterative_imputer.fit_transform(imputation_data)
imputed_df = pd.DataFrame(imputed_values, columns=feature_cols, index=df.index)

# Replace cultural dimensions in original dataframe
df_final = df.copy()
for col in cultural_dims:
    df_final[col] = imputed_df[col]

print(f"After iterative imputation: {df_final[cultural_dims].isnull().sum().sum()} missing values")

After iterative imputation: 0 missing values


In [6]:
# Round cultural dimensions to reasonable precision
for col in cultural_dims:
    if col in ['lto', 'ivr']:
        df_final[col] = df_final[col].round(1)
    else:
        df_final[col] = df_final[col].round(0).astype(int)

# Save updated dataset
df_final.to_csv('src/output/masterdata.csv', index=False, quoting=0)
print(f"Updated masterdata saved with {df_final.shape[0]} countries and {df_final.shape[1]} columns")

Updated masterdata saved with 116 countries and 33 columns


In [None]:
# Verification
print(f"Final dataset: {df_final.shape}")
print(f"Missing values: {df_final[cultural_dims].isnull().sum().sum()}")
print("âœ“ Data imputation completed successfully")