# In [4]:
# clean_happiness_data.py
import pandas as pd
import numpy as np
from pathlib import Path

def load_and_clean(file_path):
    """Load one year's CSV and standardize its column names.

    Known column headers from the different yearly layouts are renamed to a
    common snake_case vocabulary (``country``, ``region``, ``rank``,
    ``happiness_score``, ``gdp``, ``social_support``, ``life_expectancy``,
    ``freedom``, ``corruption``, ``generosity``, ``dystopia_residual``).
    Columns that are not in the mapping are left untouched. A ``year`` column
    is added, taken from the file name.

    Args:
        file_path: Path (``str`` or ``Path``) of the CSV file. The file name
            must contain the year, either as its first four characters
            (``2015.csv``) or as a standalone 4-digit number elsewhere in the
            name (``world_happiness_2019.csv``).

    Returns:
        A ``DataFrame`` with standardized column names plus ``year``.

    Raises:
        ValueError: If no year can be extracted from the file name.
    """
    # Local import keeps this function self-contained; ``re`` is only needed
    # for the fallback year lookup below.
    import re

    df = pd.read_csv(file_path)

    # Column renaming (handles all year formats)
    rename_dict = {
        'Country': 'country',
        'Country or region': 'country',
        'Region': 'region',
        'Happiness Rank': 'rank',
        'Happiness.Rank': 'rank',
        'Overall rank': 'rank',
        'Happiness Score': 'happiness_score',
        'Happiness.Score': 'happiness_score',
        'Score': 'happiness_score',
        'Economy (GDP per Capita)': 'gdp',
        'Economy..GDP.per.Capita.': 'gdp',
        'GDP per capita': 'gdp',
        'Family': 'social_support',
        'Social support': 'social_support',
        'Health (Life Expectancy)': 'life_expectancy',
        'Health..Life.Expectancy.': 'life_expectancy',
        'Healthy life expectancy': 'life_expectancy',
        'Freedom': 'freedom',
        'Freedom to make life choices': 'freedom',
        'Trust (Government Corruption)': 'corruption',
        'Trust..Government.Corruption.': 'corruption',
        'Perceptions of corruption': 'corruption',
        'Generosity': 'generosity',
        'Dystopia Residual': 'dystopia_residual',
        'Dystopia.Residual': 'dystopia_residual',
    }
    # rename() silently ignores keys that are not present in the frame, so no
    # pre-filtering of the mapping is needed.
    df = df.rename(columns=rename_dict)

    # Add year from filename. The leading-four-characters rule is tried first
    # so every name that worked before still yields the same year; otherwise
    # look for a standalone 19xx/20xx number anywhere in the stem.
    stem = Path(file_path).stem
    try:
        year = int(stem[:4])
    except ValueError:
        found = re.search(r"(?<!\d)(?:19|20)\d{2}(?!\d)", stem)
        if found is None:
            raise ValueError(
                f"Cannot determine year from file name: {str(file_path)!r}"
            ) from None
        year = int(found.group())
    df['year'] = year

    return df

def main(input_dir="raw_data", output_dir="cleaned_data"):
    """Clean every yearly CSV in ``input_dir`` and write one combined CSV.

    Pipeline: load and standardize each file with ``load_and_clean``,
    normalize country names, fill in missing regions from the files that do
    carry a ``region`` column, subtract the dystopia residual where that
    column exists, then write the selected columns to
    ``<output_dir>/happiness_cleaned.csv``.

    Args:
        input_dir: Directory containing the raw ``*.csv`` files.
        output_dir: Directory for the combined file; created if missing.

    Raises:
        FileNotFoundError: If ``input_dir`` contains no ``*.csv`` files.
    """
    # Load all files. glob() yields entries in arbitrary, filesystem-dependent
    # order; sorting makes the run deterministic and, for year-prefixed file
    # names, puts the earliest year first.
    files = sorted(Path(input_dir).glob("*.csv"))
    if not files:
        raise FileNotFoundError(f"No CSV files found in {str(input_dir)!r}")
    dfs = [load_and_clean(f) for f in files]

    # Standardize country names BEFORE building the region lookup, so that a
    # country spelled differently across years still resolves to one region.
    country_fixes = {
        'Congo (Brazzaville)': 'Republic of the Congo',
        'Congo (Kinshasa)': 'Democratic Republic of the Congo',
        'Palestinian Territories': 'Palestine',
        'Taiwan Province of China': 'Taiwan'
    }
    for df in dfs:
        df['country'] = df['country'].replace(country_fixes)

    # Backfill regions. The lookup is built from every file that has a region
    # column (earliest file wins on conflicts) rather than blindly from
    # dfs[0]. Series.map needs a unique index, hence drop_duplicates.
    region_sources = [df[['country', 'region']] for df in dfs
                      if 'region' in df.columns]
    if region_sources:
        region_lookup = (
            pd.concat(region_sources)
            .dropna(subset=['region'])
            .drop_duplicates(subset='country')
            .set_index('country')['region']
        )
        for df in dfs:
            mapped = df['country'].map(region_lookup)
            if 'region' in df.columns:
                # Keep regions the file already provides; only fill the gaps.
                df['region'] = df['region'].fillna(mapped)
            else:
                df['region'] = mapped

    # Handle dystopia residual (pre-2018)
    # NOTE(review): files without a dystopia_residual column keep their score
    # unchanged, so scores may not be comparable across years after this
    # step - confirm this subtraction is intended.
    for df in dfs:
        if 'dystopia_residual' in df.columns:
            df['happiness_score'] = df['happiness_score'] - df['dystopia_residual']

    # Combine and save
    final_cols = ['country', 'region', 'year', 'happiness_score',
                  'gdp', 'social_support', 'life_expectancy',
                  'freedom', 'corruption', 'generosity']
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)  # to_csv does not create directories
    combined = pd.concat(dfs, ignore_index=True)[final_cols]
    combined.to_csv(f"{output_dir}/happiness_cleaned.csv", index=False)

# Script entry point: run the pipeline with the default "raw_data" input and
# "cleaned_data" output directories when this file is executed directly.
if __name__ == "__main__":
    main()