# 01 Data Cleaning




Phase 0:

Phase 1	
	1.	Compute within / between variance of ND-GAIN at annual frequency (total, between, within, share within). Save a small table and histogram of year-to-year changes (diff1) and report per-country within SD.
	2.	Count how many countries and how many country-years show substantive ND-GAIN change; identify outliers (big diffs).
	3.	Compute per-country summary (mean, SD, number of years). Produce a short paragraph interpreting whether FE is likely to be feasible.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the data
df = pd.read_csv('/Users/leosgambato/Documents/GitHub/Capstone/data/processed/baseline_with_gain.csv')

# Clean up: drop rows where 'gain' is missing or iso3c is missing
df = df.dropna(subset=['gain', 'iso3c','sovereign_spread'])

# 1. Compute within / between variance of Gain at annual frequency

# Compute overall mean
overall_mean = df['gain'].mean()

# Compute per-country mean
country_means = df.groupby('iso3c')['gain'].mean()

# Merge country means back to df
df = df.merge(country_means.rename('country_mean'), left_on='iso3c', right_index=True)

# Between variance: variance of country means
between_var = country_means.var(ddof=0)

# Within variance: mean of per-country variances
within_var = df.groupby('iso3c')['gain'].var(ddof=0).mean()

# Total variance
total_var = df['gain'].var(ddof=0)

# Share within
share_within = within_var / total_var if total_var > 0 else np.nan

print("Variance decomposition of ND-GAIN (annual):")
print(f"  Total variance:   {total_var:.4f}")
print(f"  Between variance: {between_var:.4f}")
print(f"  Within variance:  {within_var:.4f}")
print(f"  Share within:     {share_within:.2%}")

# 2. Histogram and table of year-to-year changes (diff1), per-country within SD

# Sort for diff calculation
df = df.sort_values(['iso3c', 'year'])

# Compute year-to-year difference
df['gain_diff1'] = df.groupby('iso3c')['gain'].diff()

# Table of year-to-year changes (drop NA)
diff1_table = df[['iso3c', 'year', 'gain_diff1']].dropna()

# Save small table (first 10 rows as example)
diff1_table.head(10).to_csv('/Users/leosgambato/Documents/GitHub/Capstone/outputs/gain_diff1_sample.csv', index=False)

# Histogram of all year-to-year changes
plt.figure(figsize=(8,5))
plt.hist(diff1_table['gain_diff1'], bins=30, edgecolor='k')
plt.title('Histogram of Year-to-Year Changes in ND-GAIN')
plt.xlabel('Year-to-Year Change (diff1)')
plt.ylabel('Frequency')
plt.tight_layout()
plt.savefig('/Users/leosgambato/Documents/GitHub/Capstone/outputs/gain_diff1_hist.png')
plt.close()

# Per-country within SD
country_within_sd = df.groupby('iso3c')['gain'].std().rename('within_sd')
country_within_sd = country_within_sd.reset_index()
country_within_sd.head(10).to_csv('/Users/leosgambato/Documents/GitHub/Capstone/outputs/gain_within_sd_sample.csv', index=False)

# 2. Count countries and country-years with substantive ND-GAIN change; identify outliers

# Define substantive change threshold (e.g., > 1 SD of all diffs)
diff1_sd = diff1_table['gain_diff1'].std()
substantive_thresh = diff1_sd

# Count country-years with |diff1| > threshold
substantive_changes = diff1_table[np.abs(diff1_table['gain_diff1']) > substantive_thresh]
n_substantive = len(substantive_changes)
n_countries = substantive_changes['iso3c'].nunique()

print(f"\nNumber of country-years with substantive ND-GAIN change (>|{substantive_thresh:.2f}|): {n_substantive}")
print(f"Number of countries with at least one substantive change: {n_countries}")

# Identify outliers (e.g., top 5 biggest absolute diffs)
outliers = diff1_table.reindex(diff1_table['gain_diff1'].abs().sort_values(ascending=False).index).head(5)
print("\nTop 5 outlier year-to-year changes in ND-GAIN:")
print(outliers)

# 3. Per-country summary (mean, SD, number of years)
country_summary = df.groupby('iso3c').agg(
    mean_gain=('gain', 'mean'),
    sd_gain=('gain', 'std'),
    n_years=('gain', 'count')
).reset_index()

country_summary.head(10).to_csv('/Users/leosgambato/Documents/GitHub/Capstone/outputs/gain_country_summary_sample.csv', index=False)

# Short paragraph interpreting FE feasibility
print("\nInterpretation:")
print("The variance decomposition shows that a substantial share of the total variance in ND-GAIN is due to between-country differences, with within-country (over time) variance being relatively smaller. The per-country within SDs are generally low, and only a small number of country-years show substantive changes in ND-GAIN from year to year. This suggests that country fixed effects (FE) models may be feasible, but the limited within-country variation could reduce the power to detect effects of time-varying covariates. Outlier years with large changes should be checked for data quality or exceptional events.")


Variance decomposition of ND-GAIN (annual):
  Total variance:   116.0202
  Between variance: 114.1825
  Within variance:  4.5763
  Share within:     3.94%

Number of country-years with substantive ND-GAIN change (>|0.98|): 178
Number of countries with at least one substantive change: 57

Top 5 outlier year-to-year changes in ND-GAIN:
     iso3c  year  gain_diff1
2228   BGD  2014  -11.062157
2274   NGA  2014  -10.871986
2256   IND  2014   -9.909187
2278   PAK  2014   -8.339038
2270   MEX  2014   -7.692216

Interpretation:
The variance decomposition shows that a substantial share of the total variance in ND-GAIN is due to between-country differences, with within-country (over time) variance being relatively smaller. The per-country within SDs are generally low, and only a small number of country-years show substantive changes in ND-GAIN from year to year. This suggests that country fixed effects (FE) models may be feasible, but the limited within-country variation could reduce the power 

### Data cleaning, Exploratory analysis, creating lags etc. 


In [11]:
# Check for any remaining missing values
print(f"\nMissing values summary:")
missing_summary = df.isnull().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)
if len(missing_summary) > 0:
    print(missing_summary)
else:
    print("No missing values in the dataset")

print("\nDataset ready for analysis!")



Missing values summary:
cpi_yoy                   267
wgi_cc                    144
wgi_ge                    144
wgi_pv                    144
wgi_rl                    144
wgi_rq                    144
wgi_va                    144
gain_diff1                 67
gdp_annual_growth_rate     66
debt_to_gdp                11
dtype: int64

Dataset ready for analysis!


In [12]:
# Required imports
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import pickle
import warnings
warnings.filterwarnings('ignore')

In [13]:
# ensure year is integer
df['year'] = df['year'].astype(int)
df = df.sort_values(['iso3c','year']).reset_index(drop=True)

In [15]:
# Log transforms (avoid log(0) by replacing zeros with tiny positive number)
df['ln_gdp_per_capita'] = np.log(df['gdp_per_capita'].replace(0, np.nan))
df['ln_gross_gdp'] = np.log(df['gross gdp'].replace(0, np.nan))

# If you prefer to drop the raw gross_gdp or gdp_per_capita from covariates, decide later

In [16]:
def create_lags(df, group='iso3c', time='year', variables=None, lags=(1,2)):
    """
    Add lagged versions of `variables` grouped by `group` and sorted by `time`.
    variables: list of column names
    lags: tuple/list of lag integers e.g. (1,2)
    """
    df = df.sort_values([group, time]).copy()
    for var in variables:
        for lag in lags:
            new_col = f"{var}_lag{lag}"
            df[new_col] = df.groupby(group)[var].shift(lag)
    return df

# Variables we recommend lagging (one or two lags)
vars_to_lag = [
    'sovereign_spread',    # outcome lag Y_lag1
    'cpi_yoy',
    'gdp_annual_growth_rate',
    'ln_gdp_per_capita',
    'ln_gross_gdp',
    'debt_to_gdp',
    'deficit_to_gdp',
    'current_account_balance',
    # Add global factors if present (VIX, US10y, Brent)
    # 'VIX', 'US10y', 'Brent'
]

# add lag1 and lag2 (you can restrict to only lag1 by passing lags=(1,))
df = create_lags(df, group='iso3c', time='year', variables=vars_to_lag, lags=(1,2))

In [17]:
# ND-GAIN is 'gain' column
df = df.sort_values(['iso3c','year']).reset_index(drop=True)
df['gain_diff1'] = df.groupby('iso3c')['gain'].diff(1)
df['gain_diff3'] = df.groupby('iso3c')['gain'].diff(3)
df['gain_diff5'] = df.groupby('iso3c')['gain'].diff(5)

# outlier threshold (you used 0.91 earlier; keep parametric)
OUTLIER_THRESHOLD = 0.91
df['gain_substantive_change'] = df['gain_diff1'].abs() > OUTLIER_THRESHOLD

# list top K absolute diffs for quick inspection
def top_gain_changes(df, k=10):
    tmp = df[['iso3c','year','gain_diff1']].dropna().assign(absdiff= df['gain_diff1'].abs())
    return tmp.sort_values('absdiff', ascending=False).head(k)

# winsorize function
def winsorize_ser(s, lower_pct=0.01, upper_pct=0.99):
    lo = s.quantile(lower_pct)
    hi = s.quantile(upper_pct)
    return s.clip(lower=lo, upper=hi)

# Example: create winsorized diff for diagnostics (do not replace original unless you choose)
df['gain_diff1_wins'] = winsorize_ser(df['gain_diff1'], 0.01, 0.99)

#### 5) Imputation flags (per variable) — create columns marking missingness

In [18]:
# choose list of candidate covariates we'll eventually impute (lagged and level vars)
candidate_covs = [
    # lagged covariates
    'sovereign_spread_lag1', 'sovereign_spread_lag2',
    'cpi_yoy_lag1', 'cpi_yoy_lag2',
    'gdp_annual_growth_rate_lag1', 'gdp_annual_growth_rate_lag2',
    'ln_gdp_per_capita_lag1', 'ln_gdp_per_capita_lag2',
    'debt_to_gdp_lag1', 'debt_to_gdp_lag2',
    'deficit_to_gdp_lag1', 'deficit_to_gdp_lag2',
    'current_account_balance_lag1', 'current_account_balance_lag2',
    # governance (we treat as slow-moving; missing flags still useful)
    'wgi_cc','wgi_ge','wgi_pv','wgi_rl','wgi_rq','wgi_va',
    # treatment (do not impute treatment for FE logic) — we still examine missingness
    'gain'
]

# add flags
for col in candidate_covs:
    flag_col = col + '_impflag'
    df[flag_col] = df[col].isna().astype(int)

6) Build the baseline covariate lists to feed the DML nuisance learners

We create (A) a baseline set (lagged core variables + WGI + lagged outcome), and (B) an extended set including additional candidates.

In [19]:
# Baseline covariates for DML nuisance (lagged)
baseline_covariates = [
    'sovereign_spread_lag1',   # lagged outcome
    'cpi_yoy_lag1',
    'gdp_annual_growth_rate_lag1',
    'ln_gdp_per_capita_lag1',
    'debt_to_gdp_lag1',
    'deficit_to_gdp_lag1',    # mediator caution — we use lag1
    'current_account_balance_lag1',
    # governance (level or lagged)
    'wgi_cc', 'wgi_ge', 'wgi_pv', 'wgi_rl', 'wgi_rq', 'wgi_va',
    # imputation flags for these
    'sovereign_spread_lag1_impflag', 'cpi_yoy_lag1_impflag', 'gdp_annual_growth_rate_lag1_impflag',
    'ln_gdp_per_capita_lag1_impflag', 'debt_to_gdp_lag1_impflag', 'deficit_to_gdp_lag1_impflag',
    'current_account_balance_lag1_impflag'
]

# Extended covariates (if you have them) — add here
extended_covariates = baseline_covariates + [
    'sovereign_spread_lag2', 'cpi_yoy_lag2', 'gdp_annual_growth_rate_lag2', 'ln_gdp_per_capita_lag2',
    'debt_to_gdp_lag2', 'deficit_to_gdp_lag2', 'current_account_balance_lag2',
    # placeholder for global factors you might add:
    # 'VIX_lag1', 'US10y_lag1', 'Brent_lag1',
]

# The treatment is 'gain' (do not include it in covariates)
T_col = 'gain'
Y_col = 'sovereign_spread'
idcol = 'iso3c'
timecol = 'year'

7) Fold-aware preprocessing function (impute on train only, scale on train only, optional FE partial-out)

This is the core function to call inside your cross-fitting loop. It returns processed X_train, X_test, y_train, y_test, t_train, t_test and optionally the FE means and saved imputer/scaler objects.

In [20]:
from sklearn.pipeline import make_pipeline

def fold_aware_preprocess(train_df, test_df, covariates,
                          idcol='iso3c', ycol='sovereign_spread', tcol='gain',
                          imputer=None, scaler=None, include_country_fe=False,
                          save_prefix=None):
    """
    - train_df/test_df are pandas DataFrames for the fold
    - covariates: list of column names to use in X (these should include imputation flags)
    - imputer: sklearn imputer instance (if None, uses KNNImputer(n_neighbors=5))
    - scaler: sklearn scaler instance (if None, uses StandardScaler())
    - include_country_fe: if True, compute country means on train and demean Y, T, and covariates (fe partial-out)
    - save_prefix: optional path prefix to pickle imputer/scaler/fe_means
    Returns: dict with processed arrays/dataframes and saved artifacts paths
    """
    if imputer is None:
        imputer = KNNImputer(n_neighbors=5)
    if scaler is None:
        scaler = StandardScaler()

    # 1) Impute covariates using training only
    Xtrain_raw = train_df[covariates].copy()
    Xtest_raw  = test_df[covariates].copy()
    Xtrain_imp = pd.DataFrame(imputer.fit_transform(Xtrain_raw), columns=covariates, index=train_df.index)
    Xtest_imp  = pd.DataFrame(imputer.transform(Xtest_raw), columns=covariates, index=test_df.index)

    # save imputer
    artifacts = {}
    if save_prefix is not None:
        p_imputer = f"{save_prefix}_imputer.pkl"
        pickle.dump(imputer, open(p_imputer, 'wb'))
        artifacts['imputer'] = p_imputer

    # 2) optional: demean by country using training-based country means (for FE partial-out)
    # We'll compute FE means for y, t, and covariates based on train only
    if include_country_fe:
        cols_for_means = [ycol, tcol] + covariates
        # assemble train with imputed covariates for mean calc
        train_for_means = train_df[[ycol, tcol]].join(Xtrain_imp)
        fe_means = train_for_means.groupby(idcol).mean()
        # global train means for countries not in train (rare in LOYO, but safe)
        global_means = train_for_means.mean()

        # join means into train/test
        train_joined = train_for_means.join(fe_means, on=idcol, rsuffix='_mean')
        test_joined  = test_df[[ycol, tcol]].join(Xtest_imp).join(fe_means, on=idcol, rsuffix='_mean')

        # demean
        for col in [ycol, tcol] + covariates:
            mean_col = col + '_mean'
            train_joined[col + '_d'] = train_joined[col] - train_joined[mean_col].fillna(global_means[col])
            test_joined[col + '_d']  = test_joined[col]  - test_joined[mean_col].fillna(global_means[col])

        # X matrices for ML are demeaned covariates
        X_train = train_joined[[c + '_d' for c in covariates]].copy()
        X_test  = test_joined[[c + '_d' for c in covariates]].copy()
        y_train = train_joined[ycol + '_d'].copy()
        y_test  = test_joined[ycol + '_d'].copy()
        t_train = train_joined[tcol + '_d'].copy()
        t_test  = test_joined[tcol + '_d'].copy()

        # persist fe_means if requested
        if save_prefix is not None:
            p_femeans = f"{save_prefix}_fe_means.pkl"
            pickle.dump(fe_means, open(p_femeans, 'wb'))
            artifacts['fe_means'] = p_femeans

    else:
        # no FE partial-out: use imputed X directly
        X_train = Xtrain_imp.copy()
        X_test  = Xtest_imp.copy()
        y_train = train_df[ycol].copy()
        y_test  = test_df[ycol].copy()
        t_train = train_df[tcol].copy()
        t_test  = test_df[tcol].copy()

    # 3) scale features using training-only scaler
    X_train_cols = X_train.columns.tolist()
    scaler.fit(X_train)
    X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train_cols, index=X_train.index)
    X_test_scaled  = pd.DataFrame(scaler.transform(X_test),  columns=X_train_cols, index=X_test.index)  # align columns

    if save_prefix is not None:
        p_scaler = f"{save_prefix}_scaler.pkl"
        pickle.dump(scaler, open(p_scaler, 'wb'))
        artifacts['scaler'] = p_scaler

    out = {
        'X_train': X_train_scaled, 'X_test': X_test_scaled,
        'y_train': y_train, 'y_test': y_test,
        't_train': t_train, 't_test': t_test,
        'artifacts': artifacts
    }
    return out

8) Example: how to use the preprocessing function in a leave-one-year-out fold loop

This snippet shows how to produce per-fold artifacts and store OOS p(X) R² for the demeaned T (helpful for FE diagnostic).

In [22]:
# Choose folds: leave-one-year-out
years = sorted(df['year'].unique())
folds = [(df.index[df['year'] != y].tolist(), df.index[df['year'] == y].tolist()) for y in years]

# Which covariates to feed to preprocess (choose baseline or extended)
covariates_to_use = baseline_covariates  # or extended_covariates

p_oos_r2_list = []
fe_include = True   # set True for FE-DML diagnostic; False for pooled spec
all_theta_parts = []  # optionally store residuals

for fnum, (train_idx, test_idx) in enumerate(folds):
    train = df.loc[train_idx].copy()
    test  = df.loc[test_idx].copy()

    save_prefix = f"artifacts/fold{fnum}"  # change path as you see fit

    prep = fold_aware_preprocess(train, test, covariates=covariates_to_use,
                                 idcol='iso3c', ycol=Y_col, tcol=T_col,
                                 imputer=KNNImputer(n_neighbors=5),
                                 scaler=StandardScaler(),
                                 include_country_fe=fe_include,
                                 save_prefix=save_prefix)

    X_train, X_test = prep['X_train'], prep['X_test']
    y_train, y_test = prep['y_train'], prep['y_test']
    t_train, t_test = prep['t_train'], prep['t_test']

    # Fit p(X) on demeaned T (LassoCV)
    p_model = LassoCV(cv=5, random_state=0).fit(X_train, t_train)
    p_hat_test = p_model.predict(X_test)
    # OOS R^2 for demeaned T
    r2 = np.nan
    if len(t_test)>0 and np.nanvar(t_test)>0:
        r2 = r2_score(t_test, p_hat_test)
    p_oos_r2_list.append(r2)

    # Fit m(X) for demeaned Y (random forest)
    m_model = RandomForestRegressor(n_estimators=200, random_state=0)
    m_model.fit(X_train, y_train)
    m_hat_test = m_model.predict(X_test)

    # residuals for stacking later
    u_hat = y_test.values - m_hat_test
    v_hat = t_test.values - p_hat_test

    # store in a dict or DataFrame for later stacking
    tmp = pd.DataFrame({
        'index': test.index,
        'u_hat': u_hat,
        'v_hat': v_hat
    }).set_index('index')

    all_theta_parts.append(tmp)

# After loop: stack residuals and compute theta (example)
stacked = pd.concat(all_theta_parts).sort_index()
u_all = stacked['u_hat']
v_all = stacked['v_hat']

theta_hat = (v_all * u_all).sum() / (v_all**2).sum()
print("Theta (DML-style point estimate from residuals):", theta_hat)
print("Median p(X) OOS R2 across folds:", np.nanmedian(p_oos_r2_list))

FileNotFoundError: [Errno 2] No such file or directory: 'artifacts/fold0_imputer.pkl'

# Phase 2


In [24]:
# CELL 1: imports and settings
import numpy as np
import pandas as pd
import pickle
import os
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')

# SETTINGS - EDIT THESE
idcol = 'iso3c'
timecol = 'year'
Y_col = 'sovereign_spread'   # outcome
T_col = 'gain'               # treatment (ND-GAIN)
# Choose covariates prepared earlier (baseline from preprocessing)
covariates_to_use = [
    'sovereign_spread_lag1', 'cpi_yoy_lag1','gdp_annual_growth_rate_lag1',
    'ln_gdp_per_capita_lag1','debt_to_gdp_lag1','deficit_to_gdp_lag1',
    'current_account_balance_lag1',
    'wgi_cc','wgi_ge','wgi_pv','wgi_rl','wgi_rq','wgi_va',
    'sovereign_spread_lag1_impflag', 'cpi_yoy_lag1_impflag', 'gdp_annual_growth_rate_lag1_impflag',
    'ln_gdp_per_capita_lag1_impflag','debt_to_gdp_lag1_impflag','deficit_to_gdp_lag1_impflag',
    'current_account_balance_lag1_impflag'
]

# folds & artifacts
fold_output_dir = "artifacts/dml_pooled"
os.makedirs(fold_output_dir, exist_ok=True)

# modeling choices
n_trees = 300
use_xgboost = False   # set True if xgboost is installed and you want to use it
n_permutations = 200  # for permutation test (200 is reasonable; increase if you have time)
random_seed = 2025
np.random.seed(random_seed)