# 02 Exploratory Data Analysis

This notebook generates descriptive statistics and plots to explore the merged panel dataset.


In [4]:
import pandas as pd

# Load the data
gain = pd.read_csv("/Users/leosgambato/Documents/GitHub/Capstone/data/external/Data/nd_gain_countryindex_2025/resources/gain/gain_delta.csv")

# The data columns are: ISO3, Name, 1995, 1996, ..., 2023
# We want to reshape so that we have columns: iso3, date, GAIN

# Rename columns for consistency
gain = gain.rename(columns={'ISO3': 'iso3'})

# Melt the dataframe to long format: iso3, Name, year, GAIN
gain_long = gain.melt(
    id_vars=['iso3', 'Name'],
    var_name='year',
    value_name='GAIN'
)

# Convert year to integer, then to datetime (start of January)
gain_long['date'] = pd.to_datetime(gain_long['year'], format='%Y')

# Keep only iso3, GAIN, and date columns
gain_long = gain_long[['iso3', 'GAIN', 'date']]

# Preview cleaned data
gain_long.head()



Unnamed: 0,iso3,GAIN,date
0,AFG,-5.919105,1995-01-01
1,ALB,-0.014547,1995-01-01
2,DZA,1.853878,1995-01-01
3,AND,,1995-01-01
4,AGO,-7.255131,1995-01-01


In [7]:
import pandas as pd
import numpy as np

# df: country-month panel
# ensure date is datetime and data sorted
gain_long['date'] = pd.to_datetime(gain_long['date'])
gain_long = gain_long.sort_values(['iso3','date'])

# treatment column name
tcol = 'GAIN'  # replace with your ND-GAIN column name

# total variance
total_var = gain_long[tcol].var(ddof=1)

# between variance: variance of country means (across countries)
country_means = gain_long.groupby('iso3')[tcol].mean()
between_var = country_means.var(ddof=1)

# within variance = total - between (can be slightly negative numerically -> clip to 0)
within_var = max(total_var - between_var, 0.0)

within_share = within_var / total_var if total_var != 0 else np.nan

print(f"Total var = {total_var:.6f}, Between = {between_var:.6f}, Within = {within_var:.6f}")
print(f"Share within = {within_share:.2%}")

Total var = 60.853755, Between = 52.909917, Within = 7.943838
Share within = 13.05%


In [8]:
gain_long.to_csv("gain_long.csv", index=False)
