In [145]:
# Inputs expected from earlier cells (per the notebook's own notes):
#   rd_exp_long - cleaned R&D dataset with ['country', 'year', 'rd_exp_gdp']
#   merged_df   - main dataset with ['country', 'year', 'total_score']

# Attach the R&D series to the main dataset. The inner join keeps only the
# (country, year) pairs that are present in both frames.
merged_rd = merged_df.merge(rd_exp_long, how="inner", on=["country", "year"])

# Coerce both analysis columns to numbers; entries that cannot be parsed
# become NaN and are removed by the dropna below.
for rd_col in ("total_score", "rd_exp_gdp"):
    merged_rd[rd_col] = pd.to_numeric(merged_rd[rd_col], errors="coerce")

# Complete cases only.
corr_df = merged_rd.dropna(subset=["total_score", "rd_exp_gdp"])
rd_scores = corr_df["total_score"]
rd_spending = corr_df["rd_exp_gdp"]

# Pearson, Spearman and Kendall coefficients, each with its p-value.
pearson_corr, pearson_p = pearsonr(rd_scores, rd_spending)
spearman_corr, spearman_p = spearmanr(rd_scores, rd_spending)
kendall_corr, kendall_p = kendalltau(rd_scores, rd_spending)

# Report
print("=== Correlation with R&D Expenditure (% of GDP) ===")
print(f"Pearson:  r = {pearson_corr:.4f}, p = {pearson_p:.4g}")
print(f"Spearman: rho = {spearman_corr:.4f}, p = {spearman_p:.4g}")
print(f"Kendall:  tau = {kendall_corr:.4f}, p = {kendall_p:.4g}")

=== Correlation with R&D Expenditure (% of GDP) ===
Pearson:  r = 0.3576, p = 8.018e-08
Spearman: rho = 0.6235, p = 2.432e-24
Kendall:  tau = 0.4143, p = 2.512e-19


In [140]:
# Correlate total_score with government expenditure on education.

# Coerce both columns to numeric directly on merged_df_edu; values that
# cannot be parsed become NaN.
for edu_col in ("total_score", "gov_exp_edu"):
    merged_df_edu[edu_col] = pd.to_numeric(merged_df_edu[edu_col], errors="coerce")

# Keep only the rows where both values are present.
corr_df = merged_df_edu.dropna(subset=["total_score", "gov_exp_edu"])
edu_scores = corr_df["total_score"]
edu_spending = corr_df["gov_exp_edu"]

# Pearson, Spearman and Kendall coefficients, each with its p-value.
pearson_corr, pearson_p = pearsonr(edu_scores, edu_spending)
spearman_corr, spearman_p = spearmanr(edu_scores, edu_spending)
kendall_corr, kendall_p = kendalltau(edu_scores, edu_spending)

print("=== Correlation with Government Expenditure on Education ===")
print(f"Pearson:  r = {pearson_corr:.4f}, p = {pearson_p:.4g}")
print(f"Spearman: rho = {spearman_corr:.4f}, p = {spearman_p:.4g}")
print(f"Kendall:  tau = {kendall_corr:.4f}, p = {kendall_p:.4g}")

=== Correlation with Government Expenditure on Education ===
Pearson:  r = 0.1182, p = 0.08086
Spearman: rho = 0.2294, p = 0.0006235
Kendall:  tau = 0.1600, p = 0.0004323


In [140]:
# Correlate total_score with government expenditure on education.

# Coerce both columns to numeric directly on merged_df_edu; values that
# cannot be parsed become NaN.
for edu_col in ("total_score", "gov_exp_edu"):
    merged_df_edu[edu_col] = pd.to_numeric(merged_df_edu[edu_col], errors="coerce")

# Keep only the rows where both values are present.
corr_df = merged_df_edu.dropna(subset=["total_score", "gov_exp_edu"])
edu_scores = corr_df["total_score"]
edu_spending = corr_df["gov_exp_edu"]

# Pearson, Spearman and Kendall coefficients, each with its p-value.
pearson_corr, pearson_p = pearsonr(edu_scores, edu_spending)
spearman_corr, spearman_p = spearmanr(edu_scores, edu_spending)
kendall_corr, kendall_p = kendalltau(edu_scores, edu_spending)

print("=== Correlation with Government Expenditure on Education ===")
print(f"Pearson:  r = {pearson_corr:.4f}, p = {pearson_p:.4g}")
print(f"Spearman: rho = {spearman_corr:.4f}, p = {spearman_p:.4g}")
print(f"Kendall:  tau = {kendall_corr:.4f}, p = {kendall_p:.4g}")

=== Correlation with Government Expenditure on Education ===
Pearson:  r = 0.1182, p = 0.08086
Spearman: rho = 0.2294, p = 0.0006235
Kendall:  tau = 0.1600, p = 0.0004323


In [143]:
# Correlate total_score with the government-effectiveness indicator.

# Coerce both columns to numeric directly on merged_df_gov; values that
# cannot be parsed become NaN.
for gov_col in ("total_score", "gov_effectiveness"):
    merged_df_gov[gov_col] = pd.to_numeric(merged_df_gov[gov_col], errors="coerce")

# Keep only the rows where both values are present.
corr_df = merged_df_gov.dropna(subset=["total_score", "gov_effectiveness"])
gov_scores = corr_df["total_score"]
gov_quality = corr_df["gov_effectiveness"]

# Pearson, Spearman and Kendall coefficients, each with its p-value.
pearson_corr, pearson_p = pearsonr(gov_scores, gov_quality)
spearman_corr, spearman_p = spearmanr(gov_scores, gov_quality)
kendall_corr, kendall_p = kendalltau(gov_scores, gov_quality)

print("=== Correlation with Government Effectiveness ===")
print(f"Pearson:  r = {pearson_corr:.4f}, p = {pearson_p:.4g}")
print(f"Spearman: rho = {spearman_corr:.4f}, p = {spearman_p:.4g}")
print(f"Kendall:  tau = {kendall_corr:.4f}, p = {kendall_p:.4g}")

=== Correlation with Government Effectiveness ===
Pearson:  r = 0.2974, p = 7.534e-06
Spearman: rho = 0.5589, p = 2.161e-19
Kendall:  tau = 0.3668, p = 8.106e-16


In [143]:
# Correlate total_score with the government-effectiveness indicator.

# Coerce both columns to numeric directly on merged_df_gov; values that
# cannot be parsed become NaN.
for gov_col in ("total_score", "gov_effectiveness"):
    merged_df_gov[gov_col] = pd.to_numeric(merged_df_gov[gov_col], errors="coerce")

# Keep only the rows where both values are present.
corr_df = merged_df_gov.dropna(subset=["total_score", "gov_effectiveness"])
gov_scores = corr_df["total_score"]
gov_quality = corr_df["gov_effectiveness"]

# Pearson, Spearman and Kendall coefficients, each with its p-value.
pearson_corr, pearson_p = pearsonr(gov_scores, gov_quality)
spearman_corr, spearman_p = spearmanr(gov_scores, gov_quality)
kendall_corr, kendall_p = kendalltau(gov_scores, gov_quality)

print("=== Correlation with Government Effectiveness ===")
print(f"Pearson:  r = {pearson_corr:.4f}, p = {pearson_p:.4g}")
print(f"Spearman: rho = {spearman_corr:.4f}, p = {spearman_p:.4g}")
print(f"Kendall:  tau = {kendall_corr:.4f}, p = {kendall_p:.4g}")

=== Correlation with Government Effectiveness ===
Pearson:  r = 0.2974, p = 7.534e-06
Spearman: rho = 0.5589, p = 2.161e-19
Kendall:  tau = 0.3668, p = 8.106e-16


In [125]:
# Tag each top-10 list with a "source" label so its rows stay identifiable
# after stacking. Both assignments add/overwrite the column on the existing frames.
# NOTE(review): the secondary list is labelled "primary" here - this looks like
# a copy-paste leftover; confirm the intended label before relying on this column.
top10_secondary["source"] = "primary"
top10_total_score["source"] = "Total_Score"

# Stack both lists into one frame; ignore_index=True renumbers the rows.
comparison_df = pd.concat([top10_secondary, top10_total_score], ignore_index=True)

In [125]:
# Tag each top-10 list with a "source" label so its rows stay identifiable
# after stacking. Both assignments add/overwrite the column on the existing frames.
# NOTE(review): the secondary list is labelled "primary" here - this looks like
# a copy-paste leftover; confirm the intended label before relying on this column.
top10_secondary["source"] = "primary"
top10_total_score["source"] = "Total_Score"

# Stack both lists into one frame; ignore_index=True renumbers the rows.
comparison_df = pd.concat([top10_secondary, top10_total_score], ignore_index=True)

In [125]:
# Tag each top-10 list with a "source" label so its rows stay identifiable
# after stacking. Both assignments add/overwrite the column on the existing frames.
# NOTE(review): the secondary list is labelled "primary" here - this looks like
# a copy-paste leftover; confirm the intended label before relying on this column.
top10_secondary["source"] = "primary"
top10_total_score["source"] = "Total_Score"

# Stack both lists into one frame; ignore_index=True renumbers the rows.
comparison_df = pd.concat([top10_secondary, top10_total_score], ignore_index=True)

In [126]:
# Year-by-year overlap between the secondary top-10 list and the total_score
# top-10 list, followed by a pooled (sum-based) percentage.

# Pooled counters: overlaps found, and the size of the total_score lists
# they are measured against.
total_overlap_sum = 0
total_baseline_sum = 0

# Countries seen in any overlap / in either list over the whole period.
# (Collected here; this cell does not print them.)
all_overlap_countries = set()
all_union_countries = set()

for year in sorted(comparison_df["year"].unique()):
    # Countries listed for this year in each ranking.
    in_year_secondary = top10_secondary["year"] == year
    in_year_score = top10_total_score["year"] == year
    secondary_countries = set(top10_secondary.loc[in_year_secondary, "country"])
    score_countries = set(top10_total_score.loc[in_year_score, "country"])

    # Countries present in both rankings.
    overlap = secondary_countries.intersection(score_countries)

    # Share of the total_score list that also appears in the secondary list;
    # an empty total_score list yields 0.0 instead of dividing by zero.
    n_score = len(score_countries)
    if n_score:
        overlap_pct = len(overlap) / n_score * 100
    else:
        overlap_pct = 0.0

    print(f"{year} → Overlap: {len(overlap)} countries ({overlap_pct:.1f}%) → {sorted(overlap)}")

    # Pooled counters.
    total_overlap_sum += len(overlap)
    total_baseline_sum += n_score

    # Period-wide sets.
    all_overlap_countries.update(overlap)
    all_union_countries.update(secondary_countries | score_countries)

# Pooled percentage across all years (0.0 when nothing was counted).
total_pct_sum = (total_overlap_sum / total_baseline_sum * 100) if total_baseline_sum > 0 else 0.0

print("\n=== TOTAL (sum-based) ===")
print(f"Overlaps: {total_overlap_sum} over baseline {total_baseline_sum} → {total_pct_sum:.1f}%")


2017 → Overlap: 3 countries (30.0%) → ['Australia', 'Netherlands', 'United Kingdom']
2018 → Overlap: 3 countries (30.0%) → ['Australia', 'Netherlands', 'United Kingdom']
2019 → Overlap: 2 countries (20.0%) → ['Australia', 'United Kingdom']
2020 → Overlap: 2 countries (20.0%) → ['Australia', 'Netherlands']
2021 → Overlap: 2 countries (20.0%) → ['Australia', 'Netherlands']
2022 → Overlap: 2 countries (20.0%) → ['Australia', 'Netherlands']

=== TOTAL (sum-based) ===
Overlaps: 14 over baseline 60 → 23.3%


In [130]:
# Tag each top-10 list with a "source" label so its rows stay identifiable
# after stacking. Both assignments add/overwrite the column on the existing frames.
# NOTE(review): the tertiary list is labelled "primary" here - this looks like
# a copy-paste leftover; confirm the intended label before relying on this column.
top10_tertiary["source"] = "primary"
top10_total_score["source"] = "Total_Score"

# Stack both lists into one frame; ignore_index=True renumbers the rows.
comparison_df = pd.concat([top10_tertiary, top10_total_score], ignore_index=True)

In [131]:
# Year-by-year overlap between the tertiary top-10 list and the total_score
# top-10 list, followed by a pooled (sum-based) percentage.

# Pooled counters: overlaps found, and the size of the total_score lists
# used as the denominator.
total_overlap_sum = 0
total_baseline_sum = 0

# Countries seen in any overlap / in either list over the whole period.
# (Collected here; this cell does not print them.)
all_overlap_countries = set()
all_union_countries = set()

for year in sorted(comparison_df["year"].unique()):
    # Countries listed for this year in each ranking.
    in_year_tertiary = top10_tertiary["year"] == year
    in_year_score = top10_total_score["year"] == year
    tertiary_countries = set(top10_tertiary.loc[in_year_tertiary, "country"])
    score_countries = set(top10_total_score.loc[in_year_score, "country"])

    # Countries present in both rankings.
    overlap = tertiary_countries.intersection(score_countries)

    # Share of the total_score list that also appears in the tertiary list;
    # an empty total_score list yields 0.0 instead of dividing by zero.
    n_score = len(score_countries)
    if n_score:
        overlap_pct = len(overlap) / n_score * 100
    else:
        overlap_pct = 0.0

    print(f"{year} → Overlap: {len(overlap)} countries ({overlap_pct:.1f}%) → {sorted(overlap)}")

    # Pooled counters. The baseline is the total_score list size; swap in
    # len(tertiary_countries) to measure against the tertiary list instead.
    total_overlap_sum += len(overlap)
    total_baseline_sum += n_score

    # Period-wide sets.
    all_overlap_countries.update(overlap)
    all_union_countries.update(tertiary_countries | score_countries)

# Pooled percentage across all years (0.0 when nothing was counted).
total_pct_sum = (total_overlap_sum / total_baseline_sum * 100) if total_baseline_sum > 0 else 0.0

print("\n=== TOTAL (sum-based) ===")
print(f"Overlaps: {total_overlap_sum} over baseline {total_baseline_sum} → {total_pct_sum:.1f}%")


2017 → Overlap: 3 countries (30.0%) → ['Australia', 'Netherlands', 'United States']
2018 → Overlap: 3 countries (30.0%) → ['Australia', 'Netherlands', 'United States']
2019 → Overlap: 3 countries (30.0%) → ['Australia', 'Netherlands', 'United States']
2020 → Overlap: 3 countries (30.0%) → ['Australia', 'Netherlands', 'United States']
2021 → Overlap: 2 countries (20.0%) → ['Australia', 'Netherlands']
2022 → Overlap: 2 countries (20.0%) → ['Australia', 'Netherlands']

=== TOTAL (sum-based) ===
Overlaps: 16 over baseline 60 → 26.7%


In [131]:
# Year-by-year overlap between the tertiary top-10 list and the total_score
# top-10 list, followed by a pooled (sum-based) percentage.

# Pooled counters: overlaps found, and the size of the total_score lists
# used as the denominator.
total_overlap_sum = 0
total_baseline_sum = 0

# Countries seen in any overlap / in either list over the whole period.
# (Collected here; this cell does not print them.)
all_overlap_countries = set()
all_union_countries = set()

for year in sorted(comparison_df["year"].unique()):
    # Countries listed for this year in each ranking.
    in_year_tertiary = top10_tertiary["year"] == year
    in_year_score = top10_total_score["year"] == year
    tertiary_countries = set(top10_tertiary.loc[in_year_tertiary, "country"])
    score_countries = set(top10_total_score.loc[in_year_score, "country"])

    # Countries present in both rankings.
    overlap = tertiary_countries.intersection(score_countries)

    # Share of the total_score list that also appears in the tertiary list;
    # an empty total_score list yields 0.0 instead of dividing by zero.
    n_score = len(score_countries)
    if n_score:
        overlap_pct = len(overlap) / n_score * 100
    else:
        overlap_pct = 0.0

    print(f"{year} → Overlap: {len(overlap)} countries ({overlap_pct:.1f}%) → {sorted(overlap)}")

    # Pooled counters. The baseline is the total_score list size; swap in
    # len(tertiary_countries) to measure against the tertiary list instead.
    total_overlap_sum += len(overlap)
    total_baseline_sum += n_score

    # Period-wide sets.
    all_overlap_countries.update(overlap)
    all_union_countries.update(tertiary_countries | score_countries)

# Pooled percentage across all years (0.0 when nothing was counted).
total_pct_sum = (total_overlap_sum / total_baseline_sum * 100) if total_baseline_sum > 0 else 0.0

print("\n=== TOTAL (sum-based) ===")
print(f"Overlaps: {total_overlap_sum} over baseline {total_baseline_sum} → {total_pct_sum:.1f}%")


2017 → Overlap: 3 countries (30.0%) → ['Australia', 'Netherlands', 'United States']
2018 → Overlap: 3 countries (30.0%) → ['Australia', 'Netherlands', 'United States']
2019 → Overlap: 3 countries (30.0%) → ['Australia', 'Netherlands', 'United States']
2020 → Overlap: 3 countries (30.0%) → ['Australia', 'Netherlands', 'United States']
2021 → Overlap: 2 countries (20.0%) → ['Australia', 'Netherlands']
2022 → Overlap: 2 countries (20.0%) → ['Australia', 'Netherlands']

=== TOTAL (sum-based) ===
Overlaps: 16 over baseline 60 → 26.7%


In [67]:
# Add an indicator column so we can identify source
top10_per_year_gdp["source"] = "GDP"
top10_total_score["source"] = "Total_Score"

# Merge both lists
comparison_df = pd.concat([top10_per_year_gdp, top10_total_score], ignore_index=True)

In [66]:
# Assuming your total_score data is in df_total_score with columns: country, year, total_score
top10_total_score = (
    total_score_df
    .sort_values(["year", "total_score"], ascending=[True, False])
    .groupby("year")
    .head(10)
    .reset_index(drop=True))

In [268]:
# Year-by-year overlap between the GDP top-10 list and the total_score
# top-10 list, followed by a pooled (sum-based) percentage.

# Pooled counters: overlaps found, and the size of the total_score lists
# they are measured against.
total_overlap_sum = 0
total_baseline_sum = 0

# Countries seen in any overlap / in either list over the whole period.
# (Collected here; this cell does not print them.)
all_overlap_countries = set()
all_union_countries = set()

for year in sorted(comparison_df["year"].unique()):
    # Countries listed for this year in each ranking.
    in_year_gdp = top10_per_year_gdp["year"] == year
    in_year_score = top10_total_score["year"] == year
    gdp_countries = set(top10_per_year_gdp.loc[in_year_gdp, "country"])
    score_countries = set(top10_total_score.loc[in_year_score, "country"])

    # Countries present in both rankings.
    overlap = gdp_countries.intersection(score_countries)

    # Share of the total_score list that also appears in the GDP list;
    # an empty total_score list yields 0.0 instead of dividing by zero.
    n_score = len(score_countries)
    if n_score:
        overlap_pct = len(overlap) / n_score * 100
    else:
        overlap_pct = 0.0

    print(f"{year} → Overlap: {len(overlap)} countries ({overlap_pct:.1f}%) → {sorted(overlap)}")

    # Pooled counters.
    total_overlap_sum += len(overlap)
    total_baseline_sum += n_score

    # Period-wide sets.
    all_overlap_countries.update(overlap)
    all_union_countries.update(gdp_countries | score_countries)

# Pooled percentage across all years (0.0 when nothing was counted).
total_pct_sum = (total_overlap_sum / total_baseline_sum * 100) if total_baseline_sum > 0 else 0.0

print("\n=== TOTAL (sum-based) ===")
print(f"Overlaps: {total_overlap_sum} over baseline {total_baseline_sum} → {total_pct_sum:.1f}%")

2017 → Overlap: 1 countries (10.0%) → ['Switzerland']
2018 → Overlap: 0 countries (0.0%) → []
2019 → Overlap: 0 countries (0.0%) → []
2020 → Overlap: 2 countries (20.0%) → ['Switzerland', 'United States']
2021 → Overlap: 1 countries (10.0%) → ['Switzerland']
2022 → Overlap: 1 countries (10.0%) → ['Switzerland']

=== TOTAL (sum-based) ===
Overlaps: 5 over baseline 60 → 8.3%


In [270]:
# Year-by-year overlap between the government-effectiveness top-10 list and
# the total_score top-10 list, followed by a pooled (sum-based) percentage.

# Pooled counters: overlaps found, and the size of the total_score lists
# they are measured against.
total_overlap_sum = 0
total_baseline_sum = 0

# Countries seen in any overlap / in either list over the whole period.
# (Collected here; this cell does not print them.)
all_overlap_countries = set()
all_union_countries = set()

# Take the years from the two frames that are actually compared. The shared
# `comparison_df` is rebuilt by other cells for other indicators, so reading
# the years from it made this cell depend on whichever of them ran last.
years_to_compare = sorted(
    pd.concat([top10_per_year_gov_eff["year"], top10_total_score["year"]]).unique()
)

for year in years_to_compare:
    # --- Select countries for this year ---
    gov_countries   = set(top10_per_year_gov_eff[top10_per_year_gov_eff["year"] == year]["country"])
    score_countries = set(top10_total_score[top10_total_score["year"] == year]["country"])

    # --- Countries present in both rankings ---
    overlap = gov_countries & score_countries

    # --- Yearly percentage (vs total_score list size); 0.0 for an empty list ---
    overlap_pct = (len(overlap) / len(score_countries) * 100) if len(score_countries) else 0.0

    # --- Print per-year ---
    print(f"{year} → Overlap: {len(overlap)} countries ({overlap_pct:.1f}%) → {sorted(overlap)}")

    # --- Accumulate for sum-based total ---
    total_overlap_sum  += len(overlap)
    total_baseline_sum += len(score_countries)

    # --- Accumulate for unique-country total ---
    all_overlap_countries |= overlap
    all_union_countries   |= (gov_countries | score_countries)

# --- Sum-based total (0.0 when nothing was counted) ---
if total_baseline_sum > 0:
    total_pct_sum = total_overlap_sum / total_baseline_sum * 100
else:
    total_pct_sum = 0.0

print("\n=== TOTAL (sum-based) ===")
print(f"Overlaps: {total_overlap_sum} over baseline {total_baseline_sum} → {total_pct_sum:.1f}%")


2017 → Overlap: 3 countries (30.0%) → ['Canada', 'Netherlands', 'Switzerland']
2018 → Overlap: 1 countries (10.0%) → ['Netherlands']
2019 → Overlap: 2 countries (20.0%) → ['Canada', 'Netherlands']
2020 → Overlap: 2 countries (20.0%) → ['Netherlands', 'Switzerland']
2021 → Overlap: 2 countries (20.0%) → ['Netherlands', 'Switzerland']
2022 → Overlap: 2 countries (20.0%) → ['Japan', 'Switzerland']

=== TOTAL (sum-based) ===
Overlaps: 12 over baseline 60 → 20.0%


In [272]:
# Year-by-year overlap between the `top10_per_year_res` top-10 list and the
# total_score top-10 list, followed by a pooled (sum-based) percentage.

# Pooled counters: overlaps found, and the size of the total_score lists
# they are measured against.
total_overlap_sum = 0
total_baseline_sum = 0

# Countries seen in any overlap / in either list over the whole period.
# (Collected here; this cell does not print them.)
all_overlap_countries = set()
all_union_countries = set()

# Take the years from the two frames that are actually compared. The shared
# `comparison_df` is rebuilt by other cells for other indicators, so reading
# the years from it made this cell depend on whichever of them ran last.
years_to_compare = sorted(
    pd.concat([top10_per_year_res["year"], top10_total_score["year"]]).unique()
)

for year in years_to_compare:
    # --- Select countries for this year ---
    res_countries   = set(top10_per_year_res[top10_per_year_res["year"] == year]["country"])
    score_countries = set(top10_total_score[top10_total_score["year"] == year]["country"])

    # --- Countries present in both rankings ---
    overlap = res_countries & score_countries

    # --- Yearly percentage (vs total_score list size); 0.0 for an empty list ---
    overlap_pct = (len(overlap) / len(score_countries) * 100) if len(score_countries) else 0.0

    # --- Print per-year ---
    print(f"{year} → Overlap: {len(overlap)} countries ({overlap_pct:.1f}%) → {sorted(overlap)}")

    # --- Accumulate for sum-based total ---
    total_overlap_sum  += len(overlap)
    total_baseline_sum += len(score_countries)

    # --- Accumulate for unique-country total ---
    all_overlap_countries |= overlap
    all_union_countries   |= (res_countries | score_countries)

# --- Sum-based total (0.0 when nothing was counted) ---
if total_baseline_sum > 0:
    total_pct_sum = total_overlap_sum / total_baseline_sum * 100
else:
    total_pct_sum = 0.0

print("\n=== TOTAL (sum-based) ===")
print(f"Overlaps: {total_overlap_sum} over baseline {total_baseline_sum} → {total_pct_sum:.1f}%")

2017 → Overlap: 4 countries (40.0%) → ['Germany', 'Japan', 'Switzerland', 'United States']
2018 → Overlap: 3 countries (30.0%) → ['Germany', 'Japan', 'United States']
2019 → Overlap: 3 countries (30.0%) → ['Germany', 'Japan', 'United States']
2020 → Overlap: 3 countries (30.0%) → ['Japan', 'Switzerland', 'United States']
2021 → Overlap: 3 countries (30.0%) → ['Japan', 'Switzerland', 'United States']
2022 → Overlap: 3 countries (30.0%) → ['Japan', 'Switzerland', 'United States']

=== TOTAL (sum-based) ===
Overlaps: 19 over baseline 60 → 31.7%


In [156]:
import pandas as pd
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler

# --- 1) Standardize predictors ---
# Only the four predictors are scaled; total_score is left on its original
# scale. The scaling is applied to a copy so `analytic` itself is untouched.
X_vars = ["gdp_per_capita", "gov_effectiveness", "gov_exp_edu", "rd_exp_gdp"]

scaler = StandardScaler()
analytic_std = analytic.copy()
analytic_std[X_vars] = scaler.fit_transform(analytic_std[X_vars])

# --- 2) Models ---
# All four specifications share the same predictors and differ only in the
# categorical fixed-effect terms appended to the formula.
_core_formula = "total_score ~ gdp_per_capita + gov_effectiveness + gov_exp_edu + rd_exp_gdp"

# Model 1: no fixed effects
model_no_fe = smf.ols(_core_formula, data=analytic_std).fit()

# Model 2: year fixed effects only
model_year_fe = smf.ols(_core_formula + " + C(year)", data=analytic_std).fit()

# Model 3: country fixed effects only
model_country_fe = smf.ols(_core_formula + " + C(country)", data=analytic_std).fit()

# Model 4: country and year fixed effects
model_both_fe = smf.ols(_core_formula + " + C(country) + C(year)", data=analytic_std).fit()

# --- 3) R² comparison ---
print("\n=== Multiple Regression (OLS, standardized predictors) ===")
print(f"No FE:           R² = {model_no_fe.rsquared:.3f}")
print(f"Year FE only:    R² = {model_year_fe.rsquared:.3f}")
print(f"Country FE only: R² = {model_country_fe.rsquared:.3f}")
print(f"Both FE:         R² = {model_both_fe.rsquared:.3f}")

# --- 4) Predictor coefficients per model, largest first ---
_coef_reports = (
    ("\nStandardized Coefficients (No FE):", model_no_fe),
    ("\nStandardized Coefficients (Year FE):", model_year_fe),
    ("\nStandardized Coefficients (Country FE):", model_country_fe),
    ("\nStandardized Coefficients (Both FE):", model_both_fe),
)
for _heading, _fitted in _coef_reports:
    print(_heading)
    print(_fitted.params[X_vars].sort_values(ascending=False))

# --- 5) Write each model's full summary to a text file ---
_summary_files = (
    ("ols_no_fe_std.txt", model_no_fe),
    ("ols_year_fe_std.txt", model_year_fe),
    ("ols_country_fe_std.txt", model_country_fe),
    ("ols_both_fe_std.txt", model_both_fe),
)
for _path, _fitted in _summary_files:
    with open(_path, "w") as f:
        f.write(_fitted.summary().as_text())


=== Multiple Regression (OLS, standardized predictors) ===
No FE:           R² = 0.144
Year FE only:    R² = 0.149
Country FE only: R² = 0.997
Both FE:         R² = 0.997

Standardized Coefficients (No FE):
rd_exp_gdp           229.074059
gdp_per_capita        87.754506
gov_effectiveness     -3.259928
gov_exp_edu          -62.032766
dtype: float64

Standardized Coefficients (Year FE):
rd_exp_gdp           233.174246
gdp_per_capita       104.253810
gov_effectiveness    -19.816521
gov_exp_edu          -67.684633
dtype: float64

Standardized Coefficients (Country FE):
gov_effectiveness     38.554986
gov_exp_edu           -7.371924
gdp_per_capita       -33.771400
rd_exp_gdp          -181.759042
dtype: float64

Standardized Coefficients (Both FE):
gov_effectiveness     38.159081
gov_exp_edu           -6.309239
gdp_per_capita       -37.464076
rd_exp_gdp          -163.224451
dtype: float64


In [155]:
# Four OLS specifications on the unscaled `analytic` data. They share the same
# predictors and differ only in the categorical fixed-effect terms.

# --- Formulas ---
formula_no_fe = "total_score ~ gdp_per_capita + gov_effectiveness + gov_exp_edu + rd_exp_gdp"
formula_year_fe = "total_score ~ gdp_per_capita + gov_effectiveness + gov_exp_edu + rd_exp_gdp + C(year)"
formula_country_fe = "total_score ~ gdp_per_capita + gov_effectiveness + gov_exp_edu + rd_exp_gdp + C(country)"
formula_both_fe = "total_score ~ gdp_per_capita + gov_effectiveness + gov_exp_edu + rd_exp_gdp + C(country) + C(year)"

# --- Fits ---
ols_no_fe = smf.ols(formula=formula_no_fe, data=analytic).fit()            # Model 1: no FE
ols_year_fe = smf.ols(formula=formula_year_fe, data=analytic).fit()        # Model 2: year FE
ols_country_fe = smf.ols(formula=formula_country_fe, data=analytic).fit()  # Model 3: country FE
ols_both_fe = smf.ols(formula=formula_both_fe, data=analytic).fit()        # Model 4: both FE

# --- R² comparison ---
print("\n=== Fixed Effects Comparison ===")
print(f"No FE:           R² = {ols_no_fe.rsquared:.3f}")
print(f"Year FE only:    R² = {ols_year_fe.rsquared:.3f}")
print(f"Country FE only: R² = {ols_country_fe.rsquared:.3f}")
print(f"Both FE:         R² = {ols_both_fe.rsquared:.3f}")

# --- Predictor coefficients of the year-FE model ---
_key_predictors = ["gdp_per_capita","gov_effectiveness","gov_exp_edu","rd_exp_gdp"]
print("\nYear FE model coefficients:")
print(ols_year_fe.params[_key_predictors])

# --- Write each model's full summary to a text file ---
_summary_files = (
    ("ols_no_fe.txt", ols_no_fe),
    ("ols_year_fe.txt", ols_year_fe),
    ("ols_country_fe.txt", ols_country_fe),
    ("ols_both_fe.txt", ols_both_fe),
)
for _path, _fitted in _summary_files:
    with open(_path, "w") as f:
        f.write(_fitted.summary().as_text())



=== Fixed Effects Comparison ===
No FE:           R² = 0.144
Year FE only:    R² = 0.149
Country FE only: R² = 0.997
Both FE:         R² = 0.997

Year FE model coefficients:
gdp_per_capita         0.004059
gov_effectiveness     -1.011625
gov_exp_edu          -49.819332
rd_exp_gdp           190.766878
dtype: float64


In [155]:
# Four OLS specifications on the unscaled `analytic` data. They share the same
# predictors and differ only in the categorical fixed-effect terms.

# --- Formulas ---
formula_no_fe = "total_score ~ gdp_per_capita + gov_effectiveness + gov_exp_edu + rd_exp_gdp"
formula_year_fe = "total_score ~ gdp_per_capita + gov_effectiveness + gov_exp_edu + rd_exp_gdp + C(year)"
formula_country_fe = "total_score ~ gdp_per_capita + gov_effectiveness + gov_exp_edu + rd_exp_gdp + C(country)"
formula_both_fe = "total_score ~ gdp_per_capita + gov_effectiveness + gov_exp_edu + rd_exp_gdp + C(country) + C(year)"

# --- Fits ---
ols_no_fe = smf.ols(formula=formula_no_fe, data=analytic).fit()            # Model 1: no FE
ols_year_fe = smf.ols(formula=formula_year_fe, data=analytic).fit()        # Model 2: year FE
ols_country_fe = smf.ols(formula=formula_country_fe, data=analytic).fit()  # Model 3: country FE
ols_both_fe = smf.ols(formula=formula_both_fe, data=analytic).fit()        # Model 4: both FE

# --- R² comparison ---
print("\n=== Fixed Effects Comparison ===")
print(f"No FE:           R² = {ols_no_fe.rsquared:.3f}")
print(f"Year FE only:    R² = {ols_year_fe.rsquared:.3f}")
print(f"Country FE only: R² = {ols_country_fe.rsquared:.3f}")
print(f"Both FE:         R² = {ols_both_fe.rsquared:.3f}")

# --- Predictor coefficients of the year-FE model ---
_key_predictors = ["gdp_per_capita","gov_effectiveness","gov_exp_edu","rd_exp_gdp"]
print("\nYear FE model coefficients:")
print(ols_year_fe.params[_key_predictors])

# --- Write each model's full summary to a text file ---
_summary_files = (
    ("ols_no_fe.txt", ols_no_fe),
    ("ols_year_fe.txt", ols_year_fe),
    ("ols_country_fe.txt", ols_country_fe),
    ("ols_both_fe.txt", ols_both_fe),
)
for _path, _fitted in _summary_files:
    with open(_path, "w") as f:
        f.write(_fitted.summary().as_text())



=== Fixed Effects Comparison ===
No FE:           R² = 0.144
Year FE only:    R² = 0.149
Country FE only: R² = 0.997
Both FE:         R² = 0.997

Year FE model coefficients:
gdp_per_capita         0.004059
gov_effectiveness     -1.011625
gov_exp_edu          -49.819332
rd_exp_gdp           190.766878
dtype: float64


In [154]:
# === 0) Imports
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor

# -------------------------------------------------------------------
# === 1) Normalize column names & fix typos
def normalize_cols(df):
    """Return a copy of ``df`` with cleaned, canonical column names.

    Column labels are stripped of surrounding whitespace, lower-cased, and
    every run of whitespace is replaced by a single underscore. Known
    misspellings and alternate spellings are then mapped to canonical names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels should be normalized. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` carrying the normalized column labels.
    """
    # Misspelling / alternate spelling -> canonical name. Keys are matched
    # against labels that have already been cleaned, so a key containing a
    # space (such as "total score") cannot match any label at that point.
    canonical_names = {
        "ountry": "country",
        "total score": "total_score",
        "gdp_percapita": "gdp_per_capita",
        "gov.effectivnes": "gov_effectiveness",
        "gov.effectiveness": "gov_effectiveness",
        "gov_exp_on_education": "gov_exp_edu",
        "reseacrh_and_development": "rd_exp_gdp",
        "research_and_development": "rd_exp_gdp",
        "rd_exp_%gdp": "rd_exp_gdp",
        "rd_exp_gdp(%)": "rd_exp_gdp",
    }

    cleaned = df.copy()
    labels = cleaned.columns.str.strip()
    labels = labels.str.lower()
    labels = labels.str.replace(r"\s+", "_", regex=True)
    cleaned.columns = labels

    return cleaned.rename(columns=canonical_names)

# Canonicalise the column names of the four source frames (names are rebound
# to the normalized copies).
merged_df, merged_df_edu, merged_df_gov, merged_rd = [
    normalize_cols(_frame)
    for _frame in (merged_df, merged_df_edu, merged_df_gov, merged_rd)
]

# From each frame keep the panel keys, its own indicator and total_score.
_id_cols = ["country", "year"]
base = merged_df[_id_cols + ["gdp_per_capita", "total_score"]].copy()
edu = merged_df_edu[_id_cols + ["gov_exp_edu", "total_score"]].copy()
gov = merged_df_gov[_id_cols + ["gov_effectiveness", "total_score"]].copy()
rd = merged_rd[_id_cols + ["rd_exp_gdp", "total_score"]].copy()

# -------------------------------------------------------------------
# === 2) Deduplicate
def dedupe(df, cols_keep):
    """Collapse rows sharing the same (country, year) by averaging.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``country`` and ``year`` columns.
    cols_keep : iterable of str
        Column names to carry over; ``country`` and ``year`` are skipped
        because they are the grouping keys, every other listed column is
        aggregated with its mean.

    Returns
    -------
    pandas.DataFrame
        One row per (country, year), with the keys as regular columns.
    """
    key_cols = ["country", "year"]

    # Build the aggregation spec: mean for every requested non-key column.
    agg_spec = {}
    for col in cols_keep:
        if col not in key_cols:
            agg_spec[col] = "mean"

    grouped = df.groupby(key_cols, as_index=False)
    return grouped.agg(agg_spec)

# Collapse duplicate (country, year) rows in each of the four frames.
base, edu, gov, rd = [dedupe(_frame, _frame.columns) for _frame in (base, edu, gov, rd)]

# -------------------------------------------------------------------
# === 3) Merge into single analytic dataset
# Start from the GDP frame and left-join each remaining indicator on the
# panel keys, taking only that indicator's column from the joined frame.
_panel_keys = ["country", "year"]
analytic = base
for _extra_frame, _extra_col in ((edu, "gov_exp_edu"), (gov, "gov_effectiveness"), (rd, "rd_exp_gdp")):
    analytic = analytic.merge(_extra_frame[_panel_keys + [_extra_col]], on=_panel_keys, how="left")

# Restrict to 2017-2022 (both ends included) ...
_in_window = (analytic["year"] >= 2017) & (analytic["year"] <= 2022)
analytic = analytic[_in_window]

# ... and keep complete cases only.
_required_cols = ["total_score","gdp_per_capita","gov_exp_edu","gov_effectiveness","rd_exp_gdp"]
analytic = analytic.dropna(subset=_required_cols).copy()

print("Rows:", len(analytic))
print(analytic.head())

# Persist the assembled dataset without the index column.
analytic.to_csv("analytic_multivariate_dataset.csv", index=False)

# -------------------------------------------------------------------
# === 4) VIF
# Variance inflation factor of each predictor, computed on the
# StandardScaler-transformed predictor matrix.
X_vars = ["gdp_per_capita","gov_effectiveness","gov_exp_edu","rd_exp_gdp"]
X_scaled = StandardScaler().fit_transform(analytic[X_vars])

_n_predictors = X_scaled.shape[1]
_vif_values = [variance_inflation_factor(X_scaled, _col_idx) for _col_idx in range(_n_predictors)]
vif = pd.DataFrame({"feature": X_vars, "VIF": _vif_values})

# Largest VIF first.
print("\nVariance Inflation Factors (VIF):\n", vif.sort_values("VIF", ascending=False))

# -------------------------------------------------------------------
# === 5) OLS (no fixed effects)
# Plain OLS of total_score on the four predictors plus an intercept column.
_predictors = analytic[X_vars]
X = sm.add_constant(_predictors)
y = analytic["total_score"].values
ols_plain = sm.OLS(y, X).fit()
print("\n=== OLS (no fixed effects) ===")
print(ols_plain.summary())

# -------------------------------------------------------------------
# === 6) OLS with fixed effects
# Same predictors, with country and year added as categorical terms.
formula_fe = "total_score ~ gdp_per_capita + gov_effectiveness + gov_exp_edu + rd_exp_gdp + C(country) + C(year)"
ols_fe = smf.ols(formula=formula_fe, data=analytic).fit()
print("\n=== OLS with Country & Year Fixed Effects ===")
print(ols_fe.summary())

# -------------------------------------------------------------------
# === 7) Standardized coefficients
# Scale both the predictors and the outcome, refit the plain model, and keep
# the slope estimates (the first parameter belongs to the added constant).
scaler_X = StandardScaler()
scaler_y = StandardScaler()
Xs = scaler_X.fit_transform(analytic[X_vars])
ys = scaler_y.fit_transform(analytic[["total_score"]]).ravel()
ols_std = sm.OLS(ys, sm.add_constant(Xs)).fit()
_slopes = ols_std.params[1:]
beta_coefs = pd.Series(_slopes, index=X_vars, name="std_beta")
print("\nStandardized Betas (plain OLS):\n", beta_coefs.sort_values(ascending=False))

# -------------------------------------------------------------------
# === 8) Ridge & Lasso
# Candidate penalties: 25 values, log-spaced from 1e-3 to 1e3.
alphas = np.logspace(-3, 3, num=25)

# Each pipeline scales the predictors and then fits a cross-validated
# (cv=5) penalized linear model over the candidate penalties.
ridge = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", RidgeCV(alphas=alphas, cv=5)),
])
lasso = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", LassoCV(alphas=alphas, cv=5, max_iter=10000)),
])

X_mat = analytic[X_vars].values
y_vec = analytic["total_score"].values

for _pipeline in (ridge, lasso):
    _pipeline.fit(X_mat, y_vec)

# Pull the fitted estimators out of the pipelines to read their results.
_ridge_model = ridge.named_steps["model"]
_lasso_model = lasso.named_steps["model"]

ridge_coefs = pd.Series(_ridge_model.coef_, index=X_vars, name="ridge_coef")
lasso_coefs = pd.Series(_lasso_model.coef_, index=X_vars, name="lasso_coef")

print("\nRidge best alpha:", _ridge_model.alpha_)
print("Ridge coefficients:\n", ridge_coefs.sort_values(ascending=False))
print("\nLasso best alpha:", _lasso_model.alpha_)
print("Lasso coefficients:\n", lasso_coefs.sort_values(ascending=False))

# -------------------------------------------------------------------
# === 9) Metrics with updated RMSE
def metrics(y_true, y_pred, label):
    """Print R2, MAE and RMSE of ``y_pred`` against ``y_true`` on one line.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, aligned with ``y_true``.
    label : str
        Tag shown in square brackets at the start of the printed line.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    print(f"\n[{label}] R2={r2:.3f}  MAE={mae:.2f}  RMSE={rmse:.2f}")

# In-sample fit of each model against the observed total_score values.
_fit_reports = (
    (ols_plain.predict(X), "OLS no-FE"),
    (ols_fe.fittedvalues, "OLS with FE"),
    (ridge.predict(X_mat), "Ridge"),
    (lasso.predict(X_mat), "Lasso"),
)
for _predicted, _tag in _fit_reports:
    metrics(y_vec, _predicted, _tag)

# -------------------------------------------------------------------
# === 10) Save OLS plain coefficient table
# One row per term of the plain OLS model: estimate, standard error,
# t statistic and p-value. The index is dropped when writing the file.
_term_names = ["const"] + X_vars
coef_table = pd.DataFrame({
    "variable": _term_names,
    "coef": ols_plain.params,
    "std_err": ols_plain.bse,
    "t": ols_plain.tvalues,
    "pval": ols_plain.pvalues,
})
coef_table.to_csv("ols_plain_coef_table.csv", index=False)
print("\nSaved: ols_plain_coef_table.csv")


Rows: 213
     country  year  gdp_per_capita  total_score  gov_exp_edu  \
0  Argentina  2017    14532.500931         65.8      5.45432   
1  Argentina  2018    11752.799892         69.1      4.87774   
2  Argentina  2019     9955.974787         66.2      4.77165   
3  Argentina  2020     8535.599380         66.0      5.27690   
4  Argentina  2021    10738.017922         67.5      4.64117   

   gov_effectiveness  rd_exp_gdp  
0          58.571430     0.55631  
1          52.857143     0.48830  
2          48.095238     0.47813  
3          43.333332     0.54126  
4          37.142857     0.52216  

Variance Inflation Factors (VIF):
              feature       VIF
1  gov_effectiveness  3.334924
0     gdp_per_capita  2.841605
3         rd_exp_gdp  2.162625
2        gov_exp_edu  1.282044

=== OLS (no fixed effects) ===
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.144
Model:        