# Recycling in Europe: Analysis of Drivers & Outcomes

## Objective
Analyze the socio-economic factors that influence recycling rates across European countries, with a focus on **policy-actionable variables**, i.e. those that policymakers can set or shape directly.

## Approach
1. **Data Preparation**: Load and clean the merged dataset (2019-2023)
2. **Variable Classification**: Separate targets, policy drivers, and contextual variables
3. **Exploratory Analysis**: Correlation analysis and country clustering
4. **Regression Analysis**: Quantify the impact of policy-relevant variables on recycling rates
5. **Visualizations**: Generate insights for policymakers

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import statsmodels.api as sm
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Read the preprocessed, merged dataset from disk
df = pd.read_csv('processed_data/preprocessed_all.csv')

# Quick sanity report: table dimensions, covered years and number of countries
year_col = df['year']
n_countries = df['country_name'].nunique()
print(f"Dataset shape: {df.shape}")
print(f"Year range: {year_col.min()} - {year_col.max()}")
print(f"Number of countries: {n_countries}")

# ============================================================================
# VARIABLE CLASSIFICATION & TIME PERIOD
# ============================================================================

# --- TARGET VARIABLES (Recycling Outcomes) ---
# The general rate comes first (main focus); the material-specific rates follow
# and all share the 'recycling_rate_' column prefix.
targets = ['recycling_rate'] + [
    f'recycling_rate_{material}'
    for material in ('glass', 'metallic', 'packaging', 'paper', 'plastic', 'wooden')
]

# --- POLICY-RELEVANT PREDICTORS (Actionable by policymakers) ---
policy_drivers = [
    # Environmental / pollution taxes: can be set by policy
    'total_environm_tax_per_capita',
    'pollut_environm_tax_per_capita',
    # Private investment: can be incentivized by policy
    'priv_inv_per_capita',
    # Institutional quality: can be improved via governance
    'government_effectiveness_estimate',
]

# --- CONTEXTUAL VARIABLES (Economic structure, cannot be directly controlled) ---
context_vars = [
    # Structural: wealth level
    'gdp_per_capita',
    # Structural: economic sophistication
    'gr_val_add_per_capita',
    # Structural: urbanization
    'urban_population_pct',
    # Energy transition: treated as a policy outcome, not as a predictor
    'renewable_energy_pct',
    # NOTE(review): no rationale was recorded for the two variables below;
    # presumably education level and household spending - confirm.
    'highschool_completed_pct',
    'household_exp_percapita',
]

# --- TIME PERIOD FOR ANALYSIS ---
# Both bounds are applied inclusively when the data is filtered.
start_year, end_year = 2019, 2023

print(f"\nAnalysis period: {start_year}-{end_year}")


In [None]:
# ============================================================================
# STEP 1: DATA CLEANING & PREPARATION
# ============================================================================

# Keep only rows inside the analysis window (both bounds inclusive)
df_recent = df[(df['year'] >= start_year) & (df['year'] <= end_year)].copy()

print(f"Records in analysis period: {len(df_recent)}")
print(f"Countries: {df_recent['country_name'].nunique()}")

# --- AGGREGATE BY COUNTRY (mean over the analysis window) ---
all_vars = targets + policy_drivers + context_vars
existing_cols = [c for c in all_vars if c in df_recent.columns]

# Variables that are absent from the dataset are skipped by the filter above;
# report them so that a renamed or missing column does not go unnoticed.
missing_cols = [c for c in all_vars if c not in df_recent.columns]
if missing_cols:
    print(f"WARNING: expected columns not found in dataset (skipped): {missing_cols}")

df_avg = df_recent.groupby('country_name')[existing_cols].mean().reset_index()

# Exclude Luxembourg (data issues)
df_avg = df_avg[df_avg['country_name'] != 'Luxembourg']

print(f"\nCountries before dropping NAs: {len(df_avg)}")
print(f"Missing data per variable (%):")
print((df_avg[existing_cols].isna().sum() / len(df_avg) * 100).sort_values(ascending=False).head(10))

# For DETAILED analysis (regression): require complete data on the general
# recycling rate and on every policy driver. Other columns (material-specific
# targets, contextual variables) may still contain NaN in the result.
regression_cols = ['recycling_rate'] + policy_drivers

# Fail early with an explicit message instead of an opaque KeyError from dropna
missing_regression_cols = [c for c in regression_cols if c not in df_avg.columns]
if missing_regression_cols:
    raise KeyError(
        f"Regression requires columns that are missing from the dataset: {missing_regression_cols}"
    )

df_clean_regression = df_avg.dropna(subset=regression_cols).copy()

print(f"\nCountries with complete data for regression: {len(df_clean_regression)}")
print("✓ Data ready for analysis")

In [None]:
# ============================================================================
# STEP 2: EXPLORATORY ANALYSIS - Correlation & Country Clustering
# ============================================================================

# --- 2.1: CORRELATION WITH RECYCLING RATE (all available data) ---
print("\n=== CORRELATION WITH GENERAL RECYCLING RATE ===\n")

# Country-level means over every year present in df (not restricted to the
# analysis window, and Luxembourg is not excluded here).
corr_cols = [c for c in targets + policy_drivers + context_vars if c in df.columns]
df_corr_base = df.groupby('country_name')[corr_cols].mean().reset_index()

correlations = []
for var in policy_drivers + context_vars:
    if var not in df_corr_base.columns:
        continue

    # Pairwise-complete observations: a country is dropped only when it lacks
    # the general recycling rate or this particular variable, rather than when
    # any unrelated column (e.g. a material-specific rate) is missing.
    pair = df_corr_base[['recycling_rate', var]].dropna()

    # Pearson's r needs several points and non-constant inputs to be meaningful
    if len(pair) < 3 or pair[var].nunique() < 2 or pair['recycling_rate'].nunique() < 2:
        continue

    r, p = pearsonr(pair['recycling_rate'], pair[var])
    sig = "***" if p < 0.001 else "**" if p < 0.01 else "*" if p < 0.05 else ""
    correlations.append({
        'Variable': var,
        'Correlation': r,
        'P-value': p,
        'Significant': sig,
        'N': len(pair)  # sample size can differ per variable under pairwise deletion
    })

# Explicit columns keep sort_values working even if no variable qualified
corr_df = pd.DataFrame(
    correlations,
    columns=['Variable', 'Correlation', 'P-value', 'Significant', 'N']
).sort_values('Correlation', ascending=False)
print(corr_df.to_string(index=False))

# --- 2.2: CLUSTERING ANALYSIS (identify country groups) ---
print("\n=== COUNTRY CLUSTERING ===\n")

features_cluster = ['gdp_per_capita', 'recycling_rate', 'government_effectiveness_estimate']

# df_clean_regression is only guaranteed to be complete on the regression
# columns, and gdp_per_capita is not one of them. KMeans does not accept NaN,
# so only countries with all three clustering features are clustered.
X_cluster = df_clean_regression[features_cluster].dropna().copy()

# Put the features on a common scale so that no single one dominates the distances
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cluster)

# Fixed seed and explicit n_init for reproducible cluster assignments
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
cluster_labels = pd.Series(kmeans.fit_predict(X_scaled), index=X_cluster.index)

# Index-aligned assignment: countries that could not be clustered get NaN.
# When every row is complete this is identical to assigning the raw label array.
df_clean_regression['cluster'] = cluster_labels

# Scatter of wealth against recycling performance, coloured by cluster label
plt.figure(figsize=(12, 8))
marker_style = dict(s=200, alpha=0.7, edgecolor='black', linewidth=1)
cluster_ax = sns.scatterplot(
    data=df_clean_regression,
    x='gdp_per_capita',
    y='recycling_rate',
    hue='cluster',
    palette='Set2',
    **marker_style
)

# Annotate a hand-picked set of countries so the plot stays readable
key_countries = ['Austria', 'Germany', 'Belgium', 'Spain', 'Italy', 'Poland', 'Romania', 'Bulgaria']
labelled = df_clean_regression[df_clean_regression['country_name'].isin(key_countries)]
for name, gdp, rate in zip(labelled['country_name'],
                           labelled['gdp_per_capita'],
                           labelled['recycling_rate']):
    cluster_ax.annotate(
        name,
        xy=(gdp, rate),
        xytext=(5, 5),
        textcoords='offset points',
        fontsize=9,
        weight='bold'
    )

# Titles, axis labels and legend
cluster_ax.set_title("Country Clusters: Wealth vs. Recycling Performance", fontsize=14, fontweight='bold')
cluster_ax.set_xlabel("GDP per Capita (€)")
cluster_ax.set_ylabel("General Recycling Rate (%)")
cluster_ax.grid(alpha=0.3)
cluster_ax.legend(title='Cluster', loc='best')

# Write the figure to disk before displaying it
plt.tight_layout()
plt.savefig("01_country_clusters.png", dpi=300, bbox_inches='tight')
plt.show()

# Mean of each clustering feature within every cluster
cluster_profiles = df_clean_regression.groupby('cluster')[features_cluster].mean()
print("\nCluster Profiles:")
print(cluster_profiles)

In [None]:
# ============================================================================
# STEP 3: REGRESSION ANALYSIS - Policy Impact on Recycling Rates
# ============================================================================

print("\n=== OLS REGRESSION: Impact of Policy Drivers on General Recycling Rate ===\n")

# Outcome and explanatory variables (policy-relevant drivers only)
target = 'recycling_rate'
predictors = policy_drivers

# Response and design matrix come from the complete-case country table
y = df_clean_regression[target].copy()
X = df_clean_regression[predictors].copy()

print(f"Sample size: {len(X)} countries")
print(f"Predictors: {predictors}\n")

# z-score every predictor column so coefficients are comparable across drivers,
# then prepend the intercept column. The response y is left on its original scale.
# NOTE(review): both tax variables enter the model together - worth checking
# for collinearity before interpreting their individual coefficients.
X_std = sm.add_constant(X.sub(X.mean()).div(X.std()))

# Ordinary least squares fit and the full statsmodels report
model = sm.OLS(y, X_std).fit()
print(model.summary())

# --- Visualization: Coefficient Magnitudes ---
# One bar per predictor (intercept excluded), ordered by coefficient value
coefs = model.params.drop('const').sort_values()

fig, ax = plt.subplots(figsize=(10, 5))
# Green for positive effects, red for negative (zero counts as negative here)
colors = ['#2ecc71' if x > 0 else '#e74c3c' for x in coefs.values]
coefs.plot(kind='barh', ax=ax, color=colors, edgecolor='black')

ax.set_title("Policy Impact on Recycling Rate\n(Standardized Coefficients)", fontsize=14, fontweight='bold')
# Only the predictors were standardized; the response was not. Each bar is
# therefore expressed in recycling-rate units per 1 std. dev. of the predictor,
# not in standard deviations of the outcome.
ax.set_xlabel("Change in recycling rate (% points) per 1 std. dev. increase in predictor")
ax.axvline(0, color='black', linewidth=1, linestyle='--')
ax.grid(alpha=0.3, axis='x')

plt.tight_layout()
plt.savefig("02_policy_impact_regression.png", dpi=300, bbox_inches='tight')
plt.show()

# Plain-text recap of the fitted coefficients, framed by a 70-character rule
rule = "=" * 70
print("\n" + rule)
print("INTERPRETATION: Standardized Coefficients")
print(rule)
print("Each coefficient shows the change in recycling rate (%) per 1 std dev")
print("increase in the predictor, holding other variables constant.\n")

# One line per predictor: name padded to 40 characters, signed value with 3 decimals
for driver_name, beta in coefs.items():
    print(f"  {driver_name:40s}: {beta:+.3f}")

In [None]:
# ============================================================================
# STEP 4: DETAILED CORRELATIONS - All Recycling Types vs All Drivers
# ============================================================================

print("\n=== CORRELATION MATRIX: All Recycling Targets vs All Drivers ===\n")


def _significance_stars(p_value):
    """Map a p-value to the star notation used in the heatmap labels."""
    if p_value < 0.001:
        return "***"
    if p_value < 0.01:
        return "**"
    if p_value < 0.05:
        return "*"
    return ""


# Country-level means over all years in df; only countries with a value for
# every target and every driver are kept, so all cells share one sample.
df_full_corr = df.groupby('country_name')[targets + policy_drivers + context_vars].mean().reset_index().dropna()

# Pearson correlation and p-value for every (target, driver) combination
available_cols = set(df_full_corr.columns)
corr_results = []
for t in targets:
    if t not in available_cols:
        continue
    for d in policy_drivers + context_vars:
        if d not in available_cols:
            continue
        r, p = pearsonr(df_full_corr[d], df_full_corr[t])
        corr_results.append({
            'Target': t,
            'Driver': d,
            'r': r,
            'p': p,
            'label': f"{r:.2f}{_significance_stars(p)}"
        })

df_corr_results = pd.DataFrame(corr_results)

# Reshape to Driver x Target grids: numeric r for the colours, text label for the annotations
pivot_r, pivot_lbl = (
    df_corr_results.pivot(index='Driver', columns='Target', values=value_col)
    for value_col in ('r', 'label')
)

# Clean labels for visualization
# Column name -> short display name for each recycling target
rename_targets = {
    'recycling_rate': 'General',
    'recycling_rate_glass': 'Glass',
    'recycling_rate_metallic': 'Metal',
    'recycling_rate_packaging': 'Packaging',
    'recycling_rate_paper': 'Paper',
    'recycling_rate_plastic': 'Plastic',
    'recycling_rate_wooden': 'Wood'
}

# Column name -> display name for every policy driver and contextual variable.
# Each driver used in the correlation matrix needs an entry here; otherwise its
# raw column name ends up as a row label on the heatmap.
rename_drivers = {
    'total_environm_tax_per_capita': 'Total Env Tax per Capita',
    'priv_inv_per_capita': 'Private Investment',
    'government_effectiveness_estimate': 'Gov. Effectiveness',
    'gdp_per_capita': 'GDP per Capita',
    'gr_val_add_per_capita': 'Value Added per Capita',
    'urban_population_pct': 'Urban Population %',
    'renewable_energy_pct': 'Renewable Energy %',
    'pollut_environm_tax_per_capita': 'Pollution Tax per Capita',
    'highschool_completed_pct': 'High School Completed %',
    'household_exp_percapita': 'Household Exp. per Capita'
}

# Swap raw column names for display names on both grids
display_names = dict(index=rename_drivers, columns=rename_targets)
pivot_r_renamed = pivot_r.rename(**display_names)
pivot_lbl_renamed = pivot_lbl.rename(**display_names)

# Heatmap: cell colour encodes r, cell text shows r plus significance stars
heatmap_title = (
    "Correlation Matrix: Recycling Outcomes vs. Socio-Economic Drivers\n"
    "(*** p<0.001, ** p<0.01, * p<0.05)"
)
axis_label_style = dict(fontsize=11, fontweight='bold')
heatmap_options = dict(
    annot=pivot_lbl_renamed.values,  # pre-formatted strings, hence the empty fmt
    fmt='',
    cmap='RdYlGn',
    center=0,
    vmin=-1,
    vmax=1,
    cbar_kws={"label": "Pearson Correlation (r)"},
    linewidths=1,
    linecolor='gray',
)

fig, ax = plt.subplots(figsize=(12, 7))
sns.heatmap(pivot_r_renamed, ax=ax, **heatmap_options)

ax.set_title(heatmap_title, fontsize=13, fontweight='bold', pad=20)
ax.set_xlabel("Recycling Metrics (Targets)", **axis_label_style)
ax.set_ylabel("Socio-Economic Variables", **axis_label_style)

# Slanted column labels, horizontal row labels
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)

plt.tight_layout()
plt.savefig("03_correlation_matrix.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# ============================================================================
# STEP 5: SCATTER PLOTS - Policy Drivers vs Recycling Rate
# ============================================================================

print("\n=== POLICY-LEVEL ANALYSIS: Driver-by-Driver Breakdown ===\n")

plot_vars = policy_drivers

# Axis label per driver, keyed by column name. Looking labels up by name (and
# not by position) keeps every panel correctly labelled when policy_drivers
# changes; a driver without an entry falls back to its raw column name.
driver_labels = {
    'total_environm_tax_per_capita': 'Total Environmental Tax per Capita (€)',
    'pollut_environm_tax_per_capita': 'Pollution Tax per Capita (€)',
    'priv_inv_per_capita': 'Private Investment per Capita (€)',
    'government_effectiveness_estimate': 'Government Effectiveness Score (-2.5 to +2.5)'
}
plot_titles = [driver_labels.get(var, var) for var in plot_vars]

# One panel per driver, so that no driver is silently left out
n_panels = len(plot_vars)
fig, axes = plt.subplots(1, n_panels, figsize=(5.5 * n_panels, 4), squeeze=False)
axes = axes.ravel()  # flat array of Axes regardless of the number of panels

df_plot = df_clean_regression.copy()

for ax, var, title in zip(axes, plot_vars, plot_titles):
    # Use the same complete pairs for the points, the trend line and r
    pair = df_plot[[var, 'recycling_rate']].dropna()

    ax.scatter(pair[var], pair['recycling_rate'],
               s=150, alpha=0.6, edgecolor='black', linewidth=1, color='#3498db')

    # Add regression line (degree-1 least-squares fit)
    z = np.polyfit(pair[var], pair['recycling_rate'], 1)
    trend = np.poly1d(z)
    x_line = np.linspace(pair[var].min(), pair[var].max(), 100)
    ax.plot(x_line, trend(x_line), 'r--', linewidth=2, alpha=0.7, label='Trend')

    # Correlation and its significance marker ("ns" = not significant at 5%)
    r, p_val = pearsonr(pair[var], pair['recycling_rate'])
    sig = "***" if p_val < 0.001 else "**" if p_val < 0.01 else "*" if p_val < 0.05 else "ns"

    ax.set_xlabel(title, fontsize=11, fontweight='bold')
    ax.set_ylabel('General Recycling Rate (%)', fontsize=11, fontweight='bold')
    ax.set_title(f'r = {r:.3f} {sig}', fontsize=12, fontweight='bold', color='#e74c3c')
    ax.grid(alpha=0.3)

plt.suptitle('Impact of Policy Drivers on Recycling Rates', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
# bbox_inches='tight' is required so the suptitle placed above the figure is kept
plt.savefig("04_policy_scatter_plots.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Variables shown in the pairwise-relationship grid
vars_plot = ['recycling_rate', 'gdp_per_capita', 'government_effectiveness_estimate', 'priv_inv_per_capita']

# Country-level means over all years in df, keeping only complete rows.
# NOTE(review): this rebinds df_avg, replacing the Step 1 table (analysis
# window only, Luxembourg excluded) with an all-years aggregate that still
# includes Luxembourg. Re-run Step 1 before re-running any cell that expects
# the earlier df_avg.
df_avg = df.groupby('country_name')[vars_plot].mean().reset_index().dropna()

# Human-readable names for the plot axes
rename_map = {
    'recycling_rate': 'Recycling Rate (%)',
    'gdp_per_capita': 'GDP per Capita',
    'government_effectiveness_estimate': 'Gov. Effectiveness',
    'priv_inv_per_capita': 'Private Inv. (Capita)'
}
df_vis = df_avg.rename(columns=rename_map)

# --- PAIRPLOT: scatter + linear fit off the diagonal, KDE on the diagonal ---
sns.set(style="ticks")
g = sns.pairplot(
    df_vis.drop(columns=['country_name']),
    diag_kind="kde",
    kind="reg",
    plot_kws={'line_kws': {'color': 'red'}, 'scatter_kws': {'alpha': 0.6}}
)

g.fig.suptitle("Multivariate Relationships: Key Drivers vs Recycling", y=1.02, fontsize=16, fontweight='bold')
# The suptitle sits above the figure area (y=1.02); without bbox_inches='tight'
# it is cut off in the saved file. dpi matches the other exported figures.
plt.savefig("mulvar_rel.png", dpi=300, bbox_inches='tight')
plt.show()