In [24]:
"""
IMPROVED ML ANALYSIS FOR ECONOMIC COMPLEXITY
- Panel data (all years, not just 2019)
- Temporal train/test split (train: 1995-2013, test: 2014-2019)
- Three model specifications: Resource Curse (baseline), Resource Curse with interactions, Full Structural
- High-resource-country dummy and Human Capital x Total Resources interaction
- SHAP for interpretability
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
import lightgbm as lgb
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import os

# Suppress every warning raised through the `warnings` module for the rest of
# the run. This keeps the console output short but also hides library
# warnings that may matter while debugging.
warnings.filterwarnings('ignore')

# ============================================================================
# CONFIGURATION
# ============================================================================

# Absolute, machine-specific paths: the two input CSVs and the directory that
# receives every chart and table written further down.
input_file = "/Users/leoss/Desktop/GitHub/Capstone/MASTER/Master.csv"
production_file = "/Users/leoss/Desktop/GitHub/Capstone/MASTER/NaturalResource.csv"
output_dir = "/Users/leoss/Desktop/Portfolio/Website-/capstone_visualizations/individual_plots/ml"

# Create the output directory (and parents) up front; no error if it exists.
os.makedirs(output_dir, exist_ok=True)

# Temporal split configuration
# Rows with Year <= TRAIN_END_YEAR are used for training, later years for testing.
TRAIN_END_YEAR = 2013  # Train: <=2013, Test: 2014-2019

# High resource country list
# Codes are matched against the 'Country Code' column to build the
# High_Resource dummy (they appear to be ISO alpha-3 codes - confirm against
# the master file).
HIGH_RESOURCE_COUNTRIES = [
    'AGO', 'ARE', 'AZE', 'BFA', 'BHR', 'BOL', 'CHL', 'CIV', 'CMR',
    'COD', 'COG', 'DZA', 'ECU', 'EGY', 'ETH', 'GAB', 'GHA', 'GIN',
    'GNQ', 'IDN', 'IRN', 'IRQ', 'KAZ', 'KEN', 'KWT', 'LAO', 'LBR',
    'LBY', 'MDG', 'MLI', 'MMR', 'MNG', 'MOZ', 'MWI', 'MYS', 'NER',
    'NGA', 'OMN', 'PNG', 'QAT', 'RUS', 'RWA', 'SAU', 'TCD', 'TGO',
    'TTO', 'TZA', 'UGA', 'UZB', 'VEN', 'VNM', 'YEM', 'ZMB', 'ZWE'
]

# Run banner
print("="*70)
print("ML ANALYSIS: PREDICTING ECONOMIC COMPLEXITY")
print(f"Panel Data with Temporal Split (Train: ‚â§{TRAIN_END_YEAR}, Test: {TRAIN_END_YEAR+1}+)")
print("Including High Resource Country Interactions")
print("="*70)

# ============================================================================
# 1. DEFINE HELPER FUNCTIONS AND MAPPINGS FIRST
# ============================================================================

def clean_name(s):
    """Return *s* rewritten as a simplified column name.

    The dash marker is turned into a plain hyphen, parentheses and commas
    are removed, and ``%`` is spelled out as ``pct``. Substitutions are
    applied in the order listed below.
    """
    substitutions = (
        ('‚Äî', '-'),
        ('(', ''),
        (')', ''),
        ('%', 'pct'),
        (',', ''),
    )
    cleaned = s
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

# Human-readable labels for model features, used when building chart axes.
# Keys are passed through clean_name() further down before being used for
# lookups; features without an entry fall back to their raw column name.
feature_names_display = {
    'Oil_GDP_Pct': 'Oil (% GDP)',
    'Natural Gas_GDP_Pct': 'Natural Gas (% GDP)',
    'Coal_GDP_Pct': 'Coal (% GDP)',
    'Metals_GDP_Pct': 'Metals (% GDP)',
    'Human capital index': 'Human Capital',
    'Rule of law index': 'Rule of Law',
    'Property rights': 'Property Rights',
    'Political corruption index': 'Political Corruption',
    'Political stability - estimate': 'Political Stability',
    'Landlocked': 'Landlocked',
    'Manufacturing': 'Manufacturing (% GDP)',
    'Agriculture': 'Agriculture (% GDP)',
    'Trade pct of GDP': 'Trade Openness',
    'Gross fixed capital formation all Constant prices Percent of GDP': 'Investment (% GDP)',
    'Access to electricity pct of population': 'Electricity Access',
    'Urban population pct of total population': 'Urbanization',
    'Domestic credit to private sector pct of GDP': 'Private Credit',
    'Inflation consumer prices annual pct': 'Inflation',
    # Interaction terms
    # NOTE(review): the "*_x_HighRes" keys have no matching columns created in
    # the visible part of this script; only High_Resource and
    # HCI_x_TotalResources are built. Presumably left over from an earlier
    # specification - confirm before relying on them.
    'High_Resource': 'High Resource Country',
    'Oil_GDP_Pct_x_HighRes': 'Oil √ó High Resource',
    'NatGas_GDP_Pct_x_HighRes': 'Nat Gas √ó High Resource',
    'Coal_GDP_Pct_x_HighRes': 'Coal √ó High Resource',
    'Metals_GDP_Pct_x_HighRes': 'Metals √ó High Resource',
    'Total_Resources_x_HighRes': 'Total Resources √ó High Resource',
    'HCI_x_TotalResources': 'Human Capital √ó Total Resources',
}

# ============================================================================
# 2. LOAD AND PREPARE DATA
# ============================================================================

print("\n1. Loading data...")

# Master panel: read it, then normalise the column headers so they match the
# cleaned feature names used throughout the rest of the script.
df_master = pd.read_csv(input_file)
df_master.columns = list(map(clean_name, df_master.columns))

# Natural-resource production data (column names are left as they are).
df_prod = pd.read_csv(production_file)

n_rows = len(df_master)
n_countries = df_master['Country Code'].nunique()
first_year = df_master['Year'].min()
last_year = df_master['Year'].max()
print(f"   Master data: {n_rows} rows, {n_countries} countries")
print(f"   Years: {first_year} - {last_year}")

# Process production data to get resource categories
def categorize_resource(resource):
    """Map a raw resource label to one of the four modelled categories.

    'Oil', 'Natural Gas' and 'Coal' map to themselves; every other value is
    grouped under 'Metals'.
    """
    for category in ('Oil', 'Natural Gas', 'Coal'):
        if resource == category:
            return category
    return 'Metals'

# The four resource categories produced by categorize_resource(); every one
# of them must end up as a column in `df`, even if the production file holds
# no rows for it.
RESOURCE_CATEGORIES = ['Oil', 'Natural Gas', 'Coal', 'Metals']

df_prod['Resource_Category'] = df_prod['Resource'].apply(categorize_resource)

# Total production value per country / year / category, then one column per
# category (country-years without production in a category get 0).
prod_agg = (
    df_prod
    .groupby(['Country Name', 'Year', 'Resource_Category'])['Production_TotalValue']
    .sum()
    .reset_index()
)
prod_wide = prod_agg.pivot_table(
    index=['Country Name', 'Year'],
    columns='Resource_Category',
    values='Production_TotalValue',
    fill_value=0
).reset_index()

# Merge production with master
# NOTE(review): the join key is 'Country Name', so any spelling difference
# between the two files silently becomes zero production after the fill
# below - consider joining on a country code if the production file has one.
df = df_master.merge(prod_wide, on=['Country Name', 'Year'], how='left')

# Fill missing production values with 0. A category that is entirely absent
# from the production file is created as an all-zero column so the share and
# total calculations below cannot fail with a KeyError.
for col in RESOURCE_CATEGORIES:
    if col in df.columns:
        df[col] = df[col].fillna(0)
    else:
        df[col] = 0.0

# Calculate GDP-normalized resource values
df['GDP_total'] = df['GDP per capita constant prices PPP'] * df['Population']
for res in RESOURCE_CATEGORIES:
    share = (df[res] / df['GDP_total']) * 100
    # Division by a zero GDP yields +/-inf; treat those as missing.
    df[f'{res}_GDP_Pct'] = share.replace([np.inf, -np.inf], np.nan)

# Row-wise total across the four shares (NaN shares are skipped by sum()).
df['Total_Resources_GDP_Pct'] = df[
    [f'{res}_GDP_Pct' for res in RESOURCE_CATEGORIES]
].sum(axis=1)

# ============================================================================
# 3. CREATE HIGH RESOURCE DUMMY AND INTERACTIONS
# ============================================================================

print("\n2. Creating high resource country dummy and interactions...")

# Dummy: 1 when the country code is on the high-resource list, 0 otherwise
is_high_resource = df['Country Code'].isin(HIGH_RESOURCE_COUNTRIES)
df['High_Resource'] = is_high_resource.astype(int)

# Human capital x total resource share interaction
# (tests if human capital returns differ by resource intensity)
df['HCI_x_TotalResources'] = df['Human capital index'] * df['Total_Resources_GDP_Pct']

# Distinct countries in each group
n_high_res = df.loc[df['High_Resource'] == 1, 'Country Code'].nunique()
n_other = df.loc[df['High_Resource'] == 0, 'Country Code'].nunique()
print(f"   High resource countries: {n_high_res}")
print(f"   Other countries: {n_other}")

print(f"   Merged data: {len(df)} rows")

# ============================================================================
# 4. DEFINE FEATURE SETS
# ============================================================================

print("\n3. Defining feature sets...")

# Target variable
target = 'Economic Complexity Index'

# Model 1: Resource Curse (baseline - no interactions)
# Names are given in their raw form and cleaned with clean_name() below.
features_resource_curse_raw = [
    'Oil_GDP_Pct',
    'Natural Gas_GDP_Pct', 
    'Coal_GDP_Pct',
    'Metals_GDP_Pct',
    'Human capital index',
    'Rule of law index',
    'Property rights',
    'Landlocked'
]

# Model 2: Resource Curse WITH High Resource dummy and interactions
# Same columns as Model 1 plus the High_Resource dummy and the
# human-capital x total-resources interaction. Used as-is (not cleaned).
features_rc_interactions = [
    'Oil_GDP_Pct',
    'Natural Gas_GDP_Pct', 
    'Coal_GDP_Pct',
    'Metals_GDP_Pct',
    'High_Resource',
    'Human capital index',
    'HCI_x_TotalResources',  
    'Rule of law index',
    'Property rights',
    'Landlocked'
]

# Model 3: Full Structural (no interactions)
# Raw (uncleaned) column names; clean_name() is applied below so they match
# the cleaned headers of the master data.
features_full_raw = [
    'Oil_GDP_Pct',
    'Natural Gas_GDP_Pct', 
    'Coal_GDP_Pct',
    'Metals_GDP_Pct',
    'Manufacturing',
    'Agriculture',
    'Trade (% of GDP)',
    'Gross fixed capital formation, all, Constant prices, Percent of GDP',
    'Human capital index',
    'Access to electricity (% of population)',
    'Urban population (% of total population)',
    'Rule of law index',
    'Property rights',
    'Political stability ‚Äî estimate',
    'Domestic credit to private sector (% of GDP)',
    'Inflation, consumer prices (annual %)',
    'Landlocked',
]

# Clean feature names to match cleaned column names
features_resource_curse = [clean_name(f) for f in features_resource_curse_raw]
features_full = [clean_name(f) for f in features_full_raw]
# Interaction features don't need cleaning (already clean)

# Also clean the display names dictionary keys
feature_names_clean = {clean_name(k): v for k, v in feature_names_display.items()}

# Verify features exist in dataframe
# This only reports missing columns; it does not stop the run, so a missing
# feature surfaces later as a KeyError when the model frames are built.
print("\nChecking feature availability...")
all_features = set(features_resource_curse + features_rc_interactions + features_full)
missing = [f for f in all_features if f not in df.columns]

if missing:
    print(f"   ‚ö† Missing: {missing}")
else:
    print("   ‚úì All features found")

print(f"\n   Resource Curse (baseline): {len(features_resource_curse)} features")
print(f"   Resource Curse (with interactions): {len(features_rc_interactions)} features (incl. HCI√óResources)")
print(f"   Full Structural: {len(features_full)} features")

# ============================================================================
# 5. PREPARE DATASETS WITH TEMPORAL SPLIT
# ============================================================================

print("\n4. Preparing train/test split...")


def _temporal_split(feature_list):
    """Build the complete-case panel for *feature_list* and split it by year.

    Returns ``(panel, train, test, X_train, y_train, X_test, y_test)`` where
    training rows have ``Year <= TRAIN_END_YEAR`` and test rows are later.
    """
    panel = df[['Country Code', 'Country Name', 'Year', target] + feature_list].copy()
    panel = panel.dropna()
    train = panel[panel['Year'] <= TRAIN_END_YEAR]
    test = panel[panel['Year'] > TRAIN_END_YEAR]
    return (
        panel, train, test,
        train[feature_list], train[target],
        test[feature_list], test[target],
    )


def _standardize(X_train, X_test):
    """Fit a StandardScaler on the training features and apply it to both sets."""
    scaler = StandardScaler()
    X_train_std = scaler.fit_transform(X_train)
    X_test_std = scaler.transform(X_test)
    return scaler, X_train_std, X_test_std


# --- Resource Curse Baseline ---
(df_rc, train_rc, test_rc,
 X_train_rc, y_train_rc,
 X_test_rc, y_test_rc) = _temporal_split(features_resource_curse)

print("\n   RESOURCE CURSE (BASELINE):")
print(f"   Train: {len(train_rc)} obs, Test: {len(test_rc)} obs")

# --- Resource Curse WITH Interactions ---
(df_rc_int, train_rc_int, test_rc_int,
 X_train_rc_int, y_train_rc_int,
 X_test_rc_int, y_test_rc_int) = _temporal_split(features_rc_interactions)

high_res_test_rows = test_rc_int['Country Code'].isin(HIGH_RESOURCE_COUNTRIES)
n_high_res_test = test_rc_int[high_res_test_rows]['Country Code'].nunique()

print("\n   RESOURCE CURSE (WITH INTERACTIONS):")
print(f"   Train: {len(train_rc_int)} obs, Test: {len(test_rc_int)} obs")
print(f"   High Resource in test: {n_high_res_test} countries")

# --- Full Model ---
(df_full, train_full, test_full,
 X_train_full, y_train_full,
 X_test_full, y_test_full) = _temporal_split(features_full)

print("\n   FULL STRUCTURAL MODEL:")
print(f"   Train: {len(train_full)} obs, Test: {len(test_full)} obs")

# Scale features (each scaler is fitted on its own training set only)
scaler_rc, X_train_rc_scaled, X_test_rc_scaled = _standardize(X_train_rc, X_test_rc)
scaler_rc_int, X_train_rc_int_scaled, X_test_rc_int_scaled = _standardize(
    X_train_rc_int, X_test_rc_int
)
scaler_full, X_train_full_scaled, X_test_full_scaled = _standardize(
    X_train_full, X_test_full
)

# ============================================================================
# 6. TRAIN MODELS
# ============================================================================

print("\n5. Training models...")


def _fit_and_score(model, X_fit, X_eval, y_train, y_test, *, scaled, attr_name, result_key):
    """Fit *model*, score it on train and test data, and return its result record.

    If the fitted model exposes ``attr_name`` (e.g. ``coef_``), its value is
    stored in the record under ``result_key``.
    """
    model.fit(X_fit, y_train)
    y_pred_train = model.predict(X_fit)
    y_pred_test = model.predict(X_eval)

    record = {
        'model': model,
        'train_r2': r2_score(y_train, y_pred_train),
        'test_r2': r2_score(y_test, y_pred_test),
        'test_rmse': np.sqrt(mean_squared_error(y_test, y_pred_test)),
        'test_mae': mean_absolute_error(y_test, y_pred_test),
        'predictions': y_pred_test,
        'scaled': scaled
    }

    if hasattr(model, attr_name):
        record[result_key] = getattr(model, attr_name)

    return record


def train_and_evaluate(X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled, model_name):
    """Train the linear and tree-based model families and return their results.

    Args:
        X_train, X_test: unscaled feature matrices, used by the tree models.
        y_train, y_test: target values for the two splits.
        X_train_scaled, X_test_scaled: standardized feature matrices, used by
            the regularized linear models.
        model_name: label of the specification. Currently unused; kept so
            existing calls keep working.

    Returns:
        dict keyed by model name. Each value holds the fitted ``model``,
        ``train_r2``, ``test_r2``, ``test_rmse``, ``test_mae``, the test
        ``predictions`` and a ``scaled`` flag. Linear models add
        ``coefficients``; tree models add ``feature_importance``.
    """
    # Regularized linear models: fitted on standardized features
    models_scaled = {
        'Ridge': Ridge(alpha=1.0, random_state=42),
        'Lasso': Lasso(alpha=0.1, random_state=42),
        'ElasticNet': ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)
    }

    # Tree ensembles: fitted on the raw features
    models_unscaled = {
        'Random Forest': RandomForestRegressor(
            n_estimators=200, max_depth=10, min_samples_split=10,
            min_samples_leaf=5, random_state=42, n_jobs=-1
        ),
        'Gradient Boosting': GradientBoostingRegressor(
            n_estimators=200, learning_rate=0.05, max_depth=5,
            min_samples_split=10, min_samples_leaf=5, random_state=42
        ),
        'XGBoost': xgb.XGBRegressor(
            n_estimators=200, learning_rate=0.05, max_depth=5,
            min_child_weight=5, subsample=0.8, colsample_bytree=0.8,
            random_state=42, objective='reg:squarederror', verbosity=0
        ),
        'LightGBM': lgb.LGBMRegressor(
            n_estimators=200, learning_rate=0.05, max_depth=5,
            num_leaves=31, min_child_samples=10, subsample=0.8,
            colsample_bytree=0.8, random_state=42, verbose=-1
        )
    }

    results = {}

    # Linear models first, then tree models, so the result order is stable.
    for name, model in models_scaled.items():
        results[name] = _fit_and_score(
            model, X_train_scaled, X_test_scaled, y_train, y_test,
            scaled=True, attr_name='coef_', result_key='coefficients'
        )

    for name, model in models_unscaled.items():
        results[name] = _fit_and_score(
            model, X_train, X_test, y_train, y_test,
            scaled=False, attr_name='feature_importances_',
            result_key='feature_importance'
        )

    return results

# Fit every model family once per specification
print("\n   Training Resource Curse (baseline)...")
results_rc = train_and_evaluate(
    X_train=X_train_rc,
    X_test=X_test_rc,
    y_train=y_train_rc,
    y_test=y_test_rc,
    X_train_scaled=X_train_rc_scaled,
    X_test_scaled=X_test_rc_scaled,
    model_name="Resource Curse",
)

print("   Training Resource Curse (with interactions)...")
results_rc_int = train_and_evaluate(
    X_train=X_train_rc_int,
    X_test=X_test_rc_int,
    y_train=y_train_rc_int,
    y_test=y_test_rc_int,
    X_train_scaled=X_train_rc_int_scaled,
    X_test_scaled=X_test_rc_int_scaled,
    model_name="Resource Curse + Interactions",
)

print("   Training Full Structural...")
results_full = train_and_evaluate(
    X_train=X_train_full,
    X_test=X_test_full,
    y_train=y_train_full,
    y_test=y_test_full,
    X_train_scaled=X_train_full_scaled,
    X_test_scaled=X_test_full_scaled,
    model_name="Full Structural",
)

# ============================================================================
# 7. CROSS-VALIDATION
# ============================================================================

print("\n6. Cross-validation (5-fold)...")

# NOTE(review): shuffled K-fold treats country-year rows as exchangeable, so
# rows of the same country (and later years) can sit on both sides of a fold.
# A grouped or time-ordered splitter would give a stricter estimate.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

def run_cv(results, X_train, X_train_scaled, y_train):
    """Cross-validate every model in *results* on the training data.

    Models flagged ``scaled`` are wrapped in a pipeline that standardizes the
    features inside each fold. Previously they were scored on
    ``X_train_scaled``, which was standardized with statistics from the whole
    training set, so every validation fold leaked into the preprocessing.

    Args:
        results: output of train_and_evaluate(); each entry provides
            ``model`` and ``scaled``.
        X_train: unscaled training features.
        X_train_scaled: no longer used; kept so existing calls keep working.
        y_train: training target.

    Returns:
        dict mapping model name to ``{'mean': ..., 'std': ...}`` of the
        per-fold R^2 scores.
    """
    from sklearn.pipeline import make_pipeline

    cv_results = {}
    for name, res in results.items():
        if res['scaled']:
            # cross_val_score clones the estimator for every fold, so the
            # already-fitted model in `results` is not modified.
            estimator = make_pipeline(StandardScaler(), res['model'])
        else:
            estimator = res['model']

        cv_scores = cross_val_score(estimator, X_train, y_train, cv=kf, scoring='r2')
        cv_results[name] = {
            'mean': cv_scores.mean(),
            'std': cv_scores.std()
        }
    return cv_results

cv_rc = run_cv(results_rc, X_train_rc, X_train_rc_scaled, y_train_rc)
cv_rc_int = run_cv(results_rc_int, X_train_rc_int, X_train_rc_int_scaled, y_train_rc_int)
cv_full = run_cv(results_full, X_train_full, X_train_full_scaled, y_train_full)

# ============================================================================
# 8. RESULTS SUMMARY
# ============================================================================

def _comparison_rows(results, cv_summary):
    """Return one summary row (dict) per model, in the order of *results*."""
    return [
        {
            'Model': model_label,
            'Train R¬≤': scores['train_r2'],
            'Test R¬≤': scores['test_r2'],
            'CV R¬≤': cv_summary[model_label]['mean'],
            'Test RMSE': scores['test_rmse'],
            'Overfit': scores['train_r2'] - scores['test_r2']
        }
        for model_label, scores in results.items()
    ]


# --- Resource Curse baseline ---
print("\n" + "="*70)
print("MODEL COMPARISON - RESOURCE CURSE (BASELINE)")
print("="*70)

comparison_rc = _comparison_rows(results_rc, cv_rc)
df_comp_rc = pd.DataFrame(comparison_rc).sort_values('Test R¬≤', ascending=False)
print(df_comp_rc.to_string(index=False))

# --- Resource Curse with interactions ---
print("\n" + "="*70)
print("MODEL COMPARISON - RESOURCE CURSE (WITH INTERACTIONS)")
print("="*70)

comparison_rc_int = _comparison_rows(results_rc_int, cv_rc_int)
df_comp_rc_int = pd.DataFrame(comparison_rc_int).sort_values('Test R¬≤', ascending=False)
print(df_comp_rc_int.to_string(index=False))

# --- Full structural model ---
print("\n" + "="*70)
print("MODEL COMPARISON - FULL STRUCTURAL MODEL")
print("="*70)

comparison_full = _comparison_rows(results_full, cv_full)
df_comp_full = pd.DataFrame(comparison_full).sort_values('Test R¬≤', ascending=False)
print(df_comp_full.to_string(index=False))

# ============================================================================
# 9. INTERACTION EFFECTS ANALYSIS
# ============================================================================

print("\n" + "="*70)
print("INTERACTION EFFECTS ANALYSIS (RIDGE COEFFICIENTS)")
print("="*70)

# Ridge coefficients for interaction model, largest absolute value first
ridge_coefs_int = pd.DataFrame({
    'Feature': features_rc_interactions,
    'Coefficient': results_rc_int['Ridge']['coefficients']
}).sort_values('Coefficient', key=abs, ascending=False)

print("\n   RESOURCE CURSE + INTERACTIONS MODEL:")
print(ridge_coefs_int.to_string(index=False))

# Interpretation
print("\n" + "-"*50)
print("INTERPRETATION OF INTERACTION TERMS:")
print("-"*50)

# Extract key coefficients
coef_dict = dict(zip(features_rc_interactions, results_rc_int['Ridge']['coefficients']))

# (label, base feature, resource x High Resource interaction feature)
resource_terms = [
    ('Base effect of Oil (all countries):', 'Oil_GDP_Pct', 'Oil_GDP_Pct_x_HighRes'),
    ('Base effect of Natural Gas:', 'Natural Gas_GDP_Pct', 'NatGas_GDP_Pct_x_HighRes'),
    ('Base effect of Metals:', 'Metals_GDP_Pct', 'Metals_GDP_Pct_x_HighRes'),
]

for base_label, base_key, interaction_key in resource_terms:
    base_coef = coef_dict.get(base_key, 0)
    print(f"\n   {base_label:<43}{base_coef:.4f}")
    if interaction_key in coef_dict:
        extra_coef = coef_dict[interaction_key]
        print(f"   Additional effect for High Resource:       {extra_coef:.4f}")
        print(f"   TOTAL effect for High Resource countries:  {base_coef + extra_coef:.4f}")
    else:
        # The term is not part of this specification. Printing 0.0000 here
        # would read as an estimated zero effect, so say so explicitly.
        print("   (no High Resource interaction term for this resource in the model)")

print(f"\n   High Resource dummy (intercept shift):     {coef_dict.get('High_Resource', 0):.4f}")

print(f"\n   HCI √ó Total Resources interaction:         {coef_dict.get('HCI_x_TotalResources', 0):.4f}")
if coef_dict.get('HCI_x_TotalResources', 0) > 0:
    print(f"   ‚Üí Human capital returns AMPLIFIED in resource-rich contexts")
else:
    print(f"   ‚Üí Human capital returns DIMINISHED in resource-rich contexts")

# ============================================================================
# 10. FEATURE IMPORTANCE / COEFFICIENTS
# ============================================================================

print("\n" + "="*70)
print("FEATURE ANALYSIS")
print("="*70)

# Best tree-based model per specification. The comparison tables are already
# sorted by test R^2 (descending), so the first matching row is the best one.
tree_models = ['Random Forest', 'Gradient Boosting', 'XGBoost', 'LightGBM']


def _best_tree_model(comparison_table):
    """Return the name of the first tree model listed in *comparison_table*."""
    is_tree = comparison_table['Model'].isin(tree_models)
    return comparison_table[is_tree].iloc[0]['Model']


best_tree_rc = _best_tree_model(df_comp_rc)
best_tree_rc_int = _best_tree_model(df_comp_rc_int)
best_tree_full = _best_tree_model(df_comp_full)

print(f"\n   Best tree model (RC baseline): {best_tree_rc}")
print(f"   Best tree model (RC + interactions): {best_tree_rc_int}")
print(f"   Best tree model (Full): {best_tree_full}")

# Ridge coefficients for the baseline specification, largest magnitude first
print("\n   RIDGE COEFFICIENTS (Resource Curse Baseline):")
ridge_coefs_rc = pd.DataFrame({
    'Feature': features_resource_curse,
    'Coefficient': results_rc['Ridge']['coefficients']
})
ridge_coefs_rc = ridge_coefs_rc.sort_values('Coefficient', key=abs, ascending=False)
print(ridge_coefs_rc.to_string(index=False))

# ============================================================================
# 11. VISUALIZATIONS
# ============================================================================

print("\n7. Creating visualizations...")

# ------------------------------
# A. MODEL COMPARISON (all 3)
# ------------------------------

fig_comp = make_subplots(
    rows=1, cols=3,
    subplot_titles=('RC Baseline', 'RC + Interactions', 'Full Structural')
)

# Ascending order so the best model ends up at the top of each horizontal bar chart
df_comp_rc_sorted = df_comp_rc.sort_values('Test R¬≤', ascending=True)
df_comp_rc_int_sorted = df_comp_rc_int.sort_values('Test R¬≤', ascending=True)
df_comp_full_sorted = df_comp_full.sort_values('Test R¬≤', ascending=True)

# One panel per specification: (sorted table, bar colour, trace name)
comparison_panels = [
    (df_comp_rc_sorted, '#002A54', 'RC Baseline'),
    (df_comp_rc_int_sorted, '#6B8E23', 'RC + Interactions'),
    (df_comp_full_sorted, '#E30613', 'Full Model'),
]

for panel_col, (panel_table, bar_colour, trace_name) in enumerate(comparison_panels, start=1):
    test_r2_values = panel_table['Test R¬≤']
    fig_comp.add_trace(
        go.Bar(
            y=panel_table['Model'],
            x=test_r2_values,
            orientation='h',
            marker_color=bar_colour,
            text=[f"{x:.3f}" for x in test_r2_values],
            textposition='outside',
            name=trace_name
        ),
        row=1, col=panel_col
    )

fig_comp.update_layout(
    title=dict(text='Model Performance Comparison (Test R¬≤)', x=0.5, font=dict(size=18)),
    height=500,
    showlegend=False,
    template='plotly_white'
)
fig_comp.update_xaxes(range=[0, 1])

comparison_chart_path = os.path.join(output_dir, 'ml_model_comparison.html')
fig_comp.write_html(comparison_chart_path)
print("   ‚úì Model comparison chart saved")

# ------------------------------
# B. INTERACTION COEFFICIENTS PLOT
# ------------------------------

# Display label + Ridge coefficient per feature, most negative first
interaction_labels = [feature_names_clean.get(f, f) for f in features_rc_interactions]
coef_int = pd.DataFrame({
    'Feature': interaction_labels,
    'Coefficient': results_rc_int['Ridge']['coefficients']
})
coef_int = coef_int.sort_values('Coefficient')

# Green bars for positive coefficients, red for the rest
coef_bar_colours = ['#22c55e' if c > 0 else '#ef4444' for c in coef_int['Coefficient']]
coef_bar_labels = [f"{x:.3f}" for x in coef_int['Coefficient']]

fig_coef_int = go.Figure()
fig_coef_int.add_trace(go.Bar(
    y=coef_int['Feature'],
    x=coef_int['Coefficient'],
    orientation='h',
    marker_color=coef_bar_colours,
    text=coef_bar_labels,
    textposition='outside'
))
fig_coef_int.add_vline(x=0, line_dash='dash', line_color='black')
fig_coef_int.update_layout(
    title=dict(
        text='Ridge Coefficients - Resource Curse with Interactions<br><sup>Standardized coefficients</sup>',
        x=0.5
    ),
    xaxis_title='Coefficient (Standardized)',
    height=600,
    template='plotly_white',
    margin=dict(l=200)
)

coef_plot_path = os.path.join(output_dir, 'ml_coefficients_interactions.html')
fig_coef_int.write_html(coef_plot_path)
print("   ‚úì Interaction coefficient plot saved")

# ------------------------------
# C. FEATURE IMPORTANCE (Interaction Model)
# ------------------------------

# Importances of the best tree model for the interaction specification,
# sorted ascending so the most important feature is drawn at the top
importance_labels = [feature_names_clean.get(f, f) for f in features_rc_interactions]
importance_int = pd.DataFrame({
    'Feature': importance_labels,
    'Importance': results_rc_int[best_tree_rc_int]['feature_importance']
})
importance_int = importance_int.sort_values('Importance', ascending=True)

importance_bar_labels = [f"{x:.3f}" for x in importance_int['Importance']]

fig_imp_int = go.Figure()
fig_imp_int.add_trace(go.Bar(
    y=importance_int['Feature'],
    x=importance_int['Importance'],
    orientation='h',
    marker_color='#6B8E23',
    text=importance_bar_labels,
    textposition='outside'
))
fig_imp_int.update_layout(
    title=dict(text=f'Feature Importance - RC + Interactions ({best_tree_rc_int})', x=0.5),
    xaxis_title='Importance',
    height=600,
    template='plotly_white',
    margin=dict(l=200)
)

importance_plot_path = os.path.join(output_dir, 'ml_feature_importance_interactions.html')
fig_imp_int.write_html(importance_plot_path)
print("   ‚úì Feature importance (interactions) saved")

# ------------------------------
# D. SHAP ANALYSIS (Interaction Model)
# ------------------------------

print("\n8. SHAP analysis...")

# SHAP is optional: the import happens inside the try block so the rest of
# the script still runs when the package is missing. Any other failure in
# this section is reported and skipped rather than aborting the run.
try:
    import shap
    
    # Use LightGBM for SHAP (more stable than XGBoost)
    # NOTE: despite the variable name, this is always the LightGBM model of
    # the interaction specification, not the best-scoring model.
    best_model_shap = results_rc_int['LightGBM']['model']
    
    explainer = shap.TreeExplainer(best_model_shap)
    # SHAP values are computed on the test-period features only.
    shap_values = explainer.shap_values(X_test_rc_int.astype(float))
    
    # Global importance: mean absolute SHAP value per feature, sorted
    # ascending so the largest bar is drawn at the top of the chart.
    shap_df = pd.DataFrame({
        'Feature': [feature_names_clean.get(f, f) for f in features_rc_interactions],
        'Mean_SHAP': np.abs(shap_values).mean(axis=0)
    }).sort_values('Mean_SHAP', ascending=True)
    
    fig_shap = go.Figure()
    fig_shap.add_trace(go.Bar(
        y=shap_df['Feature'],
        x=shap_df['Mean_SHAP'],
        orientation='h',
        marker_color='#9b59b6',
        text=[f"{x:.3f}" for x in shap_df['Mean_SHAP']],
        textposition='outside'
    ))
    fig_shap.update_layout(
        title=dict(text='SHAP Feature Importance - RC + Interactions (LightGBM)', x=0.5),
        xaxis_title='Mean |SHAP Value|',
        height=600,
        template='plotly_white',
        margin=dict(l=200)
    )
    fig_shap.write_html(os.path.join(output_dir, 'ml_shap_importance.html'))
    print("   ‚úì SHAP analysis saved")
    
except ImportError:
    print("   ‚ö† SHAP not installed.")
except Exception as e:
    # Best-effort section: report the problem and continue with the run.
    print(f"   ‚ö† SHAP error: {e}")

# ============================================================================
# 12. SAVE RESULTS
# ============================================================================

print("\n9. Saving results...")

# Every table written to the output directory, in order:
# comparison tables, Ridge coefficients, then tree feature importance.
tables_to_save = [
    (df_comp_rc, 'ml_comparison_resource_curse.csv'),
    (df_comp_rc_int, 'ml_comparison_rc_interactions.csv'),
    (df_comp_full, 'ml_comparison_full.csv'),
    (ridge_coefs_rc, 'ml_ridge_coefficients_rc.csv'),
    (ridge_coefs_int, 'ml_ridge_coefficients_interactions.csv'),
    (importance_int, 'ml_feature_importance_interactions.csv'),
]

for table, file_name in tables_to_save:
    table.to_csv(os.path.join(output_dir, file_name), index=False)

print("   ‚úì All results saved")

# ============================================================================
# SUMMARY
# ============================================================================

print("\n" + "="*70)
print("‚úÖ ML ANALYSIS COMPLETE!")
print("="*70)

# Best model per specification = first row of each comparison table
# (the tables are sorted by test R^2, descending).
summary_sections = [
    ("RESOURCE CURSE (BASELINE)", df_comp_rc),
    ("RESOURCE CURSE (WITH INTERACTIONS)", df_comp_rc_int),
    ("FULL STRUCTURAL MODEL", df_comp_full),
]

for section_title, comparison_table in summary_sections:
    best_row = comparison_table.iloc[0]
    print(f"\nüìä {section_title}")
    print(f"   Best model: {best_row['Model']}")
    print(f"   Test R¬≤: {best_row['Test R¬≤']:.3f}")

# Key finding: does adding the interaction terms raise the best test R^2?
baseline_r2 = df_comp_rc.iloc[0]['Test R¬≤']
interaction_r2 = df_comp_rc_int.iloc[0]['Test R¬≤']
improvement = interaction_r2 - baseline_r2

print("\nüîë INTERACTION EFFECT:")
print(f"   R¬≤ improvement from interactions: {improvement:+.3f}")
if improvement > 0.01:
    verdict = "   ‚Üí Interactions ADD predictive value"
else:
    verdict = "   ‚Üí Interactions provide MINIMAL improvement"
print(verdict)

print(f"\nüìÅ Outputs saved to: {output_dir}")
print("="*70)

ML ANALYSIS: PREDICTING ECONOMIC COMPLEXITY
Panel Data with Temporal Split (Train: ‚â§2013, Test: 2014+)
Including High Resource Country Interactions

1. Loading data...
   Master data: 3150 rows, 126 countries
   Years: 1995 - 2019

2. Creating high resource country dummy and interactions...
   High resource countries: 54
   Other countries: 72
   Merged data: 3150 rows

3. Defining feature sets...

Checking feature availability...
   ‚úì All features found

   Resource Curse (baseline): 8 features
   Resource Curse (with interactions): 10 features (incl. HCI√óResources)
   Full Structural: 17 features

4. Preparing train/test split...

   RESOURCE CURSE (BASELINE):
   Train: 2394 obs, Test: 756 obs

   RESOURCE CURSE (WITH INTERACTIONS):
   Train: 2394 obs, Test: 756 obs
   High Resource in test: 54 countries

   FULL STRUCTURAL MODEL:
   Train: 2394 obs, Test: 756 obs

5. Training models...

   Training Resource Curse (baseline)...
   Training Resource Curse (with interactions)...
 