In [8]:
"""
COMPLETE NATURAL RESOURCE ANALYSIS - FINAL VERSION
Using correct NaturalResource.csv with Production_TotalValue
Output: /Users/leoss/Desktop/Portfolio/Website-/capstone_visualizations/individual_plots
"""

import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import os

# ============================================================================
# CONFIGURATION
# ============================================================================

# Input CSVs and the directory that receives every generated artifact.
input_file = "/Users/leoss/Desktop/GitHub/Capstone/MASTER/Master.csv"
production_file = "/Users/leoss/Desktop/GitHub/Capstone/MASTER/NaturalResource.csv"
output_dir = "/Users/leoss/Desktop/Portfolio/Website-/capstone_visualizations/individual_plots"

# Create the output directory up front; an already existing directory is fine.
os.makedirs(output_dir, exist_ok=True)

# Banner: rule, title, rule.
print("=" * 70, "NATURAL RESOURCE ANALYSIS - COMPLETE", "=" * 70, sep="\n")

# ============================================================================
# 1. LOAD AND PREPARE PRODUCTION DATA
# ============================================================================

print("\n1. Loading production data...")

# Read the per-resource production records first, then the master table.
df_prod, df_master = (pd.read_csv(csv_path) for csv_path in (production_file, input_file))

print(f"   Production data: {df_prod.shape[0]} rows")
print("   Using Production_TotalValue (already in USD)")
# Categorize resources into 4 groups
def categorize_resource(resource):
    """Map a raw resource name onto one of the four reporting groups.

    'Oil', 'Natural Gas' and 'Coal' keep their own name; any other value is
    reported as 'Metals'.
    """
    if resource in ('Oil', 'Natural Gas', 'Coal'):
        return resource
    return 'Metals'  # all other minerals

# Tag every production row with its reporting group.
df_prod['Resource_Category'] = df_prod['Resource'].map(categorize_resource)

# Long format: summed Production_TotalValue per country, year and group.
prod_agg = df_prod.groupby(
    ['Country Name', 'Year', 'Resource_Category'], as_index=False
)['Production_TotalValue'].sum()

# Wide format: one row per country-year, one column per group; a group with
# no rows for that country-year is filled with 0.
prod_wide = pd.pivot_table(
    prod_agg,
    values='Production_TotalValue',
    index=['Country Name', 'Year'],
    columns='Resource_Category',
    fill_value=0,
).reset_index()

# Every column except the two identifiers is a resource group; sum them row-wise.
resource_cols = [
    name for name in prod_wide.columns
    if name != 'Country Name' and name != 'Year'
]
prod_wide['Total'] = prod_wide[resource_cols].sum(axis=1)

print(f"   Aggregated: {prod_wide.shape[0]} country-years")

# Merge with master data, keeping only the master columns used downstream.
master_data = df_master[[
    'Country Name', 'Year', 'Country Code', 'Population',
    'GDP per capita (constant prices, PPP)',
    'Economic Complexity Index', 'Human capital index', 'Manufacturing'
]].copy()

# Inner join: country-years present in only one of the two tables are dropped.
map_data = prod_wide.merge(master_data, on=['Country Name', 'Year'], how='inner')

print(f"   Merged: {len(map_data)} rows, {map_data['Country Code'].nunique()} countries")

# Total GDP = GDP per capita x population.
map_data['GDP_total'] = map_data['GDP per capita (constant prices, PPP)'] * map_data['Population']

# A zero denominator would turn a ratio into +/-inf, which the later
# fillna(0) / dropna() calls do not catch. Mask zeros so those ratios become
# NaN and are handled like any other missing value.
population = map_data['Population'].replace(0, np.nan)
gdp_total = map_data['GDP_total'].replace(0, np.nan)

# Per-capita value and share of GDP (in percent) for each group that exists.
for res in ['Total', 'Oil', 'Natural Gas', 'Coal', 'Metals']:
    if res in map_data.columns:
        map_data[f'{res}_Per_Capita'] = map_data[res] / population
        map_data[f'{res}_GDP_Norm'] = (map_data[res] / gdp_total) * 100

# ============================================================================
# 2. CREATE PRODUCTION MAP WITH SYNCED DROPDOWNS
# ============================================================================

print("\n2. Creating production map...")

# One trace per (resource, normalization) pair: 5 resources x 3 normalizations.
# Order is resource-major, normalization-minor. The dropdown script embedded
# in the exported page picks a trace by index `resource * 3 + normalization`,
# so this order must not change.
MAP_RESOURCES = ['Total', 'Oil', 'Natural Gas', 'Coal', 'Metals']
MAP_NORMALIZATIONS = [
    ('', 'absolute', 'USD'),
    ('_Per_Capita', 'per_capita', 'USD/person'),
    ('_GDP_Norm', 'gdp_norm', '% GDP'),
]


def _format_hover(values, norm_type):
    """Return the hover label for each value under the given normalization.

    'absolute' values are abbreviated to billions/millions of dollars,
    'per_capita' values are whole dollars, anything else is a percentage.
    """
    if norm_type == 'absolute':
        return [
            f"${v/1e9:.2f}B" if v >= 1e9 else f"${v/1e6:.1f}M" if v >= 1e6 else f"${v:,.0f}"
            for v in values
        ]
    if norm_type == 'per_capita':
        return [f"${v:,.0f}" for v in values]
    return [f"{v:.2f}%" for v in values]


def _map_layers(frame):
    """Yield (resource, units, z, hover) for every trace, in trace order.

    Missing values in the plotted column are shown as 0.
    """
    for resource in MAP_RESOURCES:
        for suffix, norm_type, units in MAP_NORMALIZATIONS:
            z = frame[f'{resource}{suffix}'].fillna(0)
            yield resource, units, z, _format_hover(z, norm_type)


# Initial view: the 2019 snapshot. All traces start hidden.
initial_data = map_data[map_data['Year'] == 2019]
traces = [
    go.Choropleth(
        locations=initial_data['Country Code'],
        z=z,
        text=initial_data['Country Name'],
        customdata=hover,
        colorscale='YlOrRd',
        marker=dict(line=dict(color='#999999', width=0.5)),
        colorbar=dict(title=units, len=0.7),
        hovertemplate=f'<b>%{{text}}</b><br>{resource}: %{{customdata}}<extra></extra>',
        visible=False
    )
    for resource, units, z, hover in _map_layers(initial_data)
]

traces[0].visible = True  # Start with Total + Absolute

# One slider step per year. Each step restyles every trace at once, so the
# per-trace lists below must follow the same order as `traces`.
slider_steps = []
for year in sorted(map_data['Year'].unique()):
    year_data = map_data[map_data['Year'] == year]
    layers = list(_map_layers(year_data))

    slider_steps.append({
        'method': 'restyle',
        'args': [{
            'z': [z.tolist() for _, _, z, _ in layers],
            'customdata': [hover for _, _, _, hover in layers],
            # Every trace shares the same country list for a given year.
            'locations': [year_data['Country Code'].tolist()] * len(layers),
            'text': [year_data['Country Name'].tolist()] * len(layers)
        }],
        'label': str(year)
    })

# Assemble the figure from the prepared choropleth traces.
fig = go.Figure(data=traces)

fig.update_layout(
    height=700,
    margin=dict(r=50, t=120, l=50, b=120),
    title=dict(
        text="Natural Resource Production",
        x=0.5,
        font=dict(size=22, color='#002A54'),
    ),
    geo={
        'showframe': False,
        'showcoastlines': True,
        'coastlinecolor': '#aaaaaa',
        'projection_type': 'natural earth',
        'bgcolor': '#e3f2fd',
        'showland': True,
        'landcolor': '#fafafa',
        'showcountries': True,
        'countrycolor': '#999999',
        'countrywidth': 0.5,
    },
    # Year slider: starts on the last step; steps switch without a transition.
    sliders=[dict(
        active=len(slider_steps) - 1,
        yanchor='top',
        xanchor='left',
        currentvalue=dict(
            prefix='Year: ',
            visible=True,
            xanchor='center',
            font=dict(size=18, color='#002A54'),
        ),
        pad=dict(b=10, t=50),
        len=0.9,
        x=0.05,
        y=0,
        steps=slider_steps,
        transition=dict(duration=0),
    )],
)

# Render the figure to a standalone HTML string (plotly.js referenced via the
# 'cdn' option, mode bar hidden) so custom controls can be injected into it.
fig_html = fig.to_html(include_plotlyjs='cdn', config={'displayModeBar': False})

# Extra page markup: two fixed-position dropdowns (resource, normalization)
# plus a script that shows exactly one of the 15 traces. The visible trace
# index is `currentResource * 3 + currentNorm`, which matches the order in
# which the traces are built (resource-major, normalization-minor).
# The change listeners are attached from a 100 ms setTimeout.
controls_html = """
<div style="position: fixed; top: 20px; left: 20px; z-index: 1000; background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); border: 2px solid #002A54;">
    <label style="font-weight: 600; color: #002A54; margin-right: 10px;">Resource:</label>
    <select id="resourceSelect" style="padding: 8px; border: 2px solid #002A54; border-radius: 4px; font-size: 14px;">
        <option value="0">Total</option>
        <option value="1">Oil</option>
        <option value="2">Natural Gas</option>
        <option value="3">Coal</option>
        <option value="4">Metals</option>
    </select>
</div>
<div style="position: fixed; top: 20px; right: 20px; z-index: 1000; background: white; padding: 15px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); border: 2px solid #E30613;">
    <label style="font-weight: 600; color: #E30613; margin-right: 10px;">View:</label>
    <select id="normSelect" style="padding: 8px; border: 2px solid #E30613; border-radius: 4px; font-size: 14px;">
        <option value="0">Absolute</option>
        <option value="1">Per Capita</option>
        <option value="2">% of GDP</option>
    </select>
</div>
<script>
let currentResource = 0;
let currentNorm = 0;

function updateMap() {
    const vis = Array(15).fill(false);
    vis[currentResource * 3 + currentNorm] = true;
    const plotDiv = document.getElementsByClassName('plotly-graph-div')[0];
    if (plotDiv) {
        Plotly.restyle(plotDiv, {visible: vis});
    }
}

setTimeout(function() {
    document.getElementById('resourceSelect').addEventListener('change', function() {
        currentResource = parseInt(this.value);
        updateMap();
    });
    
    document.getElementById('normSelect').addEventListener('change', function() {
        currentNorm = parseInt(this.value);
        updateMap();
    });
}, 100);
</script>
"""

# Inject the dropdown controls directly after the opening <body> tag.
fig_html = fig_html.replace('<body>', '<body>' + controls_html)

# The page can contain non-ASCII text (country names come straight from the
# data). open() without an encoding falls back to the platform default, which
# is not UTF-8 everywhere, so state the encoding explicitly.
with open(os.path.join(output_dir, 'map_production.html'), 'w', encoding='utf-8') as f:
    f.write(fig_html)

print("   âœ“ Production map saved")

# ============================================================================
# 3. CLUSTERING ANALYSIS
# ============================================================================

print("\n3. Clustering analysis...")

# Clustering uses a single-year snapshot.
cluster_year = 2019
cluster_data = map_data.loc[map_data['Year'] == cluster_year].copy()

# Resource shares of GDP plus the two development indices.
feature_cols = [
    'Metals_GDP_Norm', 'Oil_GDP_Norm', 'Natural Gas_GDP_Norm', 'Coal_GDP_Norm',
    'Economic Complexity Index', 'Human capital index'
]

# Only countries with every feature present take part.
cluster_subset = cluster_data[['Country Code', 'Country Name', *feature_cols]].dropna()

# Standardize, then project onto the first two principal components.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(cluster_subset[feature_cols])

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# K-means runs on the 2-D PCA scores, not on the full standardized features.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
cluster_subset = cluster_subset.assign(
    Cluster=kmeans.fit_predict(X_pca),
    PC1=X_pca[:, 0],
    PC2=X_pca[:, 1],
)

cluster_sizes = cluster_subset['Cluster'].value_counts().sort_index().to_dict()
print(f"   Clusters: {cluster_sizes}")

cluster_subset.to_csv(os.path.join(output_dir, 'cluster_assignments.csv'), index=False)

# ============================================================================
# 4. CLUSTER MAP
# ============================================================================

# Discrete qualitative colorscale: each cluster id 0..4 owns one fifth of the
# [0, 1] range, with the same color at both ends of its band so there is no
# gradient between clusters.
cluster_colorscale = [
    [bound, color]
    for idx, color in enumerate(['#1b9e77', '#d95f02', '#7570b3', '#e7298a', '#66a61e'])
    for bound in (idx / 5, (idx + 1) / 5)
]

fig_cluster = go.Figure(go.Choropleth(
    locations=cluster_subset['Country Code'],
    z=cluster_subset['Cluster'],
    text=cluster_subset['Country Name'],
    colorscale=cluster_colorscale,
    zmin=0,
    zmax=4,
    showscale=False,  # no colorbar: cluster ids are labels, not magnitudes
    marker={'line': {'color': '#999999', 'width': 0.4}},
    hovertemplate='<b>%{text}</b><br>Cluster: %{z}<extra></extra>'
))

fig_cluster.update_layout(
    height=900,
    margin={'l': 20, 'r': 20, 't': 20, 'b': 20},  # tight margins around the map
    geo={
        'showframe': False,
        'showcoastlines': True,
        'coastlinecolor': '#b0b0b0',
        'projection_type': 'natural earth',
        'bgcolor': '#f5f7fa',
        'landcolor': '#ffffff',
        'countrycolor': '#9e9e9e',
        'countrywidth': 0.5,
    },
)

fig_cluster.write_html(os.path.join(output_dir, 'map_clusters.html'))
print("   âœ“ Cluster map saved (discrete, no fake scale)")
# ============================================================================
# 5. PCA LOADINGS
# ============================================================================

print("\n5. Creating PCA loadings plots...")

# Rows = features, columns = the two fitted components.
loadings = pd.DataFrame(pca.components_.T, columns=['PC1', 'PC2'], index=feature_cols)


def _loadings_chart(component, variance_share):
    """Return (sorted loadings, horizontal bar figure) for one component.

    Bars are green for positive loadings and red otherwise.
    """
    ordered = loadings[component].sort_values()
    chart = go.Figure()
    chart.add_trace(go.Bar(
        y=ordered.index, x=ordered.values, orientation='h',
        marker_color=['#2ecc71' if x > 0 else '#e74c3c' for x in ordered],
        text=[f"{x:.3f}" for x in ordered.values], textposition='outside'
    ))
    chart.update_layout(
        title=f"{component} Loadings ({variance_share:.1%} variance)",
        xaxis_title="Loading", height=500, showlegend=False,
        xaxis=dict(zeroline=True, zerolinewidth=2, zerolinecolor='black'),
        template='plotly_white'
    )
    return ordered, chart


# PC1
loadings_pc1, fig_pc1 = _loadings_chart('PC1', pca.explained_variance_ratio_[0])
fig_pc1.write_html(os.path.join(output_dir, 'pca_loadings_pc1.html'))

# PC2
loadings_pc2, fig_pc2 = _loadings_chart('PC2', pca.explained_variance_ratio_[1])
fig_pc2.write_html(os.path.join(output_dir, 'pca_loadings_pc2.html'))

loadings.to_csv(os.path.join(output_dir, 'pca_loadings.csv'))
print("   âœ“ PCA loadings saved")

# ============================================================================
# 6. SCATTER PLOT
# ============================================================================

print("\n6. Creating PCA scatter plot...")

# Cluster ids are labels, not magnitudes. Plotting the integer column gives a
# continuous colorbar that implies an ordering between clusters. Cast the ids
# to strings so Plotly Express treats them as categories, and use the same
# five colors as the cluster map so both figures agree.
cluster_palette = {
    '0': '#1b9e77',
    '1': '#d95f02',
    '2': '#7570b3',
    '3': '#e7298a',
    '4': '#66a61e',
}
# Work on a derived frame so cluster_subset keeps its integer Cluster column.
scatter_data = cluster_subset.assign(Cluster=cluster_subset['Cluster'].astype(str))

fig_scatter = px.scatter(
    scatter_data,
    x='PC1', y='PC2',
    color='Cluster',
    hover_data=['Country Name'],
    title='Countries by Resource Production Patterns (PCA)',
    color_discrete_map=cluster_palette,
    category_orders={'Cluster': sorted(cluster_palette)},  # legend in id order
    labels={'PC1': 'PC1 (Economic Development)', 'PC2': 'PC2 (Hydrocarbon Production)'}
)
fig_scatter.update_traces(marker=dict(size=10, line=dict(width=1, color='white')))
fig_scatter.update_layout(height=600, template='plotly_white')
fig_scatter.write_html(os.path.join(output_dir, 'scatter_pca.html'))
print("   âœ“ Scatter plot saved")

# ============================================================================
# 7. SAVE SUMMARY DATA
# ============================================================================

print("\n7. Saving data files...")

# Top countries analysis: up to ten leading countries for every
# (resource, normalization) column of the 2019 snapshot, stacked into one table.
top_countries = []
for resource in ['Total', 'Oil', 'Natural Gas', 'Coal', 'Metals']:
    for suffix, label in [('', 'Absolute'), ('_Per_Capita', 'Per Capita'), ('_GDP_Norm', 'GDP Norm')]:
        col = f'{resource}{suffix}'
        if col not in initial_data.columns:
            continue
        top10 = initial_data.nlargest(10, col)[['Country Name', col]].copy()
        top10['Resource'] = resource
        top10['Normalization'] = label
        # Size the ranks from the rows actually returned: assigning a fixed
        # range(1, 11) raises a length mismatch when fewer than ten rows exist.
        top10['Rank'] = range(1, len(top10) + 1)
        top10 = top10.rename(columns={col: 'Value'})
        top_countries.append(top10)

pd.concat(top_countries).to_csv(os.path.join(output_dir, 'top_countries_2019.csv'), index=False)
print("   âœ“ Top countries saved")

# ============================================================================
# SUMMARY
# ============================================================================

# Final report: list the generated files and a few headline numbers.
print("\n" + "="*70)
print("âœ… COMPLETE!")
print("="*70)
print(f"\nOutput directory: {output_dir}")
print("\nFiles created:")
print("  1. map_production.html - Interactive map (Total/Oil/Gas/Coal/Metals)")
print("  2. map_clusters.html - Cluster choropleth")
print("  3. pca_loadings_pc1.html - PC1 loadings")
print("  4. pca_loadings_pc2.html - PC2 loadings")
print("  5. scatter_pca.html - PCA scatter")
print("  6. cluster_assignments.csv - Cluster data")
print("  7. top_countries_2019.csv - Rankings")
print("\nData summary:")
print(f"  â€¢ {map_data['Country Code'].nunique()} countries")
# Year span and cluster count are read from the data instead of being
# hard-coded, so the report cannot drift from what was actually analysed.
print(f"  â€¢ {map_data['Year'].nunique()} years "
      f"({map_data['Year'].min():.0f}-{map_data['Year'].max():.0f})")
print(f"  â€¢ {cluster_subset['Cluster'].nunique()} clusters identified")
print(f"  â€¢ {pca.explained_variance_ratio_.sum():.1%} variance explained by PCA")
print("="*70)

NATURAL RESOURCE ANALYSIS - COMPLETE

1. Loading production data...
   Production data: 17166 rows
   Using Production_TotalValue (already in USD)
   Aggregated: 3346 country-years
   Merged: 3069 rows, 126 countries

2. Creating production map...
   ✓ Production map saved

3. Clustering analysis...
   Clusters: {0: 38, 1: 23, 2: 41, 3: 8, 4: 15}
   ✓ Cluster map saved (discrete, no fake scale)

5. Creating PCA loadings plots...
   ✓ PCA loadings saved

6. Creating PCA scatter plot...
   ✓ Scatter plot saved

7. Saving data files...
   ✓ Top countries saved

✅ COMPLETE!

Output directory: /Users/leoss/Desktop/Portfolio/Website-/capstone_visualizations/individual_plots

Files created:
  1. map_production.html - Interactive map (Total/Oil/Gas/Coal/Metals)
  2. map_clusters.html - Cluster choropleth
  3. pca_loadings_pc1.html - PC1 loadings
  4. pca_loadings_pc2.html - PC2 loadings
  5. scatter_pca.html - PCA scatter
  6. cluster_assignments.csv - Cluster data
  7. top_countri

In [9]:
"""
MACHINE LEARNING ENSEMBLE METHODS
Predict Economic Complexity Index (ECI)
Add this section after clustering analysis in your existing code
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
import lightgbm as lgb
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Banner: blank line, rule, title, rule.
print("\n" + "=" * 70, "MACHINE LEARNING: PREDICTING ECONOMIC COMPLEXITY", "=" * 70, sep="\n")

# ============================================================================
# 1. PREPARE ML DATASET
# ============================================================================

print("\n1. Preparing ML dataset...")

# Predictors: resource production as a share of GDP plus the human capital
# index. ECI is left out because it is the quantity being predicted.
ml_features = [
    'Oil_GDP_Norm',
    'Natural Gas_GDP_Norm',
    'Coal_GDP_Norm',
    'Metals_GDP_Norm',
    'Human capital index'
]

ml_target = 'Economic Complexity Index'

# cluster_data is the single-year frame built for clustering; keep only rows
# where the target and every predictor are present.
ml_data = cluster_data[[ml_target, *ml_features]].dropna()

print(f"   Dataset: {ml_data.shape[0]} observations")
print(f"   Features: {len(ml_features)}")
print(f"   Target: {ml_target}")
print(f"   Target range: [{ml_data[ml_target].min():.2f}, {ml_data[ml_target].max():.2f}]")

# Feature matrix and target vector.
X, y = ml_data[ml_features], ml_data[ml_target]

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"   Train: {len(X_train)} | Test: {len(X_test)}")

# Fit the scaler on the training rows only, then apply it to both splits.
scaler_ml = StandardScaler().fit(X_train)
X_train_scaled = scaler_ml.transform(X_train)
X_test_scaled = scaler_ml.transform(X_test)

# ============================================================================
# 2. TRAIN MODELS
# ============================================================================

print("\n2. Training models...")

results = {}


def _score_model(model, X_tr, X_te, with_importance=False):
    """Score an already-fitted model on the train and test splits.

    Targets are read from the module-level ``y_train`` / ``y_test``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_tr: Training features, in the representation the model was fitted on.
        X_te: Test features, in the same representation.
        with_importance: When true, also store ``model.feature_importances_``
            keyed by the names in ``ml_features``.

    Returns:
        ``(train_predictions, test_predictions, record)``; ``record`` is the
        dict stored under the model's name in ``results``.
    """
    pred_train = model.predict(X_tr)
    pred_test = model.predict(X_te)
    record = {
        'model': model,
        'train_r2': r2_score(y_train, pred_train),
        'test_r2': r2_score(y_test, pred_test),
        'train_rmse': np.sqrt(mean_squared_error(y_train, pred_train)),
        'test_rmse': np.sqrt(mean_squared_error(y_test, pred_test)),
        'train_mae': mean_absolute_error(y_train, pred_train),
        'test_mae': mean_absolute_error(y_test, pred_test),
        'predictions': pred_test,
    }
    if with_importance:
        record['feature_importance'] = dict(zip(ml_features, model.feature_importances_))
    return pred_train, pred_test, record


def _print_scores(name):
    """Print the headline train/test scores stored in ``results[name]``."""
    print(f"      Train RÂ²: {results[name]['train_r2']:.4f} | Test RÂ²: {results[name]['test_r2']:.4f}")
    print(f"      Test RMSE: {results[name]['test_rmse']:.4f}")


# ------------------------------
# A. OLS LINEAR REGRESSION (Baseline)
# ------------------------------
print("\n   A. OLS Linear Regression (Baseline)")
ols = LinearRegression()
ols.fit(X_train_scaled, y_train)

y_pred_ols_train, y_pred_ols_test, results['OLS'] = _score_model(
    ols, X_train_scaled, X_test_scaled
)
_print_scores('OLS')

# ------------------------------
# B. RIDGE REGRESSION (L2 regularization)
# ------------------------------
print("\n   B. Ridge Regression")
ridge = Ridge(alpha=1.0, random_state=42)
ridge.fit(X_train_scaled, y_train)

y_pred_ridge_train, y_pred_ridge_test, results['Ridge'] = _score_model(
    ridge, X_train_scaled, X_test_scaled
)
_print_scores('Ridge')

# ------------------------------
# C. RANDOM FOREST
# ------------------------------
print("\n   C. Random Forest")
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)  # No scaling needed for RF

y_pred_rf_train, y_pred_rf_test, results['Random Forest'] = _score_model(
    rf, X_train, X_test, with_importance=True
)
_print_scores('Random Forest')

# ------------------------------
# D. GRADIENT BOOSTING (Sklearn)
# ------------------------------
print("\n   D. Gradient Boosting")
gb = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42
)
gb.fit(X_train, y_train)

y_pred_gb_train, y_pred_gb_test, results['Gradient Boosting'] = _score_model(
    gb, X_train, X_test, with_importance=True
)
_print_scores('Gradient Boosting')

# ------------------------------
# E. XGBOOST
# ------------------------------
print("\n   E. XGBoost")
xgb_model = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    objective='reg:squarederror'
)
xgb_model.fit(X_train, y_train, verbose=False)

y_pred_xgb_train, y_pred_xgb_test, results['XGBoost'] = _score_model(
    xgb_model, X_train, X_test, with_importance=True
)
_print_scores('XGBoost')

# ------------------------------
# F. LIGHTGBM
# ------------------------------
print("\n   F. LightGBM")
# NOTE(review): LightGBM documents that row subsampling only takes effect when
# subsample_freq is non-zero; with the default, subsample=0.8 is presumably a
# no-op here. Confirm before relying on it. Left unchanged to keep results
# reproducible.
lgb_model = lgb.LGBMRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    num_leaves=31,
    min_child_samples=10,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1
)
lgb_model.fit(X_train, y_train)

y_pred_lgb_train, y_pred_lgb_test, results['LightGBM'] = _score_model(
    lgb_model, X_train, X_test, with_importance=True
)
_print_scores('LightGBM')

# ------------------------------
# G. NEURAL NETWORK
# ------------------------------
print("\n   G. Neural Network (MLP)")
nn = MLPRegressor(
    hidden_layer_sizes=(100, 50, 25),
    activation='relu',
    solver='adam',
    alpha=0.001,
    learning_rate='adaptive',
    learning_rate_init=0.001,
    max_iter=1000,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.2
)
nn.fit(X_train_scaled, y_train)

y_pred_nn_train, y_pred_nn_test, results['Neural Network'] = _score_model(
    nn, X_train_scaled, X_test_scaled
)
_print_scores('Neural Network')

# ============================================================================
# 3. CROSS-VALIDATION
# ============================================================================

print("\n3. Cross-validation (5-fold)...")

# Local import: only this section needs it.
from sklearn.pipeline import make_pipeline

cv_results = {}
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for name in ['OLS', 'Ridge', 'Random Forest', 'Gradient Boosting', 'XGBoost', 'LightGBM', 'Neural Network']:
    model = results[name]['model']

    if name in ['OLS', 'Ridge', 'Neural Network']:
        # These models need standardized inputs. Passing the pre-scaled
        # training matrix would leak each validation fold's statistics into
        # the scaler, so the scaler is re-fitted inside every fold instead.
        estimator = make_pipeline(StandardScaler(), model)
    else:
        # Tree ensembles are trained on the raw features.
        estimator = model

    # cross_val_score fits clones, so the already-fitted models in `results`
    # are left untouched.
    cv_scores = cross_val_score(estimator, X_train, y_train, cv=kf, scoring='r2')
    cv_results[name] = {
        'mean_cv_r2': cv_scores.mean(),
        'std_cv_r2': cv_scores.std()
    }

    print(f"   {name}: {cv_scores.mean():.4f} Â± {cv_scores.std():.4f}")

# ============================================================================
# 4. MODEL COMPARISON TABLE
# ============================================================================

print("\n4. Creating model comparison table...")

# One row per model: hold-out scores, mean CV score and the train/test gap.
comparison_data = [
    {
        'Model': name,
        'Train RÂ²': res['train_r2'],
        'Test RÂ²': res['test_r2'],
        'CV RÂ²': cv_results[name]['mean_cv_r2'],
        'Test RMSE': res['test_rmse'],
        'Test MAE': res['test_mae'],
        'Overfit': res['train_r2'] - res['test_r2'],
    }
    for name, res in results.items()
]

# Best test score first.
comparison_df = pd.DataFrame(comparison_data).sort_values('Test RÂ²', ascending=False)

print(
    "\n" + "=" * 90,
    "MODEL COMPARISON",
    "=" * 90,
    comparison_df.to_string(index=False),
    "=" * 90,
    sep="\n",
)

# ============================================================================
# 5. VISUALIZATIONS
# ============================================================================

print("\n5. Creating visualizations...")

# ------------------------------
# A. MODEL PERFORMANCE COMPARISON
# ------------------------------

fig_comparison = go.Figure()

models = comparison_df['Model'].tolist()
colors = ['#002A54', '#E30613', '#2ecc71', '#3498db', '#9b59b6', '#f39c12', '#e74c3c']

test_r2 = comparison_df['Test RÂ²']

# R² can be negative on a small hold-out set. A range fixed at
# [0, max * 1.15] clips negative bars and inverts when the best score is
# <= 0, so both ends are padded while always keeping 0 in view.
r2_axis_range = [min(test_r2.min(), 0) * 1.15, max(test_r2.max(), 0) * 1.15]

# Test R², one bar per model, already sorted best-first.
fig_comparison.add_trace(go.Bar(
    name='Test RÂ²',
    x=models,
    y=test_r2,
    marker_color=colors,
    text=[f"{x:.3f}" for x in test_r2],
    textposition='outside'
))

fig_comparison.update_layout(
    title={'text': "Model Performance Comparison (Test RÂ²)", 'x': 0.5, 'font': {'size': 20}},
    xaxis_title="Model",
    yaxis_title="RÂ² Score",
    template='plotly_white',
    height=500,
    yaxis=dict(range=r2_axis_range),
    showlegend=False
)

fig_comparison.write_html(os.path.join(output_dir, 'ml_model_comparison.html'))
print("   âœ“ Model comparison chart saved")

# ------------------------------
# B. PREDICTED VS ACTUAL (Best Model)
# ------------------------------

# comparison_df is sorted by test score, so its first row is the best model.
best_model_name = comparison_df['Model'].iloc[0]
best_predictions = results[best_model_name]['predictions']

# Diagonal end points spanning both the actual and the predicted values.
min_val = min(y_test.min(), best_predictions.min())
max_val = max(y_test.max(), best_predictions.max())

fig_pred_actual = go.Figure(data=[
    # Perfect prediction line (y = x)
    go.Scatter(
        x=[min_val, max_val],
        y=[min_val, max_val],
        mode='lines',
        name='Perfect Prediction',
        line={'color': 'red', 'dash': 'dash', 'width': 2},
    ),
    # Hold-out predictions against their true values
    go.Scatter(
        x=y_test,
        y=best_predictions,
        mode='markers',
        name='Predictions',
        marker={'size': 8, 'color': '#002A54', 'opacity': 0.6},
        text=[
            f"Actual: {actual:.2f}<br>Predicted: {predicted:.2f}"
            for actual, predicted in zip(y_test, best_predictions)
        ],
        hovertemplate='%{text}<extra></extra>',
    ),
])

fig_pred_actual.update_layout(
    title={
        'text': f"Predicted vs Actual ECI - {best_model_name}<br>(Test RÂ² = {results[best_model_name]['test_r2']:.3f})",
        'x': 0.5,
        'font': {'size': 18},
    },
    xaxis_title="Actual ECI",
    yaxis_title="Predicted ECI",
    template='plotly_white',
    height=600,
    showlegend=True,
)

fig_pred_actual.write_html(os.path.join(output_dir, 'ml_predicted_vs_actual.html'))
print("   âœ“ Predicted vs Actual chart saved")

# ------------------------------
# C. FEATURE IMPORTANCE (Tree-based models)
# ------------------------------

# Pick the tree-based model with the highest test R². Using max() rather than
# a running "best so far" seeded with 0 means a model is still selected (and
# the chart still produced) when every tree model scores at or below zero.
# Ties resolve to the first model in the list.
tree_models = ['Random Forest', 'Gradient Boosting', 'XGBoost', 'LightGBM']
best_tree_model = max(tree_models, key=lambda model_name: results[model_name]['test_r2'])
best_tree_r2 = results[best_tree_model]['test_r2']

# NOTE(review): importances are shown as each library reports them; the
# libraries may use different scales, so compare features within this chart
# only. Confirm before comparing across models.
importances = results[best_tree_model]['feature_importance']
importance_df = pd.DataFrame({
    'Feature': list(importances.keys()),
    'Importance': list(importances.values())
}).sort_values('Importance', ascending=True)

fig_importance = go.Figure()
fig_importance.add_trace(go.Bar(
    x=importance_df['Importance'],
    y=importance_df['Feature'],
    orientation='h',
    marker_color='#E30613',
    text=[f"{x:.3f}" for x in importance_df['Importance']],
    textposition='outside'
))

fig_importance.update_layout(
    title={'text': f"Feature Importance - {best_tree_model}", 'x': 0.5, 'font': {'size': 20}},
    xaxis_title="Importance",
    yaxis_title="Feature",
    template='plotly_white',
    height=500,
    showlegend=False
)

fig_importance.write_html(os.path.join(output_dir, 'ml_feature_importance.html'))
print("   âœ“ Feature importance chart saved")

# ------------------------------
# D. RESIDUAL PLOT (Best Model)
# ------------------------------

# Residual = actual minus predicted, on the hold-out set.
residuals = y_test - best_predictions

residual_points = go.Scatter(
    x=best_predictions,
    y=residuals,
    mode='markers',
    marker={'size': 8, 'color': '#002A54', 'opacity': 0.6},
    text=[
        f"Predicted: {predicted:.2f}<br>Residual: {gap:.2f}"
        for predicted, gap in zip(best_predictions, residuals)
    ],
    hovertemplate='%{text}<extra></extra>',
)

fig_residuals = go.Figure()
fig_residuals.add_trace(residual_points)

# Zero-error reference line.
fig_residuals.add_hline(y=0, line_dash="dash", line_color="red", line_width=2)

fig_residuals.update_layout(
    title=dict(text=f"Residual Plot - {best_model_name}", x=0.5, font=dict(size=20)),
    xaxis_title="Predicted ECI",
    yaxis_title="Residuals",
    template='plotly_white',
    height=500,
)

fig_residuals.write_html(os.path.join(output_dir, 'ml_residuals.html'))
print("   âœ“ Residual plot saved")

# ============================================================================
# 6. SAVE RESULTS
# ============================================================================

print("\n6. Saving results...")

# Model comparison table, written in its sorted order.
comparison_df.to_csv(os.path.join(output_dir, 'ml_model_comparison.csv'), index=False)
print("   âœ“ Model comparison table saved")

# Hold-out predictions of the best model alongside the true values.
predictions_df = pd.DataFrame({
    'Actual_ECI': y_test,
    f'{best_model_name}_Predicted': best_predictions,
    'Residual': residuals,
})
predictions_df.to_csv(os.path.join(output_dir, 'ml_predictions.csv'), index=False)
print("   âœ“ Predictions saved")

# Closing banner with the best model's headline metrics.
print("\n" + "=" * 70, "MACHINE LEARNING ANALYSIS COMPLETE!", "=" * 70, sep="\n")
best_scores = results[best_model_name]
print(f"\nBest Model: {best_model_name}")
print(f"Test RÂ²: {best_scores['test_r2']:.4f}")
print(f"Test RMSE: {best_scores['test_rmse']:.4f}")
print(f"Test MAE: {best_scores['test_mae']:.4f}")
print("\nOutputs saved to:", output_dir)


MACHINE LEARNING: PREDICTING ECONOMIC COMPLEXITY

1. Preparing ML dataset...
   Dataset: 125 observations
   Features: 5
   Target: Economic Complexity Index
   Target range: [-1.90, 2.55]
   Train: 100 | Test: 25

2. Training models...

   A. OLS Linear Regression (Baseline)
      Train R²: 0.5812 | Test R²: 0.6778
      Test RMSE: 0.5813

   B. Ridge Regression
      Train R²: 0.5812 | Test R²: 0.6764
      Test RMSE: 0.5825

   C. Random Forest
      Train R²: 0.7804 | Test R²: 0.7244
      Test RMSE: 0.5376

   D. Gradient Boosting
      Train R²: 0.9860 | Test R²: 0.5972
      Test RMSE: 0.6499

   E. XGBoost
      Train R²: 0.9778 | Test R²: 0.5789
      Test RMSE: 0.6645

   F. LightGBM
      Train R²: 0.9227 | Test R²: 0.5458
      Test RMSE: 0.6901

   G. Neural Network (MLP)
      Train R²: 0.6992 | Test R²: 0.6409
      Test RMSE: 0.6137

3. Cross-validation (5-fold)...
   OLS: 0.4813 ± 0.1021
   Ridge: 0.4831 ± 0.0995
   Random Forest: 0.4781 ± 0.1499
   G