# In [2]:
"""
COMPREHENSIVE NATURAL RESOURCE & CLUSTERING ANALYSIS
=====================================================
Part A: Production Maps, Clustering (Train Once, Predict Always), PCA, Temporal Tracking
Part B: Resource Diversity (Shannon Entropy) & Intensity (Production/GDP)
"""

import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import os

# ============================================================================
# CONFIGURATION
# ============================================================================

# Input CSVs and the output folder are absolute paths on the author's machine;
# edit these three values to run the script elsewhere.
input_file = "/Users/leoss/Desktop/GitHub/Capstone/MASTER/Master.csv"
production_file = "/Users/leoss/Desktop/GitHub/Capstone/MASTER/NaturalResource.csv"
output_dir = "/Users/leoss/Desktop/Portfolio/Website-/capstone_visualizations/individual_plots/cluster"

# Create the output folder (and any missing parents); a no-op if it exists.
os.makedirs(output_dir, exist_ok=True)

# Year the scaler, k-means and PCA models are fitted on; Part B also filters
# production and GDP data to this year.
REFERENCE_YEAR = 2019
# Number of k-means clusters. The hand-written cluster_names mapping and
# STYLE['cluster_colors'] each provide exactly 6 entries, so changing this
# value requires updating both.
N_CLUSTERS = 6
MARKET_SHARE_THRESHOLD = 0.5  # % — ignore resources where country has <0.5% global share

# ============================================================================
# UNIFIED STYLE (single source of truth for all charts)
# ============================================================================

# Every helper below (base_layout, styled_axis, styled_colorbar, geo_layout,
# the legend_* and slider_* functions) reads its fonts, sizes and colours from
# this dict, so a visual change made here propagates to all charts.
STYLE = {
    # Typography — matches site: IBM Plex Sans body, Syne headings
    # (Plotly can't load Google Fonts, so we set IBM Plex Sans with fallbacks;
    #  titles use the same since Syne won't render in Plotly anyway)
    'font_family': 'IBM Plex Sans, -apple-system, BlinkMacSystemFont, sans-serif',
    'title_size': 18,
    'subtitle_size': 13,
    'axis_title_size': 13,
    'tick_size': 11,
    'legend_size': 12,
    'annotation_size': 11,
    'title_color': '#1a2744',       # --navy

    # Layout
    'template': 'plotly_white',
    'bg_color': 'rgba(0,0,0,0)',    # transparent so site background shows through
    'plot_bg': 'rgba(0,0,0,0)',
    'chart_height': 650,
    'chart_height_small': 450,
    'chart_height_tall': 700,
    'margin': dict(l=60, r=50, t=30, b=80),
    'margin_map': dict(l=10, r=10, t=10, b=80),
    'margin_bar': dict(l=160, r=60, t=30, b=50),

    # Geo (all maps) — derived from site neutrals
    'geo': dict(
        showframe=False,
        showcoastlines=True,
        coastlinecolor='#c9cfd6',    # --grey-300
        projection_type='natural earth',
        bgcolor='rgba(0,0,0,0)',
        showland=True,
        landcolor='#f0f2f5',         # --grey-100
        showcountries=True,
        countrycolor='#dde1e7',      # --border
        countrywidth=0.5,
    ),

    # Choropleth lines
    'choropleth_line_color': '#c9cfd6',  # --grey-300
    'choropleth_line_width': 0.5,

    # Colorbar
    'colorbar': dict(len=0.7, thickness=15),

    # Cluster palette — 6 colors anchored to the site's navy/red/green,
    # extended with muted tones that sit well on --grey-100 backgrounds.
    # Indexed by cluster id (position i colours cluster i); the trailing names
    # mirror the hand-assigned cluster_names mapping used by the cluster charts.
    'cluster_colors': [
        '#c23a3a',   # accent red   — Petrostates
        '#2e7d4a',   # success grn  — Low-Income Diversified
        '#4a6fa5',   # steel blue   — Advanced Economies
        '#d4853b',   # warm amber   — Coal & Metals Outlier
        '#3d4f5f',   # slate        — Mining-Dependent
        '#8b5c3c',   # warm brown   — Wealthy Hydrocarbon Exporters
    ],

    # Category palette (diversity decomposition) — same family.
    # Keys match the category names in CATEGORY_ORDER (Part B).
    'category_colors': {
        'Fossil fuels':        '#3d4f5f',   # slate
        'Base metals':         '#7a8b99',   # cool gray
        'Precious metals':     '#d4853b',   # amber
        'Battery/strategic':   '#2e7d4a',   # green
        'Industrial minerals': '#c23a3a',   # red
    },

    # Positive/negative (loadings bars) — site success/error
    'pos_color': '#2e7d4a',   # --success
    'neg_color': '#c23a3a',   # --accent (error-adjacent)

    # Scatter defaults
    'marker_size': 10,
    'marker_line': dict(width=0.5, color='white'),
}

# Plotly write config (shared across all .write_html calls);
# hides Plotly's mode bar (toolbar) in the exported pages.
WRITE_CONFIG = {'displayModeBar': False}


# --- Helper functions -------------------------------------------------------

def styled_title(main: str = None, sub: str = None) -> dict:
    """Return an empty, horizontally centred Plotly title spec.

    Both arguments are accepted so call sites can keep describing the chart,
    but they are ignored: titles are rendered by the HTML page, not by Plotly.
    """
    blank_title = {'text': '', 'x': 0.5}
    return blank_title


def base_layout(**overrides) -> dict:
    """Build the layout keyword arguments shared by every chart.

    Defaults (template, fonts, transparent backgrounds, height, margin and
    hover-label styling) come from STYLE. Any keyword passed in ``overrides``
    replaces the default with the same key or is added as a new key.
    """
    family = STYLE['font_family']
    hover_label = {
        'bgcolor': 'white',
        'bordercolor': '#dde1e7',       # --border
        'font': {
            'family': family,
            'size': 13,
            'color': '#1a2744',          # --navy
        },
    }
    defaults = {
        'template': STYLE['template'],
        'font': {'family': family, 'size': STYLE['tick_size'],
                 'color': '#4b5563'},  # --text-secondary
        'paper_bgcolor': STYLE['bg_color'],
        'plot_bgcolor': STYLE['plot_bg'],
        'height': STYLE['chart_height'],
        'margin': STYLE['margin'],
        'hoverlabel': hover_label,
    }
    # Overrides win on key clashes; new keys are appended after the defaults.
    return {**defaults, **overrides}


def styled_axis(title_text: str) -> dict:
    """Return an axis spec with the given title and the shared font styling."""
    family = STYLE['font_family']
    title_font = {'size': STYLE['axis_title_size'], 'family': family}
    tick_font = {'size': STYLE['tick_size'], 'family': family}
    return {
        'title': {'text': title_text, 'font': title_font},
        'tickfont': tick_font,
    }


def styled_colorbar(title_text: str) -> dict:
    """Return a colorbar spec: titled, tick-sized fonts, plus STYLE['colorbar']."""
    family = STYLE['font_family']
    size = STYLE['tick_size']
    title_spec = {'text': title_text, 'font': {'size': size, 'family': family}}
    tick_font = {'size': size, 'family': family}
    # Geometry (length/thickness) is merged in from the shared colorbar settings.
    return dict(title=title_spec, tickfont=tick_font, **STYLE['colorbar'])


def geo_layout() -> dict:
    """Return the ``geo`` layout keyword shared by all map figures."""
    return {'geo': STYLE['geo']}


def legend_horizontal(y_offset: float = -0.15) -> dict:
    """Return a ``legend`` layout keyword for a centred horizontal legend.

    ``y_offset`` is the legend's ``y`` position (anchored at its top), so the
    default negative value places it below the plotting area.
    """
    family = STYLE['font_family']
    size = STYLE['legend_size']
    legend = {
        'title_font': {'size': size, 'family': family},
        'font': {'size': size, 'family': family},
        'orientation': 'h',
        'yanchor': 'top',
        'y': y_offset,
        'xanchor': 'center',
        'x': 0.5,
        'bgcolor': 'rgba(255,255,255,0.85)',
    }
    return {'legend': legend}


def legend_map() -> dict:
    """Return a ``legend`` layout keyword for cluster maps.

    The legend is titled 'Cluster' and pinned to the top-right corner of the
    figure on a mostly opaque white background.
    """
    family = STYLE['font_family']
    size = STYLE['legend_size']
    legend = {
        'title': 'Cluster',
        'title_font': {'size': size, 'family': family},
        'font': {'size': size, 'family': family},
        'x': 0.99,
        'y': 0.99,
        'xanchor': 'right',
        'yanchor': 'top',
        'bgcolor': 'rgba(255,255,255,0.9)',
    }
    return {'legend': legend}


def save_html(fig, filename: str, config=None):
    """Write *fig* as an HTML file inside ``output_dir`` and log the filename.

    Args:
        fig: Figure object exposing ``write_html`` (a Plotly figure).
        filename: File name (not a full path) to create under ``output_dir``.
        config: Plotly config dict for the exported page. ``None`` (the
            default) selects the shared ``WRITE_CONFIG``; any dict that is
            passed explicitly — including an empty one — is used as given.
    """
    # Test for None rather than truthiness so that an explicit ``{}``
    # (meaning "Plotly defaults") is not silently replaced by WRITE_CONFIG.
    effective_config = WRITE_CONFIG if config is None else config
    fig.write_html(
        os.path.join(output_dir, filename),
        config=effective_config,
        include_plotlyjs='cdn',  # load plotly.js from the CDN instead of embedding it
    )
    print(f"   ✓ {filename}")


def slider_font() -> dict:
    """Return the font spec used for slider and button labels."""
    return {'size': STYLE['tick_size'], 'family': STYLE['font_family']}


def slider_current_value() -> dict:
    """Return the slider ``currentvalue`` spec: a centred, visible 'Year: ' label."""
    label_font = {
        'size': STYLE['subtitle_size'],
        'color': STYLE['title_color'],
        'family': STYLE['font_family'],
    }
    return {
        'prefix': 'Year: ',
        'visible': True,
        'xanchor': 'center',
        'font': label_font,
    }


# ============================================================================
#  PART A — PRODUCTION MAPS + CLUSTERING + PCA
# ============================================================================

print("=" * 70)
print("PART A: PRODUCTION MAPS + CLUSTERING + PCA")
print("=" * 70)

# ---------- A1. DATA PREPARATION -------------------------------------------

print("\nA1. Loading and preparing data...")

# df_prod: production records (columns used later: 'Country Name', 'Year',
# 'Resource', 'Production_TotalValue').
# df_master: country-year indicators (columns used later include
# 'Country Code', 'Population', 'GDP per capita (constant prices, PPP)',
# 'Economic Complexity Index' and 'Human capital index').
df_prod = pd.read_csv(production_file)
df_master = pd.read_csv(input_file)

print(f"   Production data: {len(df_prod)} rows")
print(f"   Master data: {len(df_master)} rows")


def categorize_resource(resource):
    """Collapse a raw resource name into one of four map categories.

    'Oil', 'Natural Gas' and 'Coal' map to themselves; every other value is
    grouped under 'Metals'.
    """
    for fuel in ('Oil', 'Natural Gas', 'Coal'):
        if resource == fuel:
            return fuel
    return 'Metals'


# Tag every production row with its map category (Oil / Natural Gas / Coal / Metals).
df_prod['Resource_Category'] = df_prod['Resource'].apply(categorize_resource)

# Total production value per country, year and category (long format).
prod_agg = df_prod.groupby(
    ['Country Name', 'Year', 'Resource_Category']
)['Production_TotalValue'].sum().reset_index()

# One row per country-year, one column per category; absent combinations are 0.
prod_wide = prod_agg.pivot_table(
    index=['Country Name', 'Year'],
    columns='Resource_Category',
    values='Production_TotalValue',
    fill_value=0,
).reset_index()

# The pivot only creates columns for categories that actually occur in the
# data, but the map and clustering sections index all four unconditionally.
# Add any missing category as an all-zero column so they cannot hit a KeyError.
for category in ['Oil', 'Natural Gas', 'Coal', 'Metals']:
    if category not in prod_wide.columns:
        prod_wide[category] = 0.0

resource_cols = [c for c in prod_wide.columns if c not in ['Country Name', 'Year']]
prod_wide['Total'] = prod_wide[resource_cols].sum(axis=1)

# Inner join: keep only country-years present in both tables. merge() does not
# modify its inputs, so df_master is passed directly without a defensive copy.
map_data = prod_wide.merge(df_master, on=['Country Name', 'Year'], how='inner')
map_data['GDP_total'] = (
    map_data['GDP per capita (constant prices, PPP)'] * map_data['Population']
)

for res in ['Total', 'Oil', 'Natural Gas', 'Coal', 'Metals']:
    if res in map_data.columns:
        per_capita = map_data[res] / map_data['Population']
        gdp_share_pct = (map_data[res] / map_data['GDP_total']) * 100
        # A zero population or GDP yields +/-inf, which neither fillna() nor
        # dropna() catches downstream; store NaN instead so those rows are
        # treated as missing.
        map_data[f'{res}_Per_Capita'] = per_capita.replace([np.inf, -np.inf], np.nan)
        map_data[f'{res}_GDP_Norm'] = gdp_share_pct.replace([np.inf, -np.inf], np.nan)

print(f"   Merged: {len(map_data)} country-years, {map_data['Country Code'].nunique()} countries")
print(f"   Years: {map_data['Year'].min()} – {map_data['Year'].max()}")

# ---------- A2. PRODUCTION MAP WITH SYNCED DROPDOWNS -----------------------

print("\nA2. Creating production map...")

# One choropleth trace per (resource, view) pair, ordered resource-major, so
# trace index = resource_index * len(map_views) + view_index. The <select>
# options and the JavaScript in controls_html rely on this ordering.
map_resources = ['Total', 'Oil', 'Natural Gas', 'Coal', 'Metals']
map_views = [('', 'absolute', 'USD'),
             ('_Per_Capita', 'per_capita', 'USD/person'),
             ('_GDP_Norm', 'gdp_norm', '% GDP')]
n_map_traces = len(map_resources) * len(map_views)


def _format_map_hover(values, norm_type):
    """Return one hover string per value, formatted for the given view type."""
    if norm_type == 'absolute':
        return [f"${v/1e9:.2f}B" if v >= 1e9
                else f"${v/1e6:.1f}M" if v >= 1e6
                else f"${v:,.0f}" for v in values]
    if norm_type == 'per_capita':
        return [f"${v:,.0f}" for v in values]
    return [f"{v:.2f}%" for v in values]


# The figure initially shows REFERENCE_YEAR; if that year is absent from the
# merged data, fall back to the latest available year.
map_years = sorted(map_data['Year'].unique())
if REFERENCE_YEAR in map_years or not map_years:
    initial_year = REFERENCE_YEAR
else:
    initial_year = map_years[-1]

initial_data = map_data[map_data['Year'] == initial_year]
traces = []

for resource in map_resources:
    for suffix, norm_type, units in map_views:
        col = f'{resource}{suffix}'
        z = initial_data[col].fillna(0)

        traces.append(go.Choropleth(
            locations=initial_data['Country Code'],
            z=z,
            text=initial_data['Country Name'],
            customdata=_format_map_hover(z, norm_type),
            colorscale='YlOrRd',
            marker=dict(line=dict(color=STYLE['choropleth_line_color'],
                                  width=STYLE['choropleth_line_width'])),
            colorbar=styled_colorbar(units),
            hovertemplate=(f'<b>%{{text}}</b><br>{resource}: '
                           f'%{{customdata}}<extra></extra>'),
            visible=False,
        ))

# Only the first trace (Total / Absolute) is visible until a dropdown changes it.
traces[0].visible = True

# Slider steps: each step restyles every trace with that year's data, so the
# currently selected resource/view keeps showing when the year changes.
slider_steps = []
for year in map_years:
    year_data = map_data[map_data['Year'] == year]
    z_list, hover_list = [], []

    for resource in map_resources:
        for suffix, norm_type, _units in map_views:
            col = f'{resource}{suffix}'
            z = year_data[col].fillna(0)
            z_list.append(z.tolist())
            hover_list.append(_format_map_hover(z, norm_type))

    slider_steps.append({
        'method': 'restyle',
        'args': [{
            'z': z_list,
            'customdata': hover_list,
            'locations': [year_data['Country Code'].tolist()] * n_map_traces,
            'text': [year_data['Country Name'].tolist()] * n_map_traces,
        }],
        'label': str(year),
    })

# Keep the slider handle on the year that is actually drawn initially.
active_step = map_years.index(initial_year) if map_years else 0

fig_prod = go.Figure(data=traces)

fig_prod.update_layout(
    **base_layout(margin=STYLE['margin_map']),
    **geo_layout(),
    title=styled_title("Natural Resource Production"),
    sliders=[{
        'active': active_step,
        'yanchor': 'top', 'xanchor': 'left',
        'currentvalue': slider_current_value(),
        'pad': {'b': 10, 't': 50},
        'len': 0.9, 'x': 0.05, 'y': 0,
        'steps': slider_steps,
        'transition': {'duration': 0},
        'font': slider_font(),
    }],
)

# Custom HTML controls: two fixed-position dropdowns injected into the exported
# page. Their option values are resource/view indices into the trace ordering
# above, so the <option> lists must stay in step with map_resources/map_views.
fig_html = fig_prod.to_html(include_plotlyjs='cdn', config=WRITE_CONFIG)

controls_html = f"""
<div style="position:fixed;top:20px;left:20px;z-index:1000;background:white;
padding:15px;border-radius:8px;box-shadow:0 2px 10px rgba(0,0,0,0.06);
border:1px solid #dde1e7;font-family:{STYLE['font_family']};">
  <label style="font-weight:600;color:{STYLE['title_color']};margin-right:10px;
  font-size:14px;">Resource:</label>
  <select id="resourceSelect" style="padding:8px;border:1px solid #dde1e7;
  border-radius:4px;font-size:14px;font-family:{STYLE['font_family']};">
    <option value="0">Total</option><option value="1">Oil</option>
    <option value="2">Natural Gas</option><option value="3">Coal</option>
    <option value="4">Metals</option>
  </select>
</div>
<div style="position:fixed;top:20px;right:20px;z-index:1000;background:white;
padding:15px;border-radius:8px;box-shadow:0 2px 10px rgba(0,0,0,0.06);
border:1px solid #dde1e7;font-family:{STYLE['font_family']};">
  <label style="font-weight:600;color:{STYLE['title_color']};margin-right:10px;
  font-size:14px;">View:</label>
  <select id="normSelect" style="padding:8px;border:1px solid #dde1e7;
  border-radius:4px;font-size:14px;font-family:{STYLE['font_family']};">
    <option value="0">Absolute</option><option value="1">Per Capita</option>
    <option value="2">% of GDP</option>
  </select>
</div>
<script>
let currentResource=0,currentNorm=0;
function updateMap(){{
  const vis=Array({n_map_traces}).fill(false);
  vis[currentResource*{len(map_views)}+currentNorm]=true;
  const p=document.getElementsByClassName('plotly-graph-div')[0];
  if(p)Plotly.restyle(p,{{visible:vis}});
}}
setTimeout(function(){{
  document.getElementById('resourceSelect').addEventListener('change',function(){{
    currentResource=parseInt(this.value);updateMap();}});
  document.getElementById('normSelect').addEventListener('change',function(){{
    currentNorm=parseInt(this.value);updateMap();}});
}},100);
</script>
"""

# Inject the controls right after the opening <body> tag (first occurrence only).
fig_html = fig_html.replace('<body>', '<body>' + controls_html, 1)
# Write as UTF-8 explicitly: the page declares a UTF-8 charset and may contain
# non-ASCII country names, which a platform-default encoding could mangle.
with open(os.path.join(output_dir, 'map_production.html'), 'w', encoding='utf-8') as f:
    f.write(fig_html)
print("   ✓ map_production.html")

# ---------- A3. CLUSTERING — TRAIN ONCE ON REFERENCE YEAR ------------------

print(f"\nA3. Training clustering model on reference year ({REFERENCE_YEAR})...")

# Clustering features: four production-to-GDP shares plus two development indices.
feature_cols = [
    'Metals_GDP_Norm', 'Oil_GDP_Norm', 'Natural Gas_GDP_Norm',
    'Coal_GDP_Norm', 'Economic Complexity Index', 'Human capital index',
]

ref_data = map_data[map_data['Year'] == REFERENCE_YEAR].copy()
# Keep only countries with a complete feature vector; take an explicit copy
# because cluster and PCA columns are added to this frame below.
ref_clean = ref_data[['Country Code', 'Country Name'] + feature_cols].dropna().copy()
print(f"   Reference year countries: {len(ref_clean)}")

# Fail early with a clear message instead of an obscure scikit-learn error
# when the reference year has too few complete rows to form the clusters.
if len(ref_clean) < N_CLUSTERS:
    raise ValueError(
        f"Only {len(ref_clean)} countries have complete features in "
        f"{REFERENCE_YEAR}; at least {N_CLUSTERS} are needed to fit "
        f"{N_CLUSTERS} clusters."
    )

# Standardise the features; the same fitted scaler is reused for all other years.
scaler_ref = StandardScaler()
X_ref_scaled = scaler_ref.fit_transform(ref_clean[feature_cols])

# Fixed random_state keeps the fit (and therefore the cluster label numbers
# that cluster_names is keyed on) reproducible for a given dataset.
kmeans_ref = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=50, max_iter=500)
kmeans_ref.fit(X_ref_scaled)
ref_clean['Cluster'] = kmeans_ref.labels_

sil_score = silhouette_score(X_ref_scaled, kmeans_ref.labels_)
print(f"   Silhouette score ({REFERENCE_YEAR}): {sil_score:.3f}")

# ---------- A4. CLUSTER NAMING ---------------------------------------------

print("\nA4. Analyzing cluster profiles...")

# Convert the k-means centroids from standardised units back to the original
# feature units so they can be read and interpreted directly.
centroids_scaled = kmeans_ref.cluster_centers_
centroids_original = scaler_ref.inverse_transform(centroids_scaled)
centroids_df = pd.DataFrame(centroids_original, columns=feature_cols)
centroids_df['Cluster'] = range(N_CLUSTERS)

print("\n   Cluster Centroids (original scale):")
print(centroids_df.round(3).to_string())

# Hand-assigned names keyed by the k-means label number.
# NOTE(review): these names are fixed text, not derived from the centroids;
# they presumably describe the centroids printed above for the current data —
# re-verify whenever the data, features, seed or N_CLUSTERS change.
cluster_names = {
    0: 'Petrostates',
    1: 'Low-Income Diversified',
    2: 'Advanced Economies',
    3: 'Coal & Metals Outlier',
    4: 'Mining-Dependent',
    5: 'Wealthy Hydrocarbon Exporters',
}

print("\n   Cluster Names:")
for c, name in sorted(cluster_names.items()):
    count = (ref_clean['Cluster'] == c).sum()
    print(f"   Cluster {c}: {name} ({count} countries)")

# Up to five member countries per cluster, in ref_clean row order.
print("\n   Sample countries per cluster:")
for c in range(N_CLUSTERS):
    countries = ref_clean[ref_clean['Cluster'] == c]['Country Name'].head(5).tolist()
    print(f"   Cluster {c} ({cluster_names[c]}): {', '.join(countries)}")

# ---------- A5. PCA ANALYSIS -----------------------------------------------

print("\nA5. Performing PCA analysis...")

# Two-component PCA fitted on the same standardised reference-year matrix
# that the k-means model was trained on.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_ref_scaled)

ref_clean['PC1'] = X_pca[:, 0]
ref_clean['PC2'] = X_pca[:, 1]
ref_clean['Cluster_Name'] = ref_clean['Cluster'].map(cluster_names)

print(f"   Variance explained: PC1={pca.explained_variance_ratio_[0]:.1%}, "
      f"PC2={pca.explained_variance_ratio_[1]:.1%}")
print(f"   Total: {pca.explained_variance_ratio_.sum():.1%}")

# Loadings table: one row per feature, one column per component
# (pca.components_ is components x features, hence the transpose).
loadings = pd.DataFrame(pca.components_.T, columns=['PC1', 'PC2'], index=feature_cols)
print("\n   PCA Loadings:")
print(loadings.round(3).to_string())

# ---------- A6. PCA SCATTER PLOT --------------------------------------------

print("\nA6. Creating PCA scatter plot...")

colors = STYLE['cluster_colors']
fig_pca = go.Figure()

# One scatter trace per cluster so each gets its own colour and legend entry.
for cid in range(N_CLUSTERS):
    cd = ref_clean[ref_clean['Cluster'] == cid]
    fig_pca.add_trace(go.Scatter(
        x=cd['PC1'], y=cd['PC2'],
        mode='markers',
        name=cluster_names[cid],
        marker=dict(size=STYLE['marker_size'], color=colors[cid],
                    line=dict(width=1, color='white')),
        hovertemplate=('<b>%{customdata}</b><br>PC1: %{x:.2f}<br>'
                       'PC2: %{y:.2f}<extra>' + cluster_names[cid] + '</extra>'),
        customdata=cd['Country Name'],
    ))

# Loadings are multiplied by this factor so the arrows are long enough to see
# next to the country points.
scale_factor = 3
# Short labels drawn at the arrow tips.
label_map = {
    'Metals_GDP_Norm': 'Metals', 'Oil_GDP_Norm': 'Oil',
    'Natural Gas_GDP_Norm': 'Gas', 'Coal_GDP_Norm': 'Coal',
    'Economic Complexity Index': 'ECI', 'Human capital index': 'HCI',
}

for feature in feature_cols:
    # Arrow from the origin (ax=0, ay=0 in data coordinates) to the scaled loading.
    fig_pca.add_annotation(
        x=loadings.loc[feature, 'PC1'] * scale_factor,
        y=loadings.loc[feature, 'PC2'] * scale_factor,
        ax=0, ay=0, xref='x', yref='y', axref='x', ayref='y',
        showarrow=True, arrowhead=2, arrowsize=1, arrowwidth=2, arrowcolor='#1a2744',
    )
    # Text label placed 15% beyond the arrow tip so it does not cover the arrowhead.
    fig_pca.add_annotation(
        x=loadings.loc[feature, 'PC1'] * scale_factor * 1.15,
        y=loadings.loc[feature, 'PC2'] * scale_factor * 1.15,
        text=f"<b>{label_map.get(feature, feature)}</b>",
        showarrow=False,
        font=dict(size=STYLE['annotation_size'], color='#1a2744',
                  family=STYLE['font_family']),
        bgcolor='rgba(255,255,255,0.85)', bordercolor='#c9cfd6',
        borderwidth=1, borderpad=3,
    )

# NOTE(review): the axis interpretations ('Economic Development',
# 'Hydrocarbon Intensity') are fixed text, not derived from the loadings;
# confirm they still match the printed loadings if the data changes.
# styled_title() ignores its arguments, so the title text below is not rendered.
fig_pca.update_layout(
    **base_layout(),
    title=styled_title(
        'PCA: Countries by Resource & Development Profile',
        f'PC1 ({pca.explained_variance_ratio_[0]:.0%}) vs PC2 '
        f'({pca.explained_variance_ratio_[1]:.0%}) — Arrows show feature loadings',
    ),
    xaxis=styled_axis(f'PC1: Economic Development '
                      f'({pca.explained_variance_ratio_[0]:.0%} variance)'),
    yaxis=styled_axis(f'PC2: Hydrocarbon Intensity '
                      f'({pca.explained_variance_ratio_[1]:.0%} variance)'),
    **legend_horizontal(-0.15),
)

save_html(fig_pca, 'pca_scatter_clusters.html')

# ---------- A7. PCA LOADINGS BAR CHARTS ------------------------------------

print("\nA7. Creating PCA loadings charts...")

# Display labels for the feature columns on the bar-chart axis.
feature_labels_clean = {
    'Metals_GDP_Norm': 'Metals (% GDP)',
    'Oil_GDP_Norm': 'Oil (% GDP)',
    'Natural Gas_GDP_Norm': 'Natural Gas (% GDP)',
    'Coal_GDP_Norm': 'Coal (% GDP)',
    'Economic Complexity Index': 'Economic Complexity',
    'Human capital index': 'Human Capital',
}

# One horizontal bar chart per principal component, saved to its own file.
for pc_col, subtitle in [
    ('PC1', 'Positive = Higher Development'),
    ('PC2', 'Positive = Higher Hydrocarbon Intensity'),
]:
    # Ascending sort: most negative loading first in the bar order.
    loadings_sorted = loadings[pc_col].sort_values()
    var_pct = pca.explained_variance_ratio_[0 if pc_col == 'PC1' else 1]

    fig_bar = go.Figure()
    fig_bar.add_trace(go.Bar(
        y=[feature_labels_clean.get(f, f) for f in loadings_sorted.index],
        x=loadings_sorted.values,
        orientation='h',
        # Strictly positive loadings use pos_color; zero and negative use neg_color.
        marker_color=[STYLE['pos_color'] if x > 0 else STYLE['neg_color']
                      for x in loadings_sorted],
        text=[f"{x:.3f}" for x in loadings_sorted.values],
        textposition='outside',
        textfont=dict(size=STYLE['tick_size'], family=STYLE['font_family']),
    ))
    # Dashed reference line at zero separating positive from negative loadings.
    fig_bar.add_vline(x=0, line_dash='dash', line_color='#c9cfd6')

    fig_bar.update_layout(
        **base_layout(height=STYLE['chart_height_small'], margin=STYLE['margin_bar']),
        title=styled_title(f"{pc_col} Loadings ({var_pct:.1%} variance)", subtitle),
        xaxis=styled_axis("Loading"),
        yaxis=dict(tickfont=dict(size=STYLE['tick_size'], family=STYLE['font_family'])),
    )

    # Files: pca_loadings_pc1.html and pca_loadings_pc2.html
    save_html(fig_bar, f'pca_loadings_{pc_col.lower()}.html')

# ---------- A8. APPLY CLUSTERING TO ALL YEARS -------------------------------

print("\nA8. Applying model to all years...")

all_years_results = []

# "Train once, predict always": every year is projected with the scaler,
# k-means model and PCA fitted on the reference year — nothing is refitted —
# so cluster ids and PC axes mean the same thing in every year.
for year in sorted(map_data['Year'].unique()):
    yd = map_data[map_data['Year'] == year].copy()
    # Only countries with a complete feature vector can be assigned a cluster.
    yc = yd[['Country Code', 'Country Name', 'Year'] + feature_cols].dropna()
    if len(yc) == 0:
        continue

    X_ys = scaler_ref.transform(yc[feature_cols])
    yc['Cluster'] = kmeans_ref.predict(X_ys)
    yc['Cluster_Name'] = yc['Cluster'].map(cluster_names)

    X_yp = pca.transform(X_ys)
    yc['PC1'] = X_yp[:, 0]
    yc['PC2'] = X_yp[:, 1]

    all_years_results.append(yc)

    # Log the cluster size distribution for a handful of milestone years only.
    if year in [1995, 2000, 2005, 2010, 2015, 2019]:
        dist = yc['Cluster'].value_counts().sort_index()
        print(f"   {year}: {dict(dist)}")

# Long table: one row per country-year with its cluster and PC coordinates.
temporal_data = pd.concat(all_years_results, ignore_index=True)
print(f"\n   Total observations: {len(temporal_data)}")

# ---------- A9. TEMPORAL MOVEMENT ANALYSIS ----------------------------------

print("\nA9. Analyzing country movements over time...")

# First and last year each country appears in temporal_data (i.e. the years
# for which it had a complete feature vector), plus a display name.
first_last = temporal_data.groupby('Country Code').agg({
    'Year': ['min', 'max'], 'Country Name': 'first',
}).reset_index()
# Flatten the two-level column index produced by the dict-style agg.
first_last.columns = ['Country Code', 'First_Year', 'Last_Year', 'Country Name']

# For every country, compare its cluster in its first observed year with its
# cluster in its last observed year.
movements = []
for _, row in first_last.iterrows():
    cc = row['Country Code']
    fo = temporal_data[(temporal_data['Country Code'] == cc) &
                       (temporal_data['Year'] == row['First_Year'])]
    lo = temporal_data[(temporal_data['Country Code'] == cc) &
                       (temporal_data['Year'] == row['Last_Year'])]
    if len(fo) > 0 and len(lo) > 0:
        # .iloc[0]: if a country-year has several rows, the first one is used.
        movements.append({
            'Country Code': cc,
            'Country Name': row['Country Name'],
            'First_Year': row['First_Year'],
            'Last_Year': row['Last_Year'],
            'Initial_Cluster': fo.iloc[0]['Cluster'],
            'Final_Cluster': lo.iloc[0]['Cluster'],
            'Initial_Cluster_Name': cluster_names[fo.iloc[0]['Cluster']],
            'Final_Cluster_Name': cluster_names[lo.iloc[0]['Cluster']],
        })

movements_df = pd.DataFrame(movements)
# True when the first and last clusters differ; clusters visited in between
# are not considered.
movements_df['Changed'] = movements_df['Initial_Cluster'] != movements_df['Final_Cluster']

print(f"   Countries that changed clusters: "
      f"{movements_df['Changed'].sum()} / {len(movements_df)}")

# Prints the first 15 changers in movements_df row order (no ranking applied).
print("\n   Notable Transitions:")
changers = movements_df[movements_df['Changed']].copy()
for _, row in changers.head(15).iterrows():
    print(f"   {row['Country Name']}: "
          f"{row['Initial_Cluster_Name']} → {row['Final_Cluster_Name']}")

# ---------- A10. SANKEY DIAGRAM ---------------------------------------------

print("\nA10. Creating Sankey diagram...")

# Count of countries per (initial cluster, final cluster) pair.
transition_matrix = pd.crosstab(
    movements_df['Initial_Cluster_Name'],
    movements_df['Final_Cluster_Name'],
)

# Node list: the first len(all_clusters) nodes are the "from" side, the next
# len(all_clusters) are the "to" side. The '›' decoration makes the two
# labels for the same cluster distinct strings.
all_clusters = list(cluster_names.values())
source_labels = [f"{c}  ›" for c in all_clusters]
target_labels = [f"›  {c}" for c in all_clusters]
all_labels = source_labels + target_labels

source_indices, target_indices, values, link_colors = [], [], [], []
color_map = {name: colors[i] for i, name in enumerate(cluster_names.values())}

for i, src in enumerate(all_clusters):
    for j, tgt in enumerate(all_clusters):
        # crosstab only contains clusters that actually occur, so guard the lookup.
        if src in transition_matrix.index and tgt in transition_matrix.columns:
            val = transition_matrix.loc[src, tgt]
            if val > 0:
                source_indices.append(i)
                # Target nodes sit after all source nodes in all_labels.
                target_indices.append(len(all_clusters) + j)
                values.append(val)
                base_color = color_map[src]
                # Links are tinted with the source cluster's colour; countries
                # that stayed in the same cluster get a more opaque link.
                alpha = 0.8 if src == tgt else 0.4
                # '#rrggbb' -> (r, g, b) integers for the rgba() string.
                rgb = tuple(int(base_color.lstrip('#')[k:k+2], 16) for k in (0, 2, 4))
                link_colors.append(f'rgba({rgb[0]},{rgb[1]},{rgb[2]},{alpha})')

fig_sankey = go.Figure(data=[go.Sankey(
    node=dict(
        pad=25, thickness=24,
        line=dict(color='#c9cfd6', width=0.5),
        label=all_labels,
        # Strip the '›' decoration to recover the cluster name and look up its
        # colour; '#5a6675' is the fallback for an unknown label.
        color=[color_map.get(l.replace('  ›', '').replace('›  ', ''), '#5a6675')
               for l in all_labels],
    ),
    link=dict(source=source_indices, target=target_indices,
              value=values, color=link_colors),
    textfont=dict(size=13, family=STYLE['font_family'], color='#1a2744'),
)])

# styled_title() ignores its arguments, so the year range below is not rendered.
fig_sankey.update_layout(
    **base_layout(),
    title=styled_title(
        f'Country Cluster Transitions '
        f'({movements_df["First_Year"].min()}–{movements_df["Last_Year"].max()})'),
)

save_html(fig_sankey, 'cluster_sankey_transitions.html')

# ---------- A11. CLUSTER MAP (STATIC, REFERENCE YEAR) ----------------------

print("\nA11. Creating cluster map...")

# Work on a copy of the reference-year assignments. ref_clean already carries
# a 'Cluster_Name' column from the PCA step; it is recomputed here from the
# same mapping.
ref_clean_named = ref_clean.copy()
ref_clean_named['Cluster_Name'] = ref_clean_named['Cluster'].map(cluster_names)

fig_cmap = go.Figure()
# One choropleth trace per cluster so each cluster gets its own legend entry.
for cid in range(N_CLUSTERS):
    dc = ref_clean_named[ref_clean_named['Cluster'] == cid]
    fig_cmap.add_trace(go.Choropleth(
        locations=dc['Country Code'],
        z=[cid] * len(dc),
        text=dc['Country Name'],
        name=cluster_names[cid],
        # Both ends of the colourscale are the cluster colour, so every country
        # in the trace is drawn in that single colour regardless of z.
        colorscale=[[0, colors[cid]], [1, colors[cid]]],
        zmin=0, zmax=N_CLUSTERS - 1,
        showscale=False,
        marker=dict(line=dict(color='white', width=0.5)),
        hovertemplate='<b>%{text}</b><br>' + cluster_names[cid] + '<extra></extra>',
    ))

fig_cmap.update_layout(
    **base_layout(margin=STYLE['margin_map']),
    **geo_layout(),
    **legend_map(),
    title=styled_title(f'Country Clusters ({REFERENCE_YEAR})'),
)

save_html(fig_cmap, 'map_clusters.html')

# ---------- A12. ANIMATED CLUSTER MAP --------------------------------------

print("\nA12. Creating animated cluster map...")

years = sorted(temporal_data['Year'].unique())
frames = []

# One animation frame per year. Every frame holds exactly N_CLUSTERS traces in
# cluster-id order (possibly empty), matching the order of the figure's
# initial traces below.
for year in years:
    yd = temporal_data[temporal_data['Year'] == year]
    fd = []
    for cid in range(N_CLUSTERS):
        dc = yd[yd['Cluster'] == cid]
        fd.append(go.Choropleth(
            locations=dc['Country Code'],
            z=[cid] * len(dc),
            text=dc['Country Name'],
            # Single-colour scale: both ends are the cluster colour.
            colorscale=[[0, colors[cid]], [1, colors[cid]]],
            zmin=0, zmax=N_CLUSTERS - 1,
            showscale=False,
            marker=dict(line=dict(color='white', width=0.5)),
            hovertemplate='<b>%{text}</b><br>' + cluster_names[cid] + '<extra></extra>',
        ))
    # The frame name is the year string; the slider steps refer to it.
    frames.append(go.Frame(data=fd, name=str(year)))

# Initial figure state: the earliest year, matching the slider's 'active': 0.
init_anim = temporal_data[temporal_data['Year'] == years[0]]
fig_anim = go.Figure()

for cid in range(N_CLUSTERS):
    dc = init_anim[init_anim['Cluster'] == cid]
    fig_anim.add_trace(go.Choropleth(
        locations=dc['Country Code'],
        z=[cid] * len(dc),
        text=dc['Country Name'],
        name=cluster_names[cid],
        colorscale=[[0, colors[cid]], [1, colors[cid]]],
        zmin=0, zmax=N_CLUSTERS - 1,
        showscale=False,
        marker=dict(line=dict(color='white', width=0.5)),
        hovertemplate='<b>%{text}</b><br>' + cluster_names[cid] + '<extra></extra>',
    ))

fig_anim.frames = frames

fig_anim.update_layout(
    **base_layout(margin=STYLE['margin_map']),
    **geo_layout(),
    **legend_map(),
    title=styled_title('Cluster Evolution Over Time'),
    updatemenus=[{
        'type': 'buttons', 'showactive': False,
        'y': 0, 'x': 0.1, 'xanchor': 'right', 'yanchor': 'top',
        'font': slider_font(),
        'buttons': [
            # Play: animate through the frames (first arg None), continuing
            # from the currently shown frame, 500 ms per frame.
            {'label': '▶ Play', 'method': 'animate',
             'args': [None, {'frame': {'duration': 500, 'redraw': True},
                             'fromcurrent': True,
                             'transition': {'duration': 200}}]},
            # Pause: animating to [None] in immediate mode stops the playback.
            {'label': '⏸ Pause', 'method': 'animate',
             'args': [[None], {'frame': {'duration': 0, 'redraw': False},
                               'mode': 'immediate',
                               'transition': {'duration': 0}}]},
        ],
    }],
    sliders=[{
        'active': 0, 'yanchor': 'top', 'xanchor': 'left',
        'currentvalue': slider_current_value(),
        'pad': {'b': 10, 't': 50}, 'len': 0.9, 'x': 0.05, 'y': 0,
        'font': slider_font(),
        # One step per year; each jumps straight to the frame named str(year).
        'steps': [{
            'args': [[str(y)], {'frame': {'duration': 0, 'redraw': True},
                                'mode': 'immediate',
                                'transition': {'duration': 0}}],
            'label': str(y), 'method': 'animate',
        } for y in years],
    }],
)

save_html(fig_anim, 'cluster_map_animated.html')


# ============================================================================
#  PART B — RESOURCE DIVERSITY & INTENSITY
# ============================================================================

print("\n" + "=" * 70)
print("PART B: RESOURCE DIVERSITY & INTENSITY")
print("=" * 70)

# Maps each individual resource name (as it appears in df_prod['Resource'])
# to one of five broad categories used for the diversity score. Resources
# missing from this map are reported and dropped in section B1.
CATEGORY_MAP = {
    'Oil': 'Fossil fuels', 'Natural Gas': 'Fossil fuels', 'Coal': 'Fossil fuels',
    'Copper': 'Base metals', 'Lead': 'Base metals', 'Zinc': 'Base metals',
    'Tin': 'Base metals', 'Nickel': 'Base metals', 'Aluminium': 'Base metals',
    'Gold': 'Precious metals', 'Silver': 'Precious metals',
    'Lithium': 'Battery/strategic', 'Cobalt': 'Battery/strategic',
    'Rare Earth': 'Battery/strategic', 'Vanadium': 'Battery/strategic',
    'Bauxite': 'Industrial minerals', 'Manganese': 'Industrial minerals',
    'Magnesium compounds': 'Industrial minerals', 'Cadmium': 'Industrial minerals',
    'Natural Graphite': 'Industrial minerals',
}

# Canonical category order; its length also sets the maximum possible entropy
# (ln of the number of categories) used for normalisation in section B2.
CATEGORY_ORDER = ['Fossil fuels', 'Base metals', 'Precious metals',
                  'Battery/strategic', 'Industrial minerals']

# ---------- B1. MARKET SHARES & THRESHOLD -----------------------------------

print(f"\nB1. Computing market shares (threshold: >{MARKET_SHARE_THRESHOLD}%)...")

# Part B works on the reference year only.
df_year = df_prod[df_prod['Year'] == REFERENCE_YEAR].copy()

# World total production value per resource (sum over all countries in df_year).
global_prod = df_year.groupby('Resource')['Production_TotalValue'].sum().reset_index()
global_prod.columns = ['Resource', 'Global_Production']

# Production value per (country, resource), with the world total attached.
country_res = df_year.groupby(
    ['Country Name', 'Resource']
)['Production_TotalValue'].sum().reset_index()
country_res = country_res.merge(global_prod, on='Resource')

country_res['Market_Share_Pct'] = (
    country_res['Production_TotalValue'] / country_res['Global_Production']
) * 100
# A zero world total gives inf or NaN; treat those shares as 0 so the
# threshold filter below removes them.
country_res['Market_Share_Pct'] = country_res['Market_Share_Pct'].replace(
    [np.inf, -np.inf], 0).fillna(0)

n_before = len(country_res)
# Inclusive filter: pairs with a share exactly equal to the threshold are
# kept, although the log message above prints '>'.
country_res = country_res[country_res['Market_Share_Pct'] >= MARKET_SHARE_THRESHOLD].copy()
print(f"   Pairs before threshold: {n_before}")
print(f"   Pairs after threshold:  {len(country_res)}")

# Attach the broad category; resources absent from CATEGORY_MAP become NaN,
# are reported in the warning, and are then dropped.
country_res['Category'] = country_res['Resource'].map(CATEGORY_MAP)
unmapped = country_res[country_res['Category'].isna()]['Resource'].unique()
if len(unmapped) > 0:
    print(f"   WARNING: Unmapped resources: {unmapped}")
country_res = country_res.dropna(subset=['Category'])

# ---------- B2. DIVERSITY (Shannon Entropy) ---------------------------------

print("\nB2. Computing Resource Diversity (Shannon entropy)...")

# Production value per (country, category), using only the resource pairs
# that survived the market-share threshold.
country_cat = country_res.groupby(
    ['Country Name', 'Category']
)['Production_TotalValue'].sum().reset_index()

# Each country's total across its categories (denominator of the shares).
country_totals = country_cat.groupby(
    'Country Name'
)['Production_TotalValue'].sum().reset_index()
country_totals.columns = ['Country Name', 'Country_Total']
country_cat = country_cat.merge(country_totals, on='Country Name')

# p_i: the category's share of the country's own (thresholded) production value.
country_cat['Domestic_Share'] = (
    country_cat['Production_TotalValue'] / country_cat['Country_Total']
)
# Per-category Shannon term -p_i * ln(p_i); the country's entropy is their sum.
country_cat['Entropy_Component'] = (
    -country_cat['Domestic_Share'] * np.log(country_cat['Domestic_Share'])
)

diversity = country_cat.groupby('Country Name').agg(
    Shannon_Entropy=('Entropy_Component', 'sum'),
    N_Categories=('Category', 'nunique'),
).reset_index()

# Normalise by ln(number of categories) — the entropy of a perfectly even
# split across all five categories — so the score lies in [0, 1] regardless
# of how many categories a given country actually produces.
MAX_ENTROPY = np.log(len(CATEGORY_ORDER))
diversity['Diversity_Normalized'] = diversity['Shannon_Entropy'] / MAX_ENTROPY

print(f"   Max possible entropy: {MAX_ENTROPY:.3f} (= ln({len(CATEGORY_ORDER)}))")
print(f"   Countries with scores: {len(diversity)}")

# Wide table of domestic shares (one column per category, 0 where a country
# has no production in that category), joined onto the diversity scores.
cat_pivot = country_cat.pivot_table(
    index='Country Name', columns='Category',
    values='Domestic_Share', fill_value=0,
).reset_index()

diversity = diversity.merge(cat_pivot, on='Country Name', how='left')

# ---------- B3. INTENSITY (Production / GDP) --------------------------------
# Resource intensity: a country's above-threshold production value expressed
# as a percentage of its GDP (GDP per capita x population) in the reference year.

print("\nB3. Computing Resource Intensity (production / GDP)...")

gdp_pc_col = 'GDP per capita (constant prices, PPP)'
gdp_data = df_master.loc[
    df_master['Year'] == REFERENCE_YEAR,
    ['Country Name', gdp_pc_col, 'Population'],
].copy()

# Non-numeric entries become NaN and are discarded just below.
gdp_pc_numeric = pd.to_numeric(gdp_data[gdp_pc_col], errors='coerce')
population_numeric = pd.to_numeric(gdp_data['Population'], errors='coerce')
gdp_data['GDP'] = gdp_pc_numeric * population_numeric

# Keep only usable denominators: present and strictly positive.
gdp_data = gdp_data.dropna(subset=['GDP'])
gdp_data = gdp_data.loc[gdp_data['GDP'] > 0, ['Country Name', 'GDP']]

country_total_prod = (
    country_res.groupby('Country Name')['Production_TotalValue']
    .sum()
    .reset_index(name='Total_Resource_Value')
)

# Inner join: countries without a valid GDP get no intensity score here.
intensity = country_total_prod.merge(gdp_data, how='inner', on='Country Name')
intensity['Resource_Intensity_Pct'] = (
    intensity['Total_Resource_Value'].div(intensity['GDP']).mul(100)
)

print(f"   Countries with intensity: {len(intensity)}")

# ---------- B4. MERGE -------------------------------------------------------
# Combine diversity and intensity into one per-country table, attach country
# codes, and zero-fill the score columns for countries missing either metric.

print("\nB4. Merging features...")

intensity_cols = ['Country Name', 'Total_Resource_Value', 'GDP', 'Resource_Intensity_Pct']
result = diversity.merge(intensity[intensity_cols], how='outer', on='Country Name')

master_codes = df_master[['Country Name', 'Country Code']].drop_duplicates()
result = result.merge(master_codes, how='left', on='Country Name')

# Discard the placeholder row whose country name is the literal string '0'.
is_real_country = result['Country Name'] != '0'
result = result[is_real_country]

# Missing scores are treated as 0 (note: GDP and Total_Resource_Value stay NaN).
for score_col in ('Shannon_Entropy', 'Diversity_Normalized', 'Resource_Intensity_Pct'):
    result[score_col] = result[score_col].fillna(0)
result['N_Categories'] = result['N_Categories'].fillna(0).astype(int)

diversity_min = result['Diversity_Normalized'].min()
diversity_max = result['Diversity_Normalized'].max()
intensity_min = result['Resource_Intensity_Pct'].min()
intensity_max = result['Resource_Intensity_Pct'].max()

print(f"   Final dataset: {len(result)} countries")
print(f"   Diversity range:  {diversity_min:.3f} – {diversity_max:.3f}")
print(f"   Intensity range:  {intensity_min:.2f}% – {intensity_max:.2f}%")

# ---------- B5. DIVERSITY MAP -----------------------------------------------
# World choropleth of the normalized Shannon-entropy diversity score.

print("\nB5. Creating diversity map...")

# Take the category count from CATEGORY_ORDER (the same source MAX_ENTROPY is
# derived from) so the hover text and subtitle cannot drift out of sync if a
# category is added or removed.
n_categories_total = len(CATEGORY_ORDER)

fig_div = go.Figure()

fig_div.add_trace(go.Choropleth(
    locations=result['Country Code'],
    z=result['Diversity_Normalized'],
    text=result['Country Name'],
    # customdata columns: [0] raw entropy, [1] active categories, [2] normalized score
    customdata=np.stack([
        result['Shannon_Entropy'],
        result['N_Categories'],
        result['Diversity_Normalized'],
    ], axis=-1),
    colorscale=[[0, '#f0f2f5'], [0.25, '#c9cfd6'], [0.5, '#7a8b99'],
                [0.75, '#4a6fa5'], [1, '#1a2744']],
    marker=dict(line=dict(color='white', width=STYLE['choropleth_line_width'])),
    # Fixed colour range: the normalized score is divided by the maximum entropy.
    zmin=0, zmax=1,
    colorbar=styled_colorbar('Normalized<br>Entropy'),
    hovertemplate=(
        '<b>%{text}</b><br>'
        'Diversity: %{customdata[2]:.3f}<br>'
        f'Categories: %{{customdata[1]:.0f}}/{n_categories_total}<br>'
        'Raw entropy: %{customdata[0]:.3f}'
        '<extra></extra>'
    ),
))

fig_div.update_layout(
    **base_layout(margin=STYLE['margin_map']),
    **geo_layout(),
    title=styled_title(
        f'Resource Diversity Index ({REFERENCE_YEAR})',
        f'Shannon entropy across {n_categories_total} resource categories '
        f'(threshold: >{MARKET_SHARE_THRESHOLD}% global market share)',
    ),
)

save_html(fig_div, 'diversity_map.html')

# ---------- B6. INTENSITY MAP ----------------------------------------------
# World choropleth of resource intensity. Colour uses log(1 + intensity) so a
# few very large values do not wash out the rest; hover shows the raw percent.

print("\nB6. Creating intensity map...")

result['Intensity_Log'] = np.log1p(result['Resource_Intensity_Pct'])

intensity_hover = (
    '<b>%{text}</b><br>'
    'Resource production: %{customdata:.1f}% of GDP'
    '<extra></extra>'
)
intensity_border = dict(color='white', width=STYLE['choropleth_line_width'])

intensity_trace = go.Choropleth(
    locations=result['Country Code'],
    z=result['Intensity_Log'],
    text=result['Country Name'],
    customdata=result['Resource_Intensity_Pct'],
    colorscale='YlOrRd',
    marker=dict(line=intensity_border),
    colorbar=styled_colorbar('Intensity<br>(log scale)'),
    hovertemplate=intensity_hover,
)

fig_int = go.Figure()
fig_int.add_trace(intensity_trace)

intensity_base = base_layout(margin=STYLE['margin_map'])
intensity_geo = geo_layout()
intensity_title = styled_title(
    f'Resource Intensity ({REFERENCE_YEAR})',
    'Total resource production value as % of GDP (log scale for visibility)',
)
fig_int.update_layout(**intensity_base, **intensity_geo, title=intensity_title)

save_html(fig_int, 'intensity_map.html')

# ---------- B7. DECOMPOSED BAR CHART — TOP 20 BY DIVERSITY -----------------
# Horizontal stacked bars: for the 20 highest-diversity countries, the share
# of domestic resource production contributed by each category.

print("\nB7. Creating decomposition chart...")

top20 = result.nlargest(20, 'Diversity_Normalized')
cat_colors = STYLE['category_colors']

fig_decomp = go.Figure()

# One bar series per category, in CATEGORY_ORDER; categories that never made
# it into the table (no column) are skipped.
plotted_categories = [name for name in CATEGORY_ORDER if name in top20.columns]
for cat in plotted_categories:
    category_bar = go.Bar(
        name=cat,
        x=top20[cat],
        y=top20['Country Name'],
        orientation='h',
        marker_color=cat_colors[cat],
        hovertemplate=f'<b>%{{y}}</b><br>{cat}: %{{x:.1%}}<extra></extra>',
    )
    fig_decomp.add_trace(category_bar)

decomp_base = base_layout(
    height=STYLE['chart_height_tall'],
    margin=dict(l=160, r=50, t=30, b=50),
)
decomp_title = styled_title(
    f'Resource Portfolio Composition ({REFERENCE_YEAR})',
    'Top 20 most diversified countries — share of domestic production by category',
)
decomp_xaxis = dict(
    **styled_axis('Share of domestic resource production'),
    tickformat='.0%',
)
decomp_yaxis = dict(
    categoryorder='total ascending',
    tickfont=dict(size=STYLE['tick_size'], family=STYLE['font_family']),
)
decomp_legend = legend_horizontal(1.02)

fig_decomp.update_layout(
    **decomp_base,
    barmode='stack',
    title=decomp_title,
    xaxis=decomp_xaxis,
    yaxis=decomp_yaxis,
    **decomp_legend,
)

save_html(fig_decomp, 'diversity_decomposed.html')

# ---------- B8. SCATTER — DIVERSITY vs INTENSITY ----------------------------
# One marker per country: normalized diversity (x) against resource intensity
# (y, log axis), coloured by the number of active resource categories.

print("\nB8. Creating diversity vs intensity scatter...")

# The y-axis is logarithmic, and zero (or negative) values have no position on
# a log scale. Missing intensities were zero-filled earlier, so without this
# filter those countries would vanish from the chart with no indication.
# Filter explicitly and report how many were left out.
scatter_df = result[result['Resource_Intensity_Pct'] > 0]
n_omitted = len(result) - len(scatter_df)
if n_omitted:
    print(f"   Note: {n_omitted} countries with non-positive intensity "
          f"omitted (log axis)")

# Keep the "/N" label in step with the category list used for the entropy.
n_categories_total = len(CATEGORY_ORDER)

fig_scat = go.Figure()

fig_scat.add_trace(go.Scatter(
    x=scatter_df['Diversity_Normalized'],
    y=scatter_df['Resource_Intensity_Pct'],
    mode='markers+text',
    text=scatter_df['Country Code'],
    textposition='top center',
    textfont=dict(size=8, family=STYLE['font_family']),
    marker=dict(
        size=STYLE['marker_size'],
        color=scatter_df['N_Categories'],
        colorscale=[[0, '#c9cfd6'], [0.33, '#7a8b99'],
                    [0.66, '#4a6fa5'], [1, '#1a2744']],
        showscale=True,
        colorbar=styled_colorbar('N categories'),
        line=STYLE['marker_line'],
    ),
    # customdata columns: [0] country name, [1] active categories, [2] intensity %
    customdata=np.stack([
        scatter_df['Country Name'],
        scatter_df['N_Categories'],
        scatter_df['Resource_Intensity_Pct'],
    ], axis=-1),
    hovertemplate=(
        '<b>%{customdata[0]}</b><br>'
        'Diversity: %{x:.3f}<br>'
        # Explicit precision: the unformatted value shows raw float digits.
        'Intensity: %{customdata[2]:.2f}% of GDP<br>'
        f'Categories: %{{customdata[1]}}/{n_categories_total}'
        '<extra></extra>'
    ),
))

fig_scat.update_layout(
    **base_layout(),
    title=styled_title(
        f'Resource Diversity vs Intensity ({REFERENCE_YEAR})',
        'Each dot = one country. Color = number of active categories.',
    ),
    xaxis=styled_axis('Resource Diversity (normalized entropy, 0–1)'),
    yaxis=dict(**styled_axis('Resource Intensity (% of GDP, log scale)'), type='log'),
)

save_html(fig_scat, 'diversity_vs_intensity.html')

# ============================================================================
#  SAVE ALL DATA FILES
# ============================================================================

save_banner = "=" * 70
print("\n" + save_banner)
print("SAVING DATA FILES")
print(save_banner)

# Attach readable cluster names to the centroid table before it is written.
centroids_df['Cluster_Name'] = centroids_df['Cluster'].map(cluster_names)

# (frame, file name, write the index?) — the transition matrix and the PCA
# loadings keep their index in the CSV; the other tables do not.
csv_exports = [
    (temporal_data, 'cluster_assignments_all_years.csv', False),
    (movements_df, 'cluster_movements_summary.csv', False),
    (centroids_df, 'cluster_centroids.csv', False),
    (transition_matrix, 'cluster_transition_matrix.csv', True),
    (loadings, 'pca_loadings.csv', True),
]
for export_frame, export_name, keep_index in csv_exports:
    export_frame.to_csv(os.path.join(output_dir, export_name), index=keep_index)

# Fixed score columns first, then whichever category-share columns exist.
output_cols = [
    'Country Name', 'Country Code', 'N_Categories',
    'Shannon_Entropy', 'Diversity_Normalized',
    'Total_Resource_Value', 'GDP', 'Resource_Intensity_Pct',
]
output_cols.extend(name for name in CATEGORY_ORDER if name in result.columns)

scores_path = os.path.join(output_dir, 'diversity_intensity_scores.csv')
result[output_cols].to_csv(scores_path, index=False)

print("   ✓ All data files saved")

# ============================================================================
#  COMPREHENSIVE RESULTS SUMMARY
#  (copy-paste this output to update the HTML page)
# ============================================================================

summary_banner = "=" * 70
print("\n" + summary_banner)
print("COMPREHENSIVE RESULTS SUMMARY")
print(summary_banner)

# --- Clustering overview ---
# Model configuration and fit quality for the reference-year clustering.
clustering_rule = '—' * 50
print(f"\n{clustering_rule}")
print("CLUSTERING")
print(clustering_rule)
print(f"  Reference year: {REFERENCE_YEAR}")
print(f"  Features: {len(feature_cols)}")
for feature_name in feature_cols:
    print(f"    - {feature_name}")
print(f"  Clusters: {N_CLUSTERS}")
print(f"  Silhouette score: {sil_score:.3f}")
print(f"  Countries in reference year: {len(ref_clean)}")

# --- Cluster profiles ---
# Per cluster: name, member count, up to eight example countries, and the
# centroid value of every clustering feature.
profiles_rule = '—' * 50
print(f"\n{profiles_rule}")
print("CLUSTER PROFILES")
print(profiles_rule)
for cluster_id in range(N_CLUSTERS):
    in_cluster = ref_clean['Cluster'] == cluster_id
    member_names = ref_clean.loc[in_cluster, 'Country Name'].tolist()
    print(f"\n  [{cluster_id}] {cluster_names[cluster_id]} ({in_cluster.sum()} countries)")
    print(f"      Examples: {', '.join(member_names[:8])}")
    centroid_row = centroids_df.loc[centroids_df['Cluster'] == cluster_id].iloc[0]
    for feat in feature_cols:
        print(f"      {feat}: {centroid_row[feat]:.3f}")

# --- PCA ---
# Variance explained by the first two components, then each feature's loadings.
pca_rule = '—' * 50
print(f"\n{pca_rule}")
print("PCA")
print(pca_rule)
variance_ratio = pca.explained_variance_ratio_
print(f"  PC1 variance: {variance_ratio[0]:.1%}")
print(f"  PC2 variance: {variance_ratio[1]:.1%}")
print(f"  Total: {variance_ratio.sum():.1%}")
print("\n  Loadings:")
for feat in feature_cols:
    pc1_loading = loadings.loc[feat, 'PC1']
    pc2_loading = loadings.loc[feat, 'PC2']
    print(f"    {feat:40s}  PC1={pc1_loading:+.3f}  PC2={pc2_loading:+.3f}")

# --- Temporal transitions ---
# Span of the tracked period, how many countries switched cluster, the full
# transition matrix, and the list of individual switches.
temporal_rule = '—' * 50
print(f"\n{temporal_rule}")
print("TEMPORAL TRANSITIONS")
print(temporal_rule)

tracked_years = temporal_data['Year']
changed_flags = movements_df['Changed']
print(f"  Period: {tracked_years.min()} – {tracked_years.max()}")
print(f"  Countries tracked: {temporal_data['Country Code'].nunique()}")
print(f"  Countries that changed cluster: {changed_flags.sum()} / {len(movements_df)}")
print(f"  Stability rate: {(~changed_flags).mean():.0%}")

print("\n  Transition matrix:")
print(transition_matrix.to_string())

print("\n  All transitions:")
for _, mover in changers.iterrows():
    country = mover['Country Name']
    origin = mover['Initial_Cluster_Name']
    destination = mover['Final_Cluster_Name']
    print(f"    {country}: {origin} → {destination}")

# --- Diversity top lists ---
# Headline figures for the diversity index and the 15 most diversified countries.
print(f"\n{'—' * 50}")
print("DIVERSITY INDEX (Shannon entropy, normalized 0–1)")
print(f"{'—' * 50}")
print(f"  Threshold: >{MARKET_SHARE_THRESHOLD}% global market share")
print(f"  Categories: {len(CATEGORY_ORDER)}")
print(f"  Countries scored: {len(result)}")
print(f"  Range: {result['Diversity_Normalized'].min():.3f} – {result['Diversity_Normalized'].max():.3f}")

print(f"\n  Top 15 by diversity:")
# The "/N" denominator comes from CATEGORY_ORDER so it always matches the
# "Categories:" line printed above instead of a hard-coded 5.
n_categories_total = len(CATEGORY_ORDER)
for i, (_, row) in enumerate(result.nlargest(15, 'Diversity_Normalized').iterrows(), 1):
    print(f"    {i:2d}. {row['Country Name']:30s} {row['Diversity_Normalized']:.3f}  "
          f"({int(row['N_Categories'])}/{n_categories_total} categories)")

# --- Intensity top lists ---
# Range of resource intensity and the 15 most resource-intensive economies.
intensity_rule = '—' * 50
print(f"\n{intensity_rule}")
print("RESOURCE INTENSITY (production value / GDP)")
print(intensity_rule)

intensity_pct = result['Resource_Intensity_Pct']
print(f"  Range: {intensity_pct.min():.2f}% – {intensity_pct.max():.2f}%")

print("\n  Top 15 by intensity:")
top_intensity = result.nlargest(15, 'Resource_Intensity_Pct')
for rank, (_, entry) in enumerate(top_intensity.iterrows(), start=1):
    country = entry['Country Name']
    pct_of_gdp = entry['Resource_Intensity_Pct']
    print(f"    {rank:2d}. {country:30s} {pct_of_gdp:.1f}% of GDP")

# --- Production leaders ---
# Top five producers by absolute production value for each headline resource
# in the reference year. The year is taken from REFERENCE_YEAR (not a literal)
# so this section stays in step with the rest of the analysis if it changes.
print(f"\n{'—' * 50}")
print(f"PRODUCTION LEADERS ({REFERENCE_YEAR}, absolute)")
print(f"{'—' * 50}")
prod_ref = map_data[map_data['Year'] == REFERENCE_YEAR].copy()
for res in ['Total', 'Oil', 'Natural Gas', 'Coal', 'Metals']:
    # Skip resources that have no column in the map data.
    if res in prod_ref.columns:
        top5 = prod_ref.nlargest(5, res)[['Country Name', res]]
        print(f"\n  {res}:")
        for _, row in top5.iterrows():
            val = row[res]
            # Billions with one decimal at or above 1e9, otherwise whole millions.
            label = f"${val/1e9:.1f}B" if val >= 1e9 else f"${val/1e6:.0f}M"
            print(f"    {row['Country Name']:30s} {label}")

print(f"\n{'=' * 70}")
print("END OF SUMMARY")
print("=" * 70)

PART A: PRODUCTION MAPS + CLUSTERING + PCA

A1. Loading and preparing data...
   Production data: 17166 rows
   Master data: 3150 rows
   Merged: 3069 country-years, 126 countries
   Years: 1995 – 2019

A2. Creating production map...
   ✓ map_production.html

A3. Training clustering model on reference year (2019)...
   Reference year countries: 125
   Silhouette score (2019): 0.344

A4. Analyzing cluster profiles...

   Cluster Centroids (original scale):
   Metals_GDP_Norm  Oil_GDP_Norm  Natural Gas_GDP_Norm  Coal_GDP_Norm  Economic Complexity Index  Human capital index  Cluster
0            0.121         8.277                 0.493          0.001                     -0.729                2.497        0
1            0.526         0.404                 0.119          0.094                     -0.786                2.123        1
2            0.376         0.301                 0.122          0.195                      0.789                3.225        2
3            5.792         0.000