In [100]:
import numpy as np
import pandas as pd
from pathlib import Path
#Import plotting means 
import matplotlib.pyplot as plt
import seaborn as sns

In [101]:
import plotly.graph_objects as go


In [None]:
master = pd.read_csv("C:/Users/emili/OneDrive/Documentos/LSE MPA DSPP/AT 2025/Capstone/Master Data/Master.csv")

In [None]:
master.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 49 columns):
 #   Column                                                               Non-Null Count  Dtype  
---  ------                                                               --------------  -----  
 0   Unnamed: 0                                                           3150 non-null   int64  
 1   Country Code                                                         3150 non-null   object 
 2   Country Name                                                         3150 non-null   object 
 3   Year                                                                 3150 non-null   int64  
 4   Access to electricity (% of population)                              3150 non-null   float64
 5   Adjusted savings: gross savings (% of GNI)                           3150 non-null   float64
 6   Agriculture                                                          3150 non-null   float64
 7   Capita

In [None]:
master.columns

Index(['Unnamed: 0', 'Country Code', 'Country Name', 'Year',
       'Access to electricity (% of population)',
       'Adjusted savings: gross savings (% of GNI)', 'Agriculture',
       'Capital depreciation rate', 'Clientelism index',
       'Death rates, crude per 1000 people',
       'Domestic credit to private sector (% of GDP)',
       'Economic Complexity Index', 'GDP per capita (constant prices, PPP)',
       'Government revenue',
       'Gross fixed capital formation, all, Constant prices, Percent of GDP',
       'Human capital index', 'Industry',
       'Inflation, consumer prices (annual %)', 'Landlocked',
       'Lending interest rate (%)', 'Life expectancy at birth, total (years)',
       'Manufacturing', 'Mineral rents (% of GDP)',
       'Mobile cellular subscriptions (per 100 people)',
       'Natural gas rents (% of GDP)', 'Oil rents (% of GDP)',
       'Political corruption index', 'Political stability — estimate',
       'Primary net lending, General government, Perce

In [None]:
country_number = master['Country Name'].nunique()
year_number = master['Year'].nunique()

In [None]:
f'We have a master data set with {country_number} countries and {year_number} years'

'We have a master data set with 126 countries and 25 years'

In [None]:
master['Income1995'].unique()

array(['Low-income countries', 'High-income countries',
       'Upper-middle-income countries', 'Lower-middle-income countries',
       nan], dtype=object)

In [None]:
master['Country Name'][master['Income1995'] == 'High-income countries'].unique()

array(['United Arab Emirates', 'Australia', 'Austria',
       'Brunei Darussalam', 'Canada', 'Switzerland', 'Cyprus', 'Germany',
       'Denmark', 'Spain', 'Finland', 'France', 'United Kingdom',
       'Ireland', 'Israel', 'Italy', 'Japan', 'Korea, Rep.', 'Kuwait',
       'Netherlands', 'Norway', 'New Zealand', 'Portugal', 'Qatar',
       'Sweden', 'United States'], dtype=object)

UAE, Kuwait, Qatar

In [None]:
master['Country Name'][master['Income1995'] == 'Upper-middle-income countries'].unique()

array(['Argentina', 'Bahrain', 'Brazil', 'Chile', 'Czechia', 'Gabon',
       'Greece', 'Croatia', 'Hungary', 'Libya', 'Mexico', 'Malaysia',
       'Oman', 'Saudi Arabia', 'Slovenia', 'Trinidad and Tobago',
       'Uruguay', 'South Africa'], dtype=object)

Thematic Grouping

In [None]:
variable_groups = {
    'identifiers': [
        'Country Code', 'Country Name', 'Year'
    ],

        'economic_complexity_index': [
        'Economic Complexity Index'
    ],
    
    'resource_dependence': [
        'Oil rents (% of GDP)',
        'Natural gas rents (% of GDP)',
        'Mineral rents (% of GDP)',
        'Total natural resources rents (% of GDP)',
        'Total_Production',
        'Total_Reserves',
        'Total_Production_Value',
        'Total_Reserves_Value',
        'Hydrocarbons_Dominant',
        'Subsoil_Metals_Dominant',
        'Precious_Metals_Dominant'
    ],
    
    'economic_structure': [
        'Trade (% of GDP)',
        'Agriculture',
        'Industry',
        'Manufacturing',
        'Services'
    ],
    
    'macroeconomic_indicators': [
        'GDP per capita (constant prices, PPP)',
        'Inflation, consumer prices (annual %)',
        'Share of consumption in GDP',
        'Share of investment in GDP',
        'Share of government spending in GDP',
        'Gross fixed capital formation, all, Constant prices, Percent of GDP',
        'Capital depreciation rate'
    ],
    
    'fiscal_financial': [
        'Government revenue',
        'Primary net lending, General government, Percent of GDP',
        'Adjusted savings: gross savings (% of GNI)',
        'Domestic credit to private sector (% of GDP)',
        'Lending interest rate (%)',
        'Real interest rate (%)',
        'Use of IMF credit (DOD, current US$)'
    ],
    
    'governance_institutions': [
        'Rule of law index',
        'Political corruption index',
        'Clientelism index',
        'Property rights',
        'Political stability — estimate'
    ],
    
    'human_capital_development': [
        'Human capital index',
        'Life expectancy at birth, total (years)',
        'Death rates, crude per 1000 people'
    ],
    
    'infrastructure_technology': [
        'Access to electricity (% of population)',
        'Mobile cellular subscriptions (per 100 people)'
    ],
    
    'structural_characteristics': [
        'Landlocked',
        'Urban population (% of total population)',
        'Population'
    ]
}

In [None]:
def create_descriptive_table(df, variable_groups, exclude_groups=['identifiers']):
    """
    Create descriptive statistics tables for each variable group
    
    Parameters:
    - df: your master dataframe
    - variable_groups: dictionary with theme names as keys and variable lists as values
    - exclude_groups: list of group names to skip
    """
    
    all_tables = {}
    
    for group_name, variables in variable_groups.items():
        if group_name in exclude_groups:
            continue
        
        # Get variables that exist in the dataframe
        existing_vars = [v for v in variables if v in df.columns]
        
        if not existing_vars:
            continue
        
        # Calculate statistics
        stats_data = []
        
        for var in existing_vars:
            stats_data.append({
                'Variable': var,
                'Mean': df[var].mean(),
                'Std': df[var].std(),
                'Median': df[var].median(),
                'Q1': df[var].quantile(0.25),
                'Q3': df[var].quantile(0.75),
                'N': df[var].count()
            })
        
        # Create dataframe for this group
        group_table = pd.DataFrame(stats_data)
        
        # Round numeric columns
        numeric_cols = ['Mean', 'Std', 'Median', 'Q1', 'Q3']
        group_table[numeric_cols] = group_table[numeric_cols].round(3)
        
        all_tables[group_name] = group_table
        
        # Display the table
        print(f"\n{'='*80}")
        print(f"{group_name.upper().replace('_', ' ')}")
        print(f"{'='*80}")
        print(group_table.to_string(index=False))
        print()
    
    return all_tables

# Usage
descriptive_tables = create_descriptive_table(master, variable_groups)


ECONOMIC COMPLEXITY INDEX
                 Variable  Mean   Std  Median     Q1    Q3    N
Economic Complexity Index -0.07 1.024  -0.209 -0.832 0.586 3150


RESOURCE DEPENDENCE
                                Variable         Mean          Std       Median           Q1           Q3    N
                    Oil rents (% of GDP) 5.318000e+00 1.137100e+01 1.990000e-01        0.000 3.333000e+00 3150
            Natural gas rents (% of GDP) 6.530000e-01 1.677000e+00 2.100000e-02        0.000 4.200000e-01 3150
                Mineral rents (% of GDP) 8.590000e-01 2.266000e+00 4.600000e-02        0.000 5.170000e-01 3150
Total natural resources rents (% of GDP) 8.948000e+00 1.222700e+01 3.580000e+00        0.807 1.224300e+01 3150
                        Total_Production 5.363860e+07 2.808798e+08 8.309639e+05    69940.994 7.673191e+06 3150
                          Total_Reserves 1.253761e+04 4.109445e+04 0.000000e+00        0.000 2.146692e+03 3138
                  Total_Production_Value 4.103

In [None]:
# Calculate statistics
eci_stats = master.groupby('Year')['Economic Complexity Index'].agg([
    ('Mean', 'mean'),
    ('Q1', lambda x: x.quantile(0.25)),
    ('Q3', lambda x: x.quantile(0.75))
]).reset_index()

fig = go.Figure()

# Add interquartile range as shaded area
fig.add_trace(go.Scatter(
    x=eci_stats['Year'].tolist() + eci_stats['Year'].tolist()[::-1],
    y=eci_stats['Q3'].tolist() + eci_stats['Q1'].tolist()[::-1],
    fill='toself',
    fillcolor='rgba(44, 62, 80, 0.15)',
    line=dict(color='rgba(255,255,255,0)'),
    showlegend=True,
    name='Interquartile Range',
    hoverinfo='skip'
))

# Add mean line
fig.add_trace(go.Scatter(
    x=eci_stats['Year'],
    y=eci_stats['Mean'],
    mode='lines+markers',
    line=dict(color='#2C3E50', width=2.5),
    marker=dict(size=6, color='#2C3E50'),
    name='Mean'
))

fig.update_layout(
    title={
        'text': 'Evolution of Economic Complexity Index',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 16, 'color': '#2C3E50'}
    },
    xaxis_title='Year',
    yaxis_title='Economic Complexity Index',
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(family="Arial, sans-serif", size=12, color='#2C3E50'),
    xaxis=dict(
        showgrid=True,
        gridcolor='#E5E5E5',
        showline=True,
        linecolor='#2C3E50',
        linewidth=1
    ),
    yaxis=dict(
        showgrid=True,
        gridcolor='#E5E5E5',
        showline=True,
        linecolor='#2C3E50',
        linewidth=1
    ),
    hovermode='x unified',
    width=900,
    height=500,
    legend=dict(
        x=0.02,
        y=0.98,
        bgcolor='rgba(255,255,255,0.8)',
        bordercolor='#E5E5E5',
        borderwidth=1
    )
)

fig.show()

Plot is not useful because ECI is normalized each year. 

## Heatmap

In [None]:
import plotly.graph_objects as go
import pandas as pd

def create_eci_correlation_heatmap(df, variable_groups, exclude_groups=['identifiers', 'economic_complexity_index']):
    """
    Create a heatmap showing ECI correlation with all variables, grouped by theme
    """
    
    # Collect all variables with their themes and correlations
    data_list = []
    
    for group_name, vars_list in variable_groups.items():
        if group_name in exclude_groups:
            continue
        
        existing_vars = [v for v in vars_list if v in df.columns]
        
        for var in existing_vars:
            # Calculate correlation
            corr = df[['Economic Complexity Index', var]].corr().iloc[0, 1]
            
            data_list.append({
                'Theme': group_name.replace('_', ' ').title(),
                'Variable': var,
                'Correlation': corr
            })
    
    # Create dataframe
    corr_df = pd.DataFrame(data_list)
    
    # Sort by theme first, then by correlation within each theme
    corr_df = corr_df.sort_values(['Theme', 'Correlation'], ascending=[True, False])
    
    # Reset index to get proper ordering
    corr_df = corr_df.reset_index(drop=True)
    
    # Create the heatmap data
    z_values = corr_df['Correlation'].values.reshape(-1, 1)
    y_labels = corr_df['Variable'].tolist()
    
    # Create heatmap
    fig = go.Figure(data=go.Heatmap(
        z=z_values,
        y=y_labels,
        x=['Correlation with ECI'],
        colorscale=[
            [0, '#D73027'],      # Strong negative (red)
            [0.25, '#FC8D59'],   # Moderate negative (orange)
            [0.5, '#FFFFBF'],    # Near zero (light yellow)
            [0.75, '#91BFDB'],   # Moderate positive (light blue)
            [1, '#4575B4']       # Strong positive (dark blue)
        ],
        zmid=0,
        text=z_values,
        texttemplate='%{text:.3f}',
        textfont={"size": 9},
        colorbar=dict(
            title="Correlation",
            tickmode="linear",
            tick0=-1,
            dtick=0.25,
            thickness=15,
            len=0.7
        ),
        hovertemplate='<b>%{y}</b><br>Correlation: %{z:.3f}<extra></extra>'
    ))
    
    # Find theme boundaries and create separators
    theme_boundaries = []
    theme_midpoints = []
    theme_labels = []
    
    current_theme = None
    theme_start = 0
    
    for idx, row in corr_df.iterrows():
        if current_theme != row['Theme']:
            if current_theme is not None:
                # Add boundary line
                theme_boundaries.append(idx - 0.5)
                # Calculate midpoint for previous theme
                theme_midpoints.append((theme_start + idx - 1) / 2)
                theme_labels.append(current_theme)
            
            current_theme = row['Theme']
            theme_start = idx
    
    # Add last theme
    if current_theme is not None:
        theme_midpoints.append((theme_start + len(corr_df) - 1) / 2)
        theme_labels.append(current_theme)
    
    # Update layout
    fig.update_layout(
        title={
            'text': 'Economic Complexity Index - Correlation Matrix by Theme',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 16, 'color': '#2C3E50', 'family': 'Arial, sans-serif'}
        },
        xaxis=dict(
            side='top',
            showgrid=False,
            showline=True,
            linecolor='#2C3E50',
            linewidth=1
        ),
        yaxis=dict(
            showgrid=False,
            showline=True,
            linecolor='#2C3E50',
            linewidth=1,
            tickfont=dict(size=9),
            autorange='reversed'  # This ensures top to bottom ordering
        ),
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(family="Arial, sans-serif", size=11, color='#2C3E50'),
        height=max(800, len(y_labels) * 18),
        width=900,
        margin=dict(l=350, r=150, t=100, b=50)
    )
    
    # Add horizontal lines to separate themes
    shapes = []
    for boundary in theme_boundaries:
        shapes.append(dict(
            type='line',
            x0=-0.55, x1=0.55,
            y0=boundary, y1=boundary,
            line=dict(color='#2C3E50', width=2),
            xref='x',
            yref='y'
        ))
    
    fig.update_layout(shapes=shapes)
    
    # Add theme labels on the left side
    annotations = []
    for label, midpoint in zip(theme_labels, theme_midpoints):
        annotations.append(dict(
            x=-0.75,
            y=midpoint,
            text=f'<b>{label}</b>',
            showarrow=False,
            xanchor='right',
            yanchor='middle',
            font=dict(size=11, color='#2C3E50', family='Arial, sans-serif'),
            xref='x',
            yref='y',
            textangle=0
        ))
    
    fig.update_layout(annotations=annotations)
    
    return fig, corr_df

# Create the heatmap
fig, corr_data = create_eci_correlation_heatmap(master, variable_groups)
fig.show()

# Print summary statistics
print("\nCorrelation Summary by Theme:")
print(corr_data.groupby('Theme')['Correlation'].agg(['mean', 'min', 'max']).round(3))

print("\n\nTop 10 Positive Correlations with ECI:")
print(corr_data.nlargest(10, 'Correlation')[['Variable', 'Theme', 'Correlation']].to_string(index=False))

print("\n\nTop 10 Negative Correlations with ECI:")
print(corr_data.nsmallest(10, 'Correlation')[['Variable', 'Theme', 'Correlation']].to_string(index=False))

# Save
# fig.write_html('eci_correlation_heatmap.html')
# fig.write_image('eci_correlation_heatmap.png', width=900, height=1400, scale=2)


Correlation Summary by Theme:
                             mean    min    max
Theme                                          
Economic Structure          0.105 -0.552  0.631
Fiscal Financial            0.122 -0.297  0.672
Governance Institutions     0.088 -0.675  0.668
Human Capital Development   0.457 -0.048  0.739
Infrastructure Technology   0.481  0.349  0.613
Macroeconomic Indicators    0.110 -0.223  0.597
Resource Dependence        -0.079 -0.472  0.161
Structural Characteristics  0.081 -0.072  0.205


Top 10 Positive Correlations with ECI:
                                    Variable                     Theme  Correlation
                         Human capital index Human Capital Development     0.739458
     Life expectancy at birth, total (years) Human Capital Development     0.678240
Domestic credit to private sector (% of GDP)          Fiscal Financial     0.672480
                           Rule of law index   Governance Institutions     0.667558
                            

# High Natural Resource Countries

In [None]:
high_resource_countries = master[
    ((master['Total natural resources rents (% of GDP)'] >= 5) & 
     (master['Year'] == 1995) & 
     (master['Income1995'] != 'High-income countries')) |
    ((master['Country Name'].isin(['United Arab Emirates', 'Kuwait', 'Qatar'])) & 
     (master['Year'] == 1995))
]

In [None]:
high_resource_countries['Country Name'].unique()

array(['Angola', 'United Arab Emirates', 'Azerbaijan', 'Burkina Faso',
       'Bahrain', 'Bolivia', 'Chile', "Cote d'Ivoire", 'Cameroon',
       'Congo, Dem. Rep.', 'Congo, Rep.', 'Algeria', 'Ecuador',
       'Egypt, Arab Rep.', 'Ethiopia', 'Gabon', 'Ghana', 'Guinea',
       'Equatorial Guinea', 'Indonesia', 'Iran, Islamic Rep.', 'Iraq',
       'Kazakhstan', 'Kenya', 'Kuwait', 'Lao PDR', 'Liberia', 'Libya',
       'Madagascar', 'Mali', 'Myanmar', 'Mongolia', 'Mozambique',
       'Malawi', 'Malaysia', 'Niger', 'Nigeria', 'Oman',
       'Papua New Guinea', 'Qatar', 'Russian Federation', 'Rwanda',
       'Saudi Arabia', 'Chad', 'Togo', 'Trinidad and Tobago', 'Tanzania',
       'Uganda', 'Uzbekistan', 'Venezuela, RB', 'Viet Nam', 'Yemen, Rep.',
       'Zambia', 'Zimbabwe'], dtype=object)

In [None]:
high_resource_countries['Country Name'].nunique()

54

In [None]:
high_resource_countries['Country Code'].unique()

array(['AGO', 'ARE', 'AZE', 'BFA', 'BHR', 'BOL', 'CHL', 'CIV', 'CMR',
       'COD', 'COG', 'DZA', 'ECU', 'EGY', 'ETH', 'GAB', 'GHA', 'GIN',
       'GNQ', 'IDN', 'IRN', 'IRQ', 'KAZ', 'KEN', 'KWT', 'LAO', 'LBR',
       'LBY', 'MDG', 'MLI', 'MMR', 'MNG', 'MOZ', 'MWI', 'MYS', 'NER',
       'NGA', 'OMN', 'PNG', 'QAT', 'RUS', 'RWA', 'SAU', 'TCD', 'TGO',
       'TTO', 'TZA', 'UGA', 'UZB', 'VEN', 'VNM', 'YEM', 'ZMB', 'ZWE'],
      dtype=object)

In [None]:
high_resource_countries_df = master[master['Country Code'].isin(['AGO', 'ARE', 'AZE', 'BFA', 'BHR', 'BOL', 'CHL', 'CIV', 'CMR',
       'COD', 'COG', 'DZA', 'ECU', 'EGY', 'ETH', 'GAB', 'GHA', 'GIN',
       'GNQ', 'IDN', 'IRN', 'IRQ', 'KAZ', 'KEN', 'KWT', 'LAO', 'LBR',
       'LBY', 'MDG', 'MLI', 'MMR', 'MNG', 'MOZ', 'MWI', 'MYS', 'NER',
       'NGA', 'OMN', 'PNG', 'QAT', 'RUS', 'RWA', 'SAU', 'TCD', 'TGO',
       'TTO', 'TZA', 'UGA', 'UZB', 'VEN', 'VNM', 'YEM', 'ZMB', 'ZWE'])]

In [None]:
high_resource_countries_df.to_csv("C:/Users/emili/OneDrive/Documentos/LSE MPA DSPP/AT 2025/Capstone/Master Data/high_resource_countries.csv")

In [None]:
#cluster_list = pd.read_csv("C:/Users/emili/OneDrive/Documentos/LSE MPA DSPP/AT 2025/Capstone/Descriptive Statistics/clusters_list.csv")

In [None]:
natural_resource_country_number = high_resource_countries_df['Country Name'].nunique()
natural_resource_year_number = high_resource_countries_df['Year'].nunique()

In [None]:
f'We have a master data set with {natural_resource_country_number} countries and {natural_resource_year_number} years'

'We have a master data set with 54 countries and 25 years'

In [None]:
descriptive_tables = create_descriptive_table(high_resource_countries_df, variable_groups)


ECONOMIC COMPLEXITY INDEX
                 Variable  Mean  Std  Median    Q1     Q3    N
Economic Complexity Index -0.76 0.67  -0.804 -1.22 -0.343 1350


RESOURCE DEPENDENCE
                                Variable         Mean          Std       Median           Q1           Q3    N
                    Oil rents (% of GDP) 1.128900e+01 1.518600e+01 3.425000e+00        0.000 1.879700e+01 1350
            Natural gas rents (% of GDP) 1.161000e+00 2.118000e+00 1.710000e-01        0.000 1.535000e+00 1350
                Mineral rents (% of GDP) 1.263000e+00 2.794000e+00 5.500000e-02        0.000 8.390000e-01 1350
Total natural resources rents (% of GDP) 1.791300e+01 1.364700e+01 1.359600e+01        7.513 2.441000e+01 1350
                        Total_Production 1.598696e+07 6.447241e+07 7.234320e+05    86157.000 3.099438e+06 1350
                          Total_Reserves 2.378625e+04 5.644884e+04 5.238650e+02        0.000 7.144807e+03 1340
                  Total_Production_Value 2.07973

In [None]:
# Create the heatmap
fig, corr_data = create_eci_correlation_heatmap(high_resource_countries_df, variable_groups)
fig.show()

# Print summary statistics
print("\nCorrelation Summary by Theme:")
print(corr_data.groupby('Theme')['Correlation'].agg(['mean', 'min', 'max']).round(3))

print("\n\nTop 10 Positive Correlations with ECI:")
print(corr_data.nlargest(10, 'Correlation')[['Variable', 'Theme', 'Correlation']].to_string(index=False))

print("\n\nTop 10 Negative Correlations with ECI:")
print(corr_data.nsmallest(10, 'Correlation')[['Variable', 'Theme', 'Correlation']].to_string(index=False))

fig.write_html('C:/Users/emili/OneDrive/Documentos/LSE MPA DSPP/AT 2025/Capstone/Descriptive Statistics/eci_correlation_heatmap.html')


Correlation Summary by Theme:
                             mean    min    max
Theme                                          
Economic Structure          0.118 -0.309  0.327
Fiscal Financial            0.128 -0.278  0.582
Governance Institutions     0.010 -0.393  0.316
Human Capital Development   0.211 -0.387  0.512
Infrastructure Technology   0.405  0.314  0.497
Macroeconomic Indicators    0.060 -0.216  0.430
Resource Dependence         0.097 -0.132  0.325
Structural Characteristics  0.089 -0.066  0.221


Top 10 Positive Correlations with ECI:
                                      Variable                     Theme  Correlation
  Domestic credit to private sector (% of GDP)          Fiscal Financial     0.581617
                           Human capital index Human Capital Development     0.512310
       Life expectancy at birth, total (years) Human Capital Development     0.507606
       Access to electricity (% of population) Infrastructure Technology     0.497311
         GDP per c

# ECI evolution resource rich vs non resource rich

In [None]:
# Compare trajectories
fig = go.Figure()

# high resource countries
high_resource = high_resource_countries_df.groupby('Year')['Economic Complexity Index'].mean()
fig.add_trace(go.Scatter(x=high_resource.index, y=high_resource.values, 
                         mode='lines+markers', name='High Resource Countries',
                         line=dict(color='#E74C3C', width=2.5)))

# non resource countries
non_resource = master[~master['Country Name'].isin(high_resource_countries_df)].groupby('Year')['Economic Complexity Index'].mean()
fig.add_trace(go.Scatter(x=non_resource.index, y=non_resource.values,
                         mode='lines+markers', name='Other Countries',
                         line=dict(color='#3498DB', width=2.5)))

fig.update_layout(title='ECI Evolution: Resource-Rich vs Other Countries',
                  xaxis_title='Year', yaxis_title='Average ECI',
                  plot_bgcolor='white', paper_bgcolor='white')
fig.show()
fig.write_html('C:/Users/emili/OneDrive/Documentos/LSE MPA DSPP/AT 2025/Capstone/Descriptive Statistics/eci_evolution_resource_rich_vs_other.html')

# Scatter: Resource Dependence vs ECI (with country labels)

In [None]:
# Take a snapshot year (e.g., 2015 or latest available)
snapshot = high_resource_countries_df[high_resource_countries_df['Year'] == 2015].copy()

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=snapshot['Total natural resources rents (% of GDP)'],
    y=snapshot['Economic Complexity Index'],
    mode='markers+text',
    text=snapshot['Country Name'],
    textposition='top center',
    textfont=dict(size=8),
    marker=dict(size=8, color='#2C3E50', opacity=0.6),
    hovertemplate='<b>%{text}</b><br>Resource Rents: %{x:.1f}%<br>ECI: %{y:.2f}<extra></extra>'
))

fig.update_layout(
    title='Resource Dependence vs Economic Complexity (2015)',
    xaxis_title='Total Natural Resources Rents (% of GDP)',
    yaxis_title='Economic Complexity Index',
    plot_bgcolor='white', paper_bgcolor='white',
    height=700, width=1000
)
fig.show()
fig.write_html('C:/Users/emili/OneDrive/Documentos/LSE MPA DSPP/AT 2025/Capstone/Descriptive Statistics/resource dependence vs economic complexity.html')

# Summary Statistics by Resource Type

In [None]:
# Compare key indicators across resource types
resource_comparison = []

for resource_type in ['Hydrocarbons_Dominant', 'Subsoil_Metals_Dominant', 'Precious_Metals_Dominant']:
    subset = high_resource_countries_df[high_resource_countries_df[resource_type] == 1]
    
    resource_comparison.append({
        'Resource Type': resource_type.replace('_', ' ').replace(' Dominant', ''),
        'N Countries': subset['Country Name'].nunique(),
        'Avg ECI': subset['Economic Complexity Index'].mean(),
        'Avg Manufacturing %': subset['Manufacturing'].mean(),
        'Avg Resource Rents %': subset['Total natural resources rents (% of GDP)'].mean(),
        'Avg GDP per capita': subset['GDP per capita (constant prices, PPP)'].mean()
    })

comparison_table = pd.DataFrame(resource_comparison)
print(comparison_table.round(2))

     Resource Type  N Countries  Avg ECI  Avg Manufacturing %  \
0     Hydrocarbons           25    -0.85                11.08   
1   Subsoil Metals           26    -0.65                12.89   
2  Precious Metals           27    -0.86                10.27   

   Avg Resource Rents %  Avg GDP per capita  
0                 27.88            18263.88  
1                 16.04            18141.73  
2                 10.98             3445.14  


In [None]:
comparison_table = pd.DataFrame(resource_comparison)

styled_table = (comparison_table.style
    .format({
        'N Countries': '{:.0f}',
        'Avg ECI': '{:.2f}',
        'Avg Manufacturing %': '{:.1f}%',
        'Avg Resource Rents %': '{:.1f}%',
        'Avg GDP per capita': '${:,.0f}'
    })
    .set_properties(**{
        'text-align': 'center',
        'font-size': '11pt'
    })
    .set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#2C5F8D'), 
                                      ('color', 'white'),
                                      ('font-weight', 'bold'),
                                      ('text-align', 'center')]},
        {'selector': 'td', 'props': [('border', '1px solid #ddd')]},
        {'selector': 'tr:hover', 'props': [('background-color', '#f5f5f5')]}
    ])
    .hide(axis='index')
)

styled_table

Resource Type,N Countries,Avg ECI,Avg Manufacturing %,Avg Resource Rents %,Avg GDP per capita
Hydrocarbons,25,-0.85,11.1%,27.9%,"$18,264"
Subsoil Metals,26,-0.65,12.9%,16.0%,"$18,142"
Precious Metals,27,-0.86,10.3%,11.0%,"$3,445"


# Winners vs Losers Table

In [None]:
# Countries with biggest ECI gains/losses
eci_change = high_resource_countries_df.groupby('Country Name').agg({
    'Economic Complexity Index': lambda x: x.iloc[-1] - x.iloc[0] if len(x) > 1 else None,
    'Year': ['min', 'max'],
    'Total natural resources rents (% of GDP)': 'mean'
}).dropna()

eci_change.columns = ['ECI_Change', 'Start_Year', 'End_Year', 'Avg_Resource_Rents']
eci_change = eci_change.sort_values('ECI_Change', ascending=False)

print("Top 10 ECI Improvers (Resource-Rich):")
print(eci_change[eci_change['Avg_Resource_Rents'] >= 10].head(10))

print("\nTop 10 ECI Decliners (Resource-Rich):")
print(eci_change[eci_change['Avg_Resource_Rents'] >= 10].tail(10))

Top 10 ECI Improvers (Resource-Rich):
                      ECI_Change  Start_Year  End_Year  Avg_Resource_Rents
Country Name                                                              
Papua New Guinea           1.011        1995      2019           20.087218
Bahrain                    0.903        1995      2019           19.478799
Malaysia                   0.876        1995      2019           10.218629
Iran, Islamic Rep.         0.855        1995      2019           24.180912
Angola                     0.710        1995      2019           35.412467
Kuwait                     0.676        1995      2019           44.748756
Qatar                      0.660        1995      2019           33.329006
Nigeria                    0.609        1995      2019           14.302003
United Arab Emirates       0.449        1995      2019           20.623888
Ethiopia                   0.327        1995      2019           19.287381

Top 10 ECI Decliners (Resource-Rich):
               ECI_Chan

# Convergence/Divergence Plot

In [None]:
# Track individual high-resource countries over time
fig = go.Figure()

for country in ['Bahrain', 'Malaysia', 'Chile', 'Saudi Arabia', 'Nigeria', 'Algeria', 'Iraq']:
    country_data = high_resource_countries_df[high_resource_countries_df['Country Name'] == country]
    if len(country_data) > 0:
        fig.add_trace(go.Scatter(
            x=country_data['Year'],
            y=country_data['Economic Complexity Index'],
            mode='lines+markers',
            name=country,
            line=dict(width=2)
        ))

fig.update_layout(
    title='ECI Trajectories: Selected Countries',
    xaxis_title='Year', yaxis_title='Economic Complexity Index',
    plot_bgcolor='white', paper_bgcolor='white',
    height=600, width=1000
)
fig.show()
fig.write_html('C:/Users/emili/OneDrive/Documentos/LSE MPA DSPP/AT 2025/Capstone/Descriptive Statistics/eci_evolution_selected_countries.html')

# Regressions

In [None]:
high_resource_countries_df=high_resource_countries_df.drop(columns=['Unnamed: 0'])

In [None]:
high_resource_countries_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1350 entries, 0 to 3149
Data columns (total 48 columns):
 #   Column                                                               Non-Null Count  Dtype  
---  ------                                                               --------------  -----  
 0   Country Code                                                         1350 non-null   object 
 1   Country Name                                                         1350 non-null   object 
 2   Year                                                                 1350 non-null   int64  
 3   Access to electricity (% of population)                              1350 non-null   float64
 4   Adjusted savings: gross savings (% of GNI)                           1350 non-null   float64
 5   Agriculture                                                          1350 non-null   float64
 6   Capital depreciation rate                                            1350 non-null   float64
 7   Clientelism

## Kitchen Sink

In [104]:
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col

# Define dependent variable
y = high_resource_countries_df['Economic Complexity Index']

# Define independent variables (excluding identifiers and the dependent variable)
independent_vars = [
    'Access to electricity (% of population)',
    'Adjusted savings: gross savings (% of GNI)',
    'Agriculture',
    'Capital depreciation rate',
    'Clientelism index',
    'Death rates, crude per 1000 people',
    'Domestic credit to private sector (% of GDP)',
    'GDP per capita (constant prices, PPP)',
    'Government revenue',
    'Gross fixed capital formation, all, Constant prices, Percent of GDP',
    'Human capital index',
    'Industry',
    'Inflation, consumer prices (annual %)',
    'Landlocked',
    'Lending interest rate (%)',
    'Life expectancy at birth, total (years)',
    'Manufacturing',
    'Mineral rents (% of GDP)',
    'Mobile cellular subscriptions (per 100 people)',
    'Natural gas rents (% of GDP)',
    'Oil rents (% of GDP)',
    'Political corruption index',
    'Political stability — estimate',
    'Primary net lending, General government, Percent of GDP',
    'Property rights',
    'Real interest rate (%)',
    'Rule of law index',
    'Services',
    'Share of consumption in GDP',
    'Share of government spending in GDP',
    'Share of investment in GDP',
    'Total natural resources rents (% of GDP)',
    'Trade (% of GDP)',
    'Urban population (% of total population)',
    'Use of IMF credit (DOD, current US$)',
    'Total_Production',
    'Total_Reserves',
    'Total_Production_Value',
    'Total_Reserves_Value',
    'Hydrocarbons_Dominant',
    'Subsoil_Metals_Dominant',
    'Precious_Metals_Dominant',
    'Population'
]

# Create X matrix
X = high_resource_countries_df[independent_vars]

# Add constant
X = sm.add_constant(X)

# Run OLS regression
model = sm.OLS(y, X, missing='drop')  # 'drop' handles any remaining missing values
results = model.fit()

# Display results
print(results.summary())

# For a cleaner presentation
print("\n" + "="*80)
print("KEY STATISTICS")
print("="*80)
print(f"R-squared: {results.rsquared:.4f}")
print(f"Adjusted R-squared: {results.rsquared_adj:.4f}")
print(f"F-statistic: {results.fvalue:.2f}")
print(f"Prob (F-statistic): {results.f_pvalue:.4e}")
print(f"N observations: {results.nobs:.0f}")

                                OLS Regression Results                               
Dep. Variable:     Economic Complexity Index   R-squared:                       0.630
Model:                                   OLS   Adj. R-squared:                  0.618
Method:                        Least Squares   F-statistic:                     51.28
Date:                       Sat, 24 Jan 2026   Prob (F-statistic):          9.24e-246
Time:                               11:03:56   Log-Likelihood:                -701.14
No. Observations:                       1340   AIC:                             1490.
Df Residuals:                           1296   BIC:                             1719.
Df Model:                                 43                                         
Covariance Type:                   nonrobust                                         
                                                                          coef    std err          t      P>|t|      [0.025      0.975]
----

In [105]:
# Extract significant coefficients (p < 0.05)
significant_vars = results.pvalues[results.pvalues < 0.05].sort_values()
print("\n" + "="*80)
print("SIGNIFICANT VARIABLES (p < 0.05)")
print("="*80)
for var, pval in significant_vars.items():
    if var != 'const':
        coef = results.params[var]
        print(f"{var:50s} | Coef: {coef:8.4f} | p-value: {pval:.4f}")


SIGNIFICANT VARIABLES (p < 0.05)
Domestic credit to private sector (% of GDP)       | Coef:   0.0106 | p-value: 0.0000
Capital depreciation rate                          | Coef:  -7.8366 | p-value: 0.0000
Use of IMF credit (DOD, current US$)               | Coef:   0.0000 | p-value: 0.0000
Total_Production                                   | Coef:   0.0000 | p-value: 0.0000
Manufacturing                                      | Coef:   0.0154 | p-value: 0.0000
Mineral rents (% of GDP)                           | Coef:  -0.0324 | p-value: 0.0000
Human capital index                                | Coef:   0.2007 | p-value: 0.0000
Death rates, crude per 1000 people                 | Coef:  -0.0501 | p-value: 0.0000
Total_Reserves                                     | Coef:   0.0000 | p-value: 0.0000
Primary net lending, General government, Percent of GDP | Coef:   0.0077 | p-value: 0.0002
Gross fixed capital formation, all, Constant prices, Percent of GDP | Coef:   0.0065 | p-value: 0.000

In [None]:
human capital index
gross fixed capital formation
political corruption index
political stability index
rule of law index
property rights index
GDP per capita (constant prices, PPP)
Total natural resources rents (% of GDP)
Trade (% of GDP) 

## First Regression

In [107]:
import statsmodels.api as sm
import pandas as pd

# Create interaction terms
high_resource_countries_df['HCI_x_ResourceRents'] = (
    high_resource_countries_df['Human capital index'] * 
    high_resource_countries_df['Total natural resources rents (% of GDP)']
)

high_resource_countries_df['GrossCapForm_x_ResourceRents'] = (
    high_resource_countries_df['Gross fixed capital formation, all, Constant prices, Percent of GDP'] * 
    high_resource_countries_df['Total natural resources rents (% of GDP)']
)

# Define independent variables
independent_vars = [
    'Human capital index',
    'Gross fixed capital formation, all, Constant prices, Percent of GDP',
    'Political corruption index',
    'Political stability — estimate',
    'Rule of law index',
    'Property rights',
    'GDP per capita (constant prices, PPP)',
    'Total natural resources rents (% of GDP)',
    'Trade (% of GDP)',
    'HCI_x_ResourceRents',
    'GrossCapForm_x_ResourceRents'
]

# Create a clean dataset - IMPORTANT: ensure all numeric
regression_data = high_resource_countries_df[['Economic Complexity Index', 'Country Code'] + independent_vars].copy()

# Convert all independent variables to numeric (just in case)
for var in independent_vars:
    regression_data[var] = pd.to_numeric(regression_data[var], errors='coerce')

# Drop any rows with missing values
regression_data = regression_data.dropna()

# Define y and X from clean data
y = regression_data['Economic Complexity Index']
X = regression_data[independent_vars]

# Create country dummies from clean data
country_dummies = pd.get_dummies(regression_data['Country Code'], prefix='Country', drop_first=True)

# Ensure country dummies are numeric (they should be, but just in case)
country_dummies = country_dummies.astype(float)

# Combine X with country dummies
X = X.join(country_dummies)

# Add constant
X = sm.add_constant(X)

# Run OLS
model = sm.OLS(y, X)
results = model.fit(cov_type='HC1')

# Display results
print(results.summary())

# Display main variables
print("\n" + "="*80)
print("MAIN VARIABLES & INTERACTIONS")
print("="*80)
main_vars = independent_vars + ['const']
for var in main_vars:
    if var in results.params.index:
        coef = results.params[var]
        se = results.bse[var]
        tstat = results.tvalues[var]
        pval = results.pvalues[var]
        sig = '***' if pval < 0.01 else '**' if pval < 0.05 else '*' if pval < 0.10 else ''
        print(f"{var:60s} | Coef: {coef:8.4f} | SE: {se:6.4f} | t: {tstat:6.2f} | p: {pval:.4f} {sig}")

print("\n" + "="*80)
print("MODEL FIT")
print("="*80)
print(f"R-squared: {results.rsquared:.4f}")
print(f"Adjusted R-squared: {results.rsquared_adj:.4f}")
print(f"N observations: {results.nobs:.0f}")
print(f"N countries: {regression_data['Country Code'].nunique()}")

                                OLS Regression Results                               
Dep. Variable:     Economic Complexity Index   R-squared:                       0.778
Model:                                   OLS   Adj. R-squared:                  0.767
Method:                        Least Squares   F-statistic:                     130.4
Date:                       Sat, 24 Jan 2026   Prob (F-statistic):               0.00
Time:                               11:36:43   Log-Likelihood:                -358.73
No. Observations:                       1350   AIC:                             847.5
Df Residuals:                           1285   BIC:                             1186.
Df Model:                                 64                                         
Covariance Type:                         HC1                                         
                                                                          coef    std err          z      P>|z|      [0.025      0.975]
----

Interaction of HCI with natural resource rents is significant

## Add country fixed effects

In [113]:
# If Year wasn't in your original regression_data, add it:
regression_data = high_resource_countries_df[['Economic Complexity Index', 'Country Code', 'Year'] + independent_vars].copy()

# Convert all independent variables to numeric
for var in independent_vars:
    regression_data[var] = pd.to_numeric(regression_data[var], errors='coerce')

# Drop missing values
regression_data = regression_data.dropna()

# Now create both sets of dummies from clean data
country_dummies = pd.get_dummies(regression_data['Country Code'], prefix='Country', drop_first=True).astype(float)
year_dummies = pd.get_dummies(regression_data['Year'], prefix='Year', drop_first=True).astype(float)

# Create year dummies from the clean regression_data (not original dataframe)
year_dummies = pd.get_dummies(regression_data['Year'], prefix='Year', drop_first=True)

# Ensure year dummies are numeric
year_dummies = year_dummies.astype(float)

# Rebuild X with both country and year fixed effects
X_twoway = regression_data[independent_vars].join(country_dummies).join(year_dummies)

# Add constant
X_twoway = sm.add_constant(X_twoway)

# Define y from clean data
y_twoway = regression_data['Economic Complexity Index']

# Run OLS with two-way fixed effects
model_twoway = sm.OLS(y_twoway, X_twoway)
results_twoway = model_twoway.fit(cov_type='HC1')

print(results_twoway.summary())

# Display main variables (excluding fixed effects for readability)
print("\n" + "="*80)
print("MAIN VARIABLES & INTERACTIONS (Two-Way Fixed Effects)")
print("="*80)
main_vars = independent_vars + ['const']
for var in main_vars:
    if var in results_twoway.params.index:
        coef = results_twoway.params[var]
        se = results_twoway.bse[var]
        tstat = results_twoway.tvalues[var]
        pval = results_twoway.pvalues[var]
        sig = '***' if pval < 0.01 else '**' if pval < 0.05 else '*' if pval < 0.10 else ''
        print(f"{var:60s} | Coef: {coef:8.4f} | SE: {se:6.4f} | t: {tstat:6.2f} | p: {pval:.4f} {sig}")

print("\n" + "="*80)
print("MODEL FIT")
print("="*80)
print(f"R-squared: {results_twoway.rsquared:.4f}")
print(f"Adjusted R-squared: {results_twoway.rsquared_adj:.4f}")
print(f"N observations: {results_twoway.nobs:.0f}")
print(f"N countries: {regression_data['Country Code'].nunique()}")
print(f"N years: {regression_data['Year'].nunique()}")

                                OLS Regression Results                               
Dep. Variable:     Economic Complexity Index   R-squared:                       0.783
Model:                                   OLS   Adj. R-squared:                  0.768
Method:                        Least Squares   F-statistic:                     95.80
Date:                       Sat, 24 Jan 2026   Prob (F-statistic):               0.00
Time:                               11:43:18   Log-Likelihood:                -341.37
No. Observations:                       1350   AIC:                             860.7
Df Residuals:                           1261   BIC:                             1324.
Df Model:                                 88                                         
Covariance Type:                         HC1                                         
                                                                          coef    std err          z      P>|z|      [0.025      0.975]
----

## Lagged Variable

In [114]:
# Sort data by country and year to ensure proper lagging
regression_data = regression_data.sort_values(['Country Code', 'Year'])

# Create lagged ECI (t-1)
regression_data['ECI_lag1'] = regression_data.groupby('Country Code')['Economic Complexity Index'].shift(1)

# Drop rows where lagged ECI is missing (first year for each country)
regression_data = regression_data.dropna(subset=['ECI_lag1'])

# Add lagged ECI to independent variables list
independent_vars_lagged = independent_vars + ['ECI_lag1']

# Define y and X from updated data
y = regression_data['Economic Complexity Index']
X = regression_data[independent_vars_lagged]

# Create country and year dummies from updated data
country_dummies = pd.get_dummies(regression_data['Country Code'], prefix='Country', drop_first=True).astype(float)
year_dummies = pd.get_dummies(regression_data['Year'], prefix='Year', drop_first=True).astype(float)

# Combine X with fixed effects
X_twoway = X.join(country_dummies).join(year_dummies)

# Add constant
X_twoway = sm.add_constant(X_twoway)

# Run OLS with two-way fixed effects
model_twoway = sm.OLS(y, X_twoway)
results_twoway = model_twoway.fit(cov_type='HC1')

print(results_twoway.summary())

# Display main variables
print("\n" + "="*80)
print("MAIN VARIABLES & INTERACTIONS (Two-Way FE with Lagged ECI)")
print("="*80)
main_vars = independent_vars_lagged + ['const']
for var in main_vars:
    if var in results_twoway.params.index:
        coef = results_twoway.params[var]
        se = results_twoway.bse[var]
        tstat = results_twoway.tvalues[var]
        pval = results_twoway.pvalues[var]
        sig = '***' if pval < 0.01 else '**' if pval < 0.05 else '*' if pval < 0.10 else ''
        print(f"{var:60s} | Coef: {coef:8.4f} | SE: {se:6.4f} | t: {tstat:6.2f} | p: {pval:.4f} {sig}")

print("\n" + "="*80)
print("MODEL FIT")
print("="*80)
print(f"R-squared: {results_twoway.rsquared:.4f}")
print(f"Adjusted R-squared: {results_twoway.rsquared_adj:.4f}")
print(f"N observations: {results_twoway.nobs:.0f}")
print(f"N countries: {regression_data['Country Code'].nunique()}")
print(f"N years: {regression_data['Year'].nunique()}")

                                OLS Regression Results                               
Dep. Variable:     Economic Complexity Index   R-squared:                       0.831
Model:                                   OLS   Adj. R-squared:                  0.819
Method:                        Least Squares   F-statistic:                     136.2
Date:                       Sat, 24 Jan 2026   Prob (F-statistic):               0.00
Time:                               13:58:19   Log-Likelihood:                -167.94
No. Observations:                       1296   AIC:                             513.9
Df Residuals:                           1207   BIC:                             973.7
Df Model:                                 88                                         
Covariance Type:                         HC1                                         
                                                                          coef    std err          z      P>|z|      [0.025      0.975]
----

## Add structural controls (electricity, urban population, death rates, type of resource)

In [117]:
# Create interaction terms in original dataframe first
high_resource_countries_df['HCI_x_ResourceRents'] = (
    high_resource_countries_df['Human capital index'] * 
    high_resource_countries_df['Total natural resources rents (% of GDP)']
)

high_resource_countries_df['GrossCapForm_x_ResourceRents'] = (
    high_resource_countries_df['Gross fixed capital formation, all, Constant prices, Percent of GDP'] * 
    high_resource_countries_df['Total natural resources rents (% of GDP)']
)

# Define ALL variables we need
all_vars = [
    'Economic Complexity Index',
    'Country Code',
    'Year',
    'Human capital index',
    'Gross fixed capital formation, all, Constant prices, Percent of GDP',
    'Political corruption index',
    'Political stability — estimate',
    'Rule of law index',
    'Property rights',
    'GDP per capita (constant prices, PPP)',
    'Total natural resources rents (% of GDP)',
    'Trade (% of GDP)',
    'HCI_x_ResourceRents',
    'GrossCapForm_x_ResourceRents',
    'Hydrocarbons_Dominant',
    'Subsoil_Metals_Dominant',
    'Precious_Metals_Dominant',
    'Urban population (% of total population)',
    'Landlocked',
    'Access to electricity (% of population)',
    'Death rates, crude per 1000 people'
]

# Create clean regression dataset with all variables
regression_data = high_resource_countries_df[all_vars].copy()

# Convert numeric variables to numeric
numeric_vars = [v for v in all_vars if v not in ['Country Code']]
for var in numeric_vars:
    if var != 'Year':
        regression_data[var] = pd.to_numeric(regression_data[var], errors='coerce')

# Drop missing values
regression_data = regression_data.dropna()

# Sort by country and year
regression_data = regression_data.sort_values(['Country Code', 'Year'])

# Create lagged ECI
regression_data['ECI_lag1'] = regression_data.groupby('Country Code')['Economic Complexity Index'].shift(1)

# Drop rows where lagged ECI is missing
regression_data = regression_data.dropna(subset=['ECI_lag1'])

# Define independent variables
independent_vars_expanded = [
    'Human capital index',
    'Gross fixed capital formation, all, Constant prices, Percent of GDP',
    'Political corruption index',
    'Political stability — estimate',
    'Rule of law index',
    'Property rights',
    'GDP per capita (constant prices, PPP)',
    'Total natural resources rents (% of GDP)',
    'Trade (% of GDP)',
    'HCI_x_ResourceRents',
    'GrossCapForm_x_ResourceRents',
    'ECI_lag1',
    'Hydrocarbons_Dominant',
    'Subsoil_Metals_Dominant',
    'Precious_Metals_Dominant',
    'Urban population (% of total population)',
    'Landlocked',
    'Access to electricity (% of population)',
    'Death rates, crude per 1000 people'
]

# Define y and X
y = regression_data['Economic Complexity Index']
X = regression_data[independent_vars_expanded]

# Create country and year dummies
country_dummies = pd.get_dummies(regression_data['Country Code'], prefix='Country', drop_first=True).astype(float)
year_dummies = pd.get_dummies(regression_data['Year'], prefix='Year', drop_first=True).astype(float)

# Combine X with fixed effects
X_twoway = X.join(country_dummies).join(year_dummies)

# Add constant
X_twoway = sm.add_constant(X_twoway)

# Run OLS with two-way fixed effects
model_twoway = sm.OLS(y, X_twoway)
results_twoway = model_twoway.fit(cov_type='HC1')

print(results_twoway.summary())

# Display main variables
print("\n" + "="*80)
print("MAIN VARIABLES & INTERACTIONS (Two-Way FE with Lagged ECI)")
print("="*80)
main_vars = independent_vars_expanded + ['const']
for var in main_vars:
    if var in results_twoway.params.index:
        coef = results_twoway.params[var]
        se = results_twoway.bse[var]
        tstat = results_twoway.tvalues[var]
        pval = results_twoway.pvalues[var]
        sig = '***' if pval < 0.01 else '**' if pval < 0.05 else '*' if pval < 0.10 else ''
        print(f"{var:60s} | Coef: {coef:8.4f} | SE: {se:6.4f} | t: {tstat:6.2f} | p: {pval:.4f} {sig}")

print("\n" + "="*80)
print("MODEL FIT")
print("="*80)
print(f"R-squared: {results_twoway.rsquared:.4f}")
print(f"Adjusted R-squared: {results_twoway.rsquared_adj:.4f}")
print(f"N observations: {results_twoway.nobs:.0f}")
print(f"N countries: {regression_data['Country Code'].nunique()}")
print(f"N years: {regression_data['Year'].nunique()}")

                                OLS Regression Results                               
Dep. Variable:     Economic Complexity Index   R-squared:                       0.835
Model:                                   OLS   Adj. R-squared:                  0.823
Method:                        Least Squares   F-statistic:                     114.4
Date:                       Sat, 24 Jan 2026   Prob (F-statistic):               0.00
Time:                               14:08:48   Log-Likelihood:                -150.66
No. Observations:                       1296   AIC:                             491.3
Df Residuals:                           1201   BIC:                             982.2
Df Model:                                 94                                         
Covariance Type:                         HC1                                         
                                                                          coef    std err          z      P>|z|      [0.025      0.975]
----


covariance of constraints does not have full rank. The number of constraints is 95, but rank is 93

