# Phase 2: Advanced Feature Engineering for Economic Shock Resilience

**Building on the Data Collection Phase**:
- 38 high-quality countries 
- 26 World Bank indicators (100% success) 
- 1,292 observations (1990-2023) 
- 84.8% data coverage 
 
**Phase 2 Objectives**:
1. **Economic Complexity Features**: Implement additional indicators beyond raw metrics
2. **Shock Resilience Metrics**: Quantify recovery patterns and vulnerability
3. **Temporal Dynamics**: Capture economic cycles and momentum
4. **Institutional Proxies**: Create governance and stability measures
5. **Network Effects**: Economic integration and spillover measures
6. **Innovation Capacity**: Technology adoption and R&D efficiency as possible measures of resilience

In [10]:
# Environment Setup

import sys
import warnings
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime
from typing import Dict, List, Tuple, Optional
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy import stats
from scipy.stats import zscore
import warnings

# Silence every warning for cleaner notebook output. This also hides pandas /
# NumPy deprecation and runtime warnings, so comment it out when debugging.
warnings.filterwarnings('ignore')
# Display all columns, but cap printed rows at 15 to keep cell output short.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 15)

# Load configuration
# "config" is added to the import path relative to the current working
# directory, so the notebook must be run from the project root.
sys.path.append("config")
# NOTE(review): the star import hides which names the config module provides.
# MAJOR_SHOCKS (used by later cells) is presumably defined there — confirm.
from data_collection_config import *

print("PHASE 2: ADVANCED FEATURE ENGINEERING")
print("=" * 55)
print("Building sophisticated economic indicators for modeling and analysis")

PHASE 2: ADVANCED FEATURE ENGINEERING
Building sophisticated economic indicators for modeling and analysis


In [11]:
# Load the final integrated dataset
# ==================================================


# Loads the integrated panel into the notebook-global `df` and prints a short
# overview. The outcome is recorded in `dataset_loaded`, which later cells
# check before doing any work.
print("LOADING FINAL INTEGRATED DATASET")
print("=" * 45)

try:
    # Load the final dataset
    # The path is relative to the current working directory.
    df = pd.read_csv('data/final_integrated_dataset.csv')
    print(f"   Dataset loaded successfully")
    print(f"   Shape: {df.shape}")
    print(f"   Countries: {df['country_code'].nunique()}")
    print(f"   Years: {df['year'].min()}-{df['year'].max()}")
    
    # Basic info
    print(f"\n DATASET OVERVIEW:")
    print(f"   Total observations: {len(df):,}")
    print(f"   Variables: {len(df.columns)}")
    print(f"   Numeric variables: {len(df.select_dtypes(include=[np.number]).columns)}")
    
    # Data quality check
    # Completeness = average, over numeric columns, of each column's share of
    # non-missing values (every column weighs the same).
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    overall_completeness = df[numeric_cols].notna().mean().mean()
    print(f"   Data completeness: {overall_completeness:.1%}")
    
    # Show columns
    print(f"\n AVAILABLE VARIABLES:")
    # Hard-coded list: the count printed below is the length of this list, not
    # the number of these columns actually present in `df`.
    maddison_vars = ['gdp_per_capita', 'population', 'gdp_total', 'log_gdp_per_capita', 'gdp_growth', 'population_growth_maddison']
    # Every other numeric column except 'year' is treated as a World Bank variable.
    wb_vars = [col for col in numeric_cols if col not in maddison_vars + ['year']]
    
    print(f"   Maddison variables ({len(maddison_vars)}): {maddison_vars}")
    print(f"   World Bank variables ({len(wb_vars)}): {wb_vars[:5]}...")  # Show first 5
    
    # Countries representation
    countries_found = sorted(df['country_code'].unique())
    print(f"\n COUNTRIES ({len(countries_found)}):")
    print(f"   {', '.join(countries_found)}")
    
    dataset_loaded = True
    
except Exception as e:
    # Deliberately broad: any failure (missing file, missing column, ...) is
    # reported and turned into a flag instead of aborting the notebook.
    print(f" Error loading dataset: {e}")
    dataset_loaded = False

if dataset_loaded:
    print(f"\n Ready to proceed with feature engineering!")

LOADING FINAL INTEGRATED DATASET
   Dataset loaded successfully
   Shape: (1292, 32)
   Countries: 38
   Years: 1990-2023

 DATASET OVERVIEW:
   Total observations: 1,292
   Variables: 32
   Numeric variables: 29
   Data completeness: 84.8%

 AVAILABLE VARIABLES:
   Maddison variables (6): ['gdp_per_capita', 'population', 'gdp_total', 'log_gdp_per_capita', 'gdp_growth', 'population_growth_maddison']
   World Bank variables (26): ['bank_capital_assets_ratio', 'domestic_credit_private_gdp', 'exports_gdp', 'fdi_net_inflows_gdp', 'gdp_growth_annual']...

 COUNTRIES (38):
   ARG, AUS, AUT, BEL, BRA, CAN, CHE, CHL, CHN, COL, CZE, DEU, DNK, ESP, FIN, FRA, GBR, HUN, IDN, IND, IRL, ITA, JPN, KOR, MEX, MYS, NLD, NOR, NZL, PHL, POL, PRT, RUS, SWE, THA, TUR, USA, ZAF

 Ready to proceed with feature engineering!


In [12]:
# Create economic indicators - Economic Complexity 
# ====================================================

if dataset_loaded:
    print("CREATING ECONOMIC FEATURES")
    print("=" * 45)
    
    def create_economic_complexity_features(df: pd.DataFrame) -> pd.DataFrame:
        """
        Create advanced economic measures.

        Every feature group is added only when its source columns exist in
        ``df``, so partial datasets are handled without errors. Rolling
        features are computed per ``country_code`` in ``year`` order,
        independent of the row order of the input.

        Parameters:
        -----------
        df : pd.DataFrame
            Input dataset. ``country_code`` is required for the rolling
            features and ``year`` for the relative-development feature.
            The frame is not modified. Its index must be unique, because
            rolling results are aligned back on index labels.

        Returns:
        --------
        pd.DataFrame : Copy of ``df`` with economic features added
        """

        df_complex = df.copy()

        def _standardized_index(columns, log1p_columns=()):
            """Row-wise mean of z-scored columns after median imputation."""
            data = df_complex[columns].fillna(df_complex[columns].median())
            for name in log1p_columns:
                if name in data.columns:
                    data[name] = np.log1p(data[name])
            return StandardScaler().fit_transform(data).mean(axis=1)

        def _rolling_by_country(column, window, min_periods, reducer):
            """Apply ``reducer`` to per-country rolling windows taken in year order."""
            # Stable sort: rows become chronological inside each country even if
            # the input is not pre-sorted. The result keeps the original index
            # labels, so assigning it back realigns it with ``df_complex``.
            ordered = (
                df_complex.sort_values('year', kind='mergesort')
                if 'year' in df_complex.columns else df_complex
            )
            return ordered.groupby('country_code')[column].transform(
                lambda s: reducer(s.rolling(window=window, min_periods=min_periods))
            )

        def _window_slope(values):
            """Least-squares slope over the non-missing points of one window."""
            y = np.asarray(values, dtype=float)
            valid = ~np.isnan(y)
            if valid.sum() < 3:
                return np.nan
            # Keep the original positions as x so gaps do not compress time.
            x = np.arange(len(y), dtype=float)
            return np.polyfit(x[valid], y[valid], 1)[0]

        print("  Creating economic indicators...")

        # 1. Economic Diversification Index - Diversification
        if all(col in df.columns for col in ['exports_gdp', 'imports_gdp', 'trade_gdp']):
            df_complex['trade_balance'] = df_complex['exports_gdp'] - df_complex['imports_gdp']
            df_complex['trade_openness'] = df_complex['trade_gdp'] / 100  # Normalize
            # A zero trade share would produce +/-inf; treat the ratio as undefined.
            df_complex['export_intensity'] = (
                df_complex['exports_gdp'] / df_complex['trade_gdp'].replace(0, np.nan)
            )
            print("    Trade complexity measures")

        # 2. Financial Sophistication Index
        financial_vars = ['domestic_credit_private_gdp', 'market_cap_gdp', 'fdi_net_inflows_gdp']
        available_financial = [var for var in financial_vars if var in df.columns]

        if len(available_financial) >= 2:
            df_complex['financial_development_index'] = _standardized_index(available_financial)
            print(f"    Financial sophistication index ({len(available_financial)} components)")

        # 3. Innovation Capacity Index
        innovation_vars = ['research_development_gdp', 'patent_applications_residents', 'tertiary_education_enrollment', 'internet_users_pct']
        available_innovation = [var for var in innovation_vars if var in df.columns]

        if len(available_innovation) >= 2:
            # Patent counts are highly skewed, so they are log-transformed
            # (after imputation) before standardizing.
            df_complex['innovation_capacity_index'] = _standardized_index(
                available_innovation, log1p_columns=('patent_applications_residents',)
            )
            print(f"    Innovation capacity index ({len(available_innovation)} components)")

        # 4. Fiscal Capacity Index
        fiscal_vars = ['tax_revenue_gdp', 'government_expenditure_gdp']
        available_fiscal = [var for var in fiscal_vars if var in df.columns]

        if len(available_fiscal) == 2:
            df_complex['fiscal_balance'] = df_complex['tax_revenue_gdp'] - df_complex['government_expenditure_gdp']
            df_complex['fiscal_capacity'] = (df_complex['tax_revenue_gdp'] + df_complex['government_expenditure_gdp']) / 2
            print("    Fiscal capacity measures")

        # 5. Economic Structure Sophistication
        # Investment efficiency, how the investment affects gdp_growth
        # NOTE(review): later cells use 'gdp_growth_annual'; if the dataset has no
        # 'gdp_growth' column this feature is silently skipped — confirm intent.
        if all(col in df.columns for col in ['gross_investment_gdp', 'gdp_growth']):
            df_complex['investment_efficiency'] = df_complex['gdp_growth'] / (df_complex['gross_investment_gdp'] + 0.1)  # Avoid division by zero
            print("    Investment efficiency ratio")

        # Savings-Investment gap
        if all(col in df.columns for col in ['gross_savings_gdp', 'gross_investment_gdp']):
            df_complex['savings_investment_gap'] = df_complex['gross_savings_gdp'] - df_complex['gross_investment_gdp']
            print("    Savings-investment gap")

        # 6. Economic Volatility Measures (rolling sample standard deviation)
        volatility_created = False
        for var in ['gdp_growth', 'gdp_per_capita_growth']:
            if var in df.columns:
                df_complex[f'{var}_volatility_3y'] = _rolling_by_country(
                    var, window=3, min_periods=2, reducer=lambda r: r.std()
                )
                df_complex[f'{var}_volatility_5y'] = _rolling_by_country(
                    var, window=5, min_periods=3, reducer=lambda r: r.std()
                )
                volatility_created = True

        # Only report the group when at least one volatility column exists.
        if volatility_created:
            print("    Economic volatility measures")

        # 7. Development Level Indicators
        if 'gdp_per_capita' in df.columns:

            # GDP per capita relative to the cross-country median of the same year
            yearly_median = df_complex.groupby('year')['gdp_per_capita'].transform('median')
            df_complex['gdp_per_capita_relative'] = df_complex['gdp_per_capita'] / yearly_median

            # Development trajectory (5-year growth trend)
            df_complex['gdp_per_capita_trend_5y'] = _rolling_by_country(
                'gdp_per_capita', window=5, min_periods=3,
                reducer=lambda r: r.apply(_window_slope, raw=True)
            )

            print("    Development level indicators")

        new_features = [col for col in df_complex.columns if col not in df.columns]
        print(f"\n   Created {len(new_features)} complexity features:")
        for feature in new_features:
            print(f"      • {feature}")

        return df_complex
    
    # Create economic complexity features
    # The builder works on a copy, so `df` keeps the raw loaded data and the
    # enriched frame is stored under its own name for the next cell.
    df_with_complexity = create_economic_complexity_features(df)
    print(f"\n Economic complexity features complete!")
    print(f"   Dataset shape: {df_with_complexity.shape}")

CREATING ECONOMIC FEATURES
  Creating economic indicators...
    Trade complexity measures
    Financial sophistication index (3 components)
    Innovation capacity index (4 components)
    Fiscal capacity measures
    Savings-investment gap
    Economic volatility measures
    Development level indicators

   Created 12 complexity features:
      • trade_balance
      • trade_openness
      • export_intensity
      • financial_development_index
      • innovation_capacity_index
      • fiscal_balance
      • fiscal_capacity
      • savings_investment_gap
      • gdp_per_capita_growth_volatility_3y
      • gdp_per_capita_growth_volatility_5y
      • gdp_per_capita_relative
      • gdp_per_capita_trend_5y

 Economic complexity features complete!
   Dataset shape: (1292, 44)


In [13]:
# Create shock resilience indicators
# =======================================================

if dataset_loaded:
    print(" CREATING RESILIENCE FEATURES")
    print("=" * 45)

    
    def _measure_shock_impact(country_data, shock_name, shock_info, growth_col):
        """Measure one country's drawdown and recovery around a single shock.

        Returns ``None`` when there is not enough data to establish a pre-shock
        baseline or to observe the shock years.
        """
        # Baseline: mean GDP per capita over (up to) three pre-shock years. The
        # window is never extended before 1990, the first year of the panel.
        pre_shock_years = range(max(shock_info['start'] - 3, 1990), shock_info['start'])
        pre_shock_data = country_data[country_data['year'].isin(pre_shock_years)]
        if len(pre_shock_data) < 2 or 'gdp_per_capita' not in pre_shock_data.columns:
            return None
        baseline_gdp = pre_shock_data['gdp_per_capita'].mean()

        shock_years = range(shock_info['start'], shock_info['end'] + 1)
        shock_data = country_data[country_data['year'].isin(shock_years)]
        # ``not baseline_gdp > 0`` also rejects a NaN baseline.
        if len(shock_data) == 0 or not baseline_gdp > 0:
            return None

        min_gdp_during_shock = shock_data['gdp_per_capita'].min()
        max_drawdown = (baseline_gdp - min_gdp_during_shock) / baseline_gdp

        # Recovery window: the five years after the shock, capped at 2023.
        recovery_years = range(shock_info['end'] + 1, min(shock_info['end'] + 6, 2024))
        recovery_data = country_data[country_data['year'].isin(recovery_years)]

        recovery_time = None  # stays None when GDP never regains the baseline
        recovery_strength = 0
        if len(recovery_data) > 0:
            # ``country_data`` is sorted by year, so the first match is the earliest.
            recovered = recovery_data[recovery_data['gdp_per_capita'] >= baseline_gdp]
            if len(recovered) > 0:
                recovery_time = recovered['year'].iloc[0] - shock_info['end']
            if growth_col is not None:
                recovery_strength = recovery_data[growth_col].mean()

        return {
            'shock': shock_name,
            'max_drawdown': max_drawdown,
            'recovery_time': recovery_time,
            'recovery_strength': recovery_strength,
            'baseline_gdp': baseline_gdp
        }

    def _summarize_country_resilience(country, shock_impacts):
        """Aggregate per-shock impacts into one row of country-level metrics."""
        summary = {
            'country_code': country,
            'total_shocks_experienced': 0,
            'avg_max_drawdown': 0,
            'avg_recovery_time': 0,
            'avg_recovery_strength': 0,
            'historical_resilience_score': 0,
            'historical_vulnerability_score': 0
        }
        if not shock_impacts:
            return summary

        summary['total_shocks_experienced'] = len(shock_impacts)
        summary['avg_max_drawdown'] = np.mean([s['max_drawdown'] for s in shock_impacts])

        # Shocks without a recovery inside the window are left out of the average.
        recovery_times = [s['recovery_time'] for s in shock_impacts if s['recovery_time'] is not None]
        if recovery_times:
            summary['avg_recovery_time'] = np.mean(recovery_times)

        recovery_strengths = [s['recovery_strength'] for s in shock_impacts if not np.isnan(s['recovery_strength'])]
        if recovery_strengths:
            summary['avg_recovery_strength'] = np.mean(recovery_strengths)

        # Polarity: every component is oriented so that higher = more resilient.
        # A value of exactly 0 is treated as "not measured" and skipped.
        components = []
        if summary['avg_max_drawdown'] != 0:
            # Lower drawdown = higher resilience
            components.append(1 - summary['avg_max_drawdown'])
        if summary['avg_recovery_time'] > 0:
            # Faster recovery = higher resilience (10+ years maps to 0)
            components.append(1 - min(summary['avg_recovery_time'] / 10, 1))
        if summary['avg_recovery_strength'] != 0:
            # Stronger recovery = higher resilience (5+ growth units maps to 1)
            components.append(min(max(summary['avg_recovery_strength'] / 5, 0), 1))

        if components:
            summary['historical_resilience_score'] = np.mean(components)
            summary['historical_vulnerability_score'] = 1 - summary['historical_resilience_score']
        return summary

    def create_shock_resilience_features_FIXED(df: pd.DataFrame, shocks: Optional[Dict[str, Dict[str, int]]] = None) -> pd.DataFrame:
        """
        Create shock resilience and recovery metrics - CORRECTED VERSION.

        The output has three groups of columns:
        1. Country-level historical resilience metrics (constant per country,
           provided for context rather than as modeling targets).
        2. Year-varying modeling targets (columns whose name contains 'target').
        3. Temporal dynamics: rolling stability, pre-shock vulnerability, momentum.

        Rolling and difference features are computed per country in year
        order, independent of the row order of ``df``.

        Args:
            df: Input dataframe with economic indicators. Requires
                ``country_code`` and ``year``; all other columns are optional
                and the dependent features are skipped when they are missing.
            shocks: Mapping of shock name to a dict with integer ``'start'``
                and ``'end'`` years. Defaults to the notebook-level
                ``MAJOR_SHOCKS`` configuration.

        Returns:
            DataFrame with resilience features and year-varying targets. The
            row order of ``df`` is kept, but the index is reset by the merge
            with the country-level metrics.
        """
        if shocks is None:
            # Resolved at call time so the notebook configuration is the default.
            shocks = MAJOR_SHOCKS

        df_resilient = df.copy()

        print("   Creating  shock resilience indicators...")

        # =========================================================================
        # PART 1: TRADITIONAL RESILIENCE METRICS (for context, not main targets)
        # =========================================================================

        # These metrics ignore time (one value per country), so they are kept
        # for context only and are not used as modeling targets.
        print("      Computing historical resilience context metrics...")

        # Recovery strength uses the Maddison growth column when present and
        # falls back to the World Bank series otherwise. Without the fallback
        # the strength would silently be 0 for every country.
        growth_col = next(
            (col for col in ('gdp_growth', 'gdp_growth_annual') if col in df_resilient.columns),
            None
        )

        resilience_metrics = []
        for country in df_resilient['country_code'].unique():
            country_data = df_resilient[df_resilient['country_code'] == country].sort_values('year')

            shock_impacts = []
            for shock_name, shock_info in shocks.items():
                impact = _measure_shock_impact(country_data, shock_name, shock_info, growth_col)
                if impact is not None:
                    shock_impacts.append(impact)

            resilience_metrics.append(_summarize_country_resilience(country, shock_impacts))

        resilience_df = pd.DataFrame(resilience_metrics)
        df_resilient = df_resilient.merge(resilience_df, on='country_code', how='left')

        print(f"      Historical resilience metrics calculated")

        # Chronological view for all per-country rolling / diff calculations.
        # The merge above produced a fresh unique index, so results computed
        # on this view realign with ``df_resilient`` on assignment.
        ordered = df_resilient.sort_values('year', kind='mergesort')

        def _window_cv(window):
            """Coefficient of variation of one window (population std, NaN-aware)."""
            return np.nanstd(window) / (np.abs(np.nanmean(window)) + 0.01)

        rolling_cv_cache = {}

        def _rolling_cv(column):
            """3-year rolling coefficient of variation per country, cached per column."""
            if column not in rolling_cv_cache:
                rolling_cv_cache[column] = ordered.groupby('country_code')[column].transform(
                    lambda s: s.rolling(window=3, min_periods=2).apply(_window_cv, raw=True)
                )
            return rolling_cv_cache[column]

        # =========================================================================
        # PART 2: YEAR-VARYING MODELING TARGETS (THE KEY FIX!)
        # =========================================================================

        print("      Creating year-varying modeling targets...")

        # TARGET 1: Economic Growth Stability (varies by year)
        # Inverse coefficient of variation of GDP growth (more stable = higher
        # score); the +0.01 terms keep both divisions finite.
        if 'gdp_growth_annual' in df_resilient.columns:
            df_resilient['growth_stability_target'] = 1 / (_rolling_cv('gdp_growth_annual') + 0.01)

        # TARGET 2: Economic Performance Index (varies by year)
        # Mean of within-year percentile ranks (0-1) across the indicators.
        performance_indicators = [
            'gdp_per_capita_growth', 'gross_investment_gdp', 'gross_savings_gdp'
        ]
        available_indicators = [col for col in performance_indicators if col in df_resilient.columns]

        if len(available_indicators) >= 2:
            performance_components = [
                df_resilient.groupby('year')[indicator].transform(lambda x: x.rank(pct=True))
                for indicator in available_indicators
            ]
            # A missing indicator makes the whole row NaN (plain mean, not nanmean).
            df_resilient['economic_performance_target'] = np.mean(performance_components, axis=0)

        # TARGET 3: Relative Development Position (varies by year)
        # GDP per capita relative to the cross-country median of the same year.
        if 'gdp_per_capita' in df_resilient.columns:
            yearly_median = df_resilient.groupby('year')['gdp_per_capita'].transform('median')
            df_resilient['relative_development_target'] = df_resilient['gdp_per_capita'] / yearly_median

        # TARGET 4: Investment Efficiency (varies by year)
        # GDP growth per unit of investment. The constants shift the ratio away
        # from zero but do not rule out negatives when growth is below -2.
        if all(col in df_resilient.columns for col in ['gdp_growth_annual', 'gross_investment_gdp']):
            df_resilient['investment_efficiency_target'] = (
                (df_resilient['gdp_growth_annual'] + 2) / (df_resilient['gross_investment_gdp'] + 1)
            )

        # TARGET 5: Economic Resilience Composite (varies by year)
        # Mean of stability, performance and efficiency, each on a 0-1 scale.
        target_components = []

        if 'growth_stability_target' in df_resilient.columns:
            # Scale by the 95th percentile and clip, so outliers cannot dominate.
            stability_norm = df_resilient['growth_stability_target'] / df_resilient['growth_stability_target'].quantile(0.95)
            target_components.append(stability_norm.clip(0, 1))

        if 'economic_performance_target' in df_resilient.columns:
            target_components.append(df_resilient['economic_performance_target'])

        if 'investment_efficiency_target' in df_resilient.columns:
            efficiency_norm = df_resilient['investment_efficiency_target'] / df_resilient['investment_efficiency_target'].quantile(0.95)
            target_components.append(efficiency_norm.clip(0, 1))

        if target_components:
            df_resilient['composite_resilience_target'] = np.mean(target_components, axis=0)

        print(f"      Year-varying modeling targets created")

        # =========================================================================
        # PART 3: TEMPORAL DYNAMICS
        # =========================================================================

        print("      Adding temporal dynamics...")

        # Economic stability indicators: 3-year coefficient of variation
        for var in ['gdp_growth_annual', 'gdp_per_capita_growth']:
            if var in df_resilient.columns:
                df_resilient[f'{var}_stability_3y'] = _rolling_cv(var)

        # Pre-shock vulnerability: in the year before a shock starts, the mean
        # weight of the risk flags that fire (0 when none fires; 0 in all other
        # years). Missing values never raise a flag. The column is float from
        # the start, so fractional scores are stored without a dtype change.
        pre_shock_years = {info['start'] - 1 for info in shocks.values()}
        risk_rules = (
            ('government_debt_gdp', lambda s: s > 60, 1.0),
            ('gross_savings_gdp', lambda s: s < 15, 1.0),
            ('trade_gdp', lambda s: s > 100, 0.5),
        )
        weight_sum = pd.Series(0.0, index=df_resilient.index)
        flag_count = pd.Series(0, index=df_resilient.index)
        for column, is_risky, weight in risk_rules:
            if column in df_resilient.columns:
                flagged = is_risky(df_resilient[column])
                weight_sum = weight_sum + flagged * weight
                flag_count = flag_count + flagged.astype(int)

        exposed = df_resilient['year'].isin(pre_shock_years) & (flag_count > 0)
        df_resilient['pre_shock_vulnerability'] = 0.0
        df_resilient.loc[exposed, 'pre_shock_vulnerability'] = weight_sum[exposed] / flag_count[exposed]

        # Recovery momentum: change versus two years earlier, per country
        for var in ['gdp_per_capita', 'gdp_growth_annual']:
            if var in df_resilient.columns:
                df_resilient[f'{var}_momentum'] = ordered.groupby('country_code')[var].diff(2)

        print(f"      Temporal dynamics added")

        # =========================================================================
        # PART 4: SUMMARY AND RECOMMENDATIONS
        # =========================================================================

        new_features = [col for col in df_resilient.columns if col not in df.columns]

        print(f"\n   Created {len(new_features)} resilience features:")

        # Separate targets from other features
        target_features = [f for f in new_features if 'target' in f]
        other_features = [f for f in new_features if 'target' not in f]

        print(f"      MODELING TARGETS ({len(target_features)}):")
        for feature in target_features:
            target_data = df_resilient[feature].dropna()
            if len(target_data) > 0:
                # A target "varies" when more than half of the countries show a
                # within-country standard deviation above 0.01.
                temporal_var = (
                    df_resilient.groupby('country_code')[feature]
                    .apply(lambda x: x.std() > 0.01 if len(x) > 1 else False)
                    .mean()
                )
                temp_status = "Varies" if temporal_var > 0.5 else "⚠️ Static"
                print(f"         • {feature}: Range[{target_data.min():.3f}, {target_data.max():.3f}], {temp_status}")

        # One limit drives both the listing and the "more" line so they agree.
        preview_limit = 11
        print(f"      OTHER FEATURES ({len(other_features)}):")
        for feature in other_features[:preview_limit]:
            print(f"         • {feature}")
        if len(other_features) > preview_limit:
            print(f"         ... and {len(other_features) - preview_limit} more")

        # RECOMMENDATIONS
        print(f"\n   MODELING RECOMMENDATIONS:")
        print(f"     PRIMARY TARGET: growth_stability_target")
        print(f"     suitable for ML (good variance + temporal variation)")
        print(f"     ALTERNATIVE 1: economic_performance_target") 
        print(f"     Year-wise relative performance measure")
        print(f"     ALTERNATIVE 2: composite_resilience_target")
        print(f"     Multi-dimensional resilience measure")

        return df_resilient
    
# Create resilience features
# Guarded like the other cells: when the dataset failed to load, neither the
# builder function nor `df_with_complexity` exists, and an unguarded call would
# raise NameError.
if dataset_loaded:
    df_with_resilience = create_shock_resilience_features_FIXED(df_with_complexity)
    print(f"\n Shock resilience metrics complete!")
    print(f"    Dataset shape: {df_with_resilience.shape}")

 CREATING RESILIENCE FEATURES
   Creating  shock resilience indicators...
      Computing historical resilience context metrics...
      Historical resilience metrics calculated
      Creating year-varying modeling targets...
      Year-varying modeling targets created
      Adding temporal dynamics...
      Temporal dynamics added

   Created 16 resilience features:
      MODELING TARGETS (5):
         • growth_stability_target: Range[0.005, 81.532], Varies
         • economic_performance_target: Range[0.035, 1.000], Varies
         • relative_development_target: Range[0.108, 2.643], Varies
         • investment_efficiency_target: Range[-0.750, 0.984], Varies
         • composite_resilience_target: Range[0.016, 0.974], Varies
      OTHER FEATURES (11):
         • total_shocks_experienced
         • avg_max_drawdown
         • avg_recovery_time
         • avg_recovery_strength
         • historical_resilience_score
         • historical_vulnerability_score
         • gdp_growth_annual_

In [14]:
# Create temporal dynamics and cyclical features

if dataset_loaded:
    print(" CREATING TEMPORAL & CYCLICAL FEATURES")
    print("=" * 45)
    
    def create_temporal_features(df: pd.DataFrame, shocks: Optional[Dict[str, Dict[str, int]]] = None) -> pd.DataFrame:
        """
        Create sophisticated temporal and cyclical economic features.

        Adds cycle indicators, 5-year trends and their change, 1- and 2-year
        lags, convergence measures, crisis-proximity counters and period
        dummies. Feature groups whose source column is missing are skipped.

        Args:
            df: Input dataframe. Requires ``country_code`` and ``year``. The
                frame is not modified. Its index must be unique because
                per-country results are aligned back on index labels.
            shocks: Mapping of shock name to a dict with integer ``'start'``
                and ``'end'`` years. Defaults to the notebook-level
                ``MAJOR_SHOCKS`` configuration.

        Returns:
            Copy of ``df`` sorted by ``country_code`` and ``year`` with the
            temporal features added.
        """
        if shocks is None:
            # Resolved at call time so the notebook configuration is the default.
            shocks = MAJOR_SHOCKS

        df_temporal = df.copy().sort_values(['country_code', 'year'])

        def _per_country(column, func):
            """Apply ``func`` to each country's (chronological) series of ``column``."""
            return df_temporal.groupby('country_code')[column].transform(func)

        def _window_slope(values):
            """Least-squares slope over the non-missing points of one window."""
            y = np.asarray(values, dtype=float)
            valid = ~np.isnan(y)
            if valid.sum() < 3:
                return np.nan
            # Keep the original positions as x so gaps do not compress time.
            x = np.arange(len(y), dtype=float)
            return np.polyfit(x[valid], y[valid], 1)[0]

        print("   Creating temporal dynamics...")

        # 1. Economic cycle indicators
        # NOTE(review): other cells use 'gdp_growth_annual'; if the dataset has
        # no 'gdp_growth' column this whole group is silently skipped.
        if 'gdp_growth' in df_temporal.columns:
            # NOTE(review): centered windows include following years, so these
            # averages (and the cycle position) are descriptive and leak future
            # information if used as predictors.
            df_temporal['gdp_growth_ma_3y'] = _per_country(
                'gdp_growth', lambda s: s.rolling(window=3, center=True, min_periods=2).mean()
            )
            df_temporal['gdp_growth_ma_5y'] = _per_country(
                'gdp_growth', lambda s: s.rolling(window=5, center=True, min_periods=3).mean()
            )

            # Cycle position (above/below trend)
            df_temporal['gdp_cycle_position'] = (
                df_temporal['gdp_growth'] - df_temporal['gdp_growth_ma_5y']
            )

            # Expansion/contraction phases (missing growth counts as neither)
            df_temporal['in_expansion'] = (df_temporal['gdp_growth'] > 0).astype(int)
            df_temporal['in_recession'] = (df_temporal['gdp_growth'] < -1).astype(int)  # Technical recession threshold

            print("      Economic cycle indicators")

        # 2. Trend and momentum features
        for var in ['gdp_per_capita', 'trade_gdp', 'gross_investment_gdp']:
            if var in df_temporal.columns:
                # Linear trend over a trailing 5-year window
                df_temporal[f'{var}_trend_5y'] = _per_country(
                    var,
                    lambda s: s.rolling(window=5, min_periods=3).apply(_window_slope, raw=True)
                )

                # Acceleration: year-on-year change of the trend
                df_temporal[f'{var}_acceleration'] = (
                    df_temporal.groupby('country_code')[f'{var}_trend_5y'].diff()
                )

        print("      Trend and momentum features")

        # 3. Lag features (economic conditions persistence)
        for var in ['gdp_growth', 'gross_investment_gdp', 'unemployment_total']:
            if var in df_temporal.columns:
                lagged = df_temporal.groupby('country_code')[var]
                df_temporal[f'{var}_lag1'] = lagged.shift(1)
                df_temporal[f'{var}_lag2'] = lagged.shift(2)

        print("      Lag features")

        # 4. Convergence/divergence indicators
        if 'gdp_per_capita' in df_temporal.columns:
            # Gap to the rich-country frontier: the year's 90th percentile of
            # GDP per capita divided by the country's own level (+1 guards
            # against division by zero).
            annual_top_decile = df_temporal.groupby('year')['gdp_per_capita'].quantile(0.9)
            df_temporal['convergence_gap'] = (
                df_temporal['year'].map(annual_top_decile) / (df_temporal['gdp_per_capita'] + 1)
            )

            # Convergence speed: how much the gap shrank since the previous
            # year (positive = catching up). Uses past data only; a forward
            # difference such as diff(-1) would read next year's gap.
            df_temporal['convergence_speed'] = -(
                df_temporal.groupby('country_code')['convergence_gap'].diff()
            )

            print("      Convergence indicators")

        # 5. Crisis proximity features
        # Distances depend only on the year, so they are computed once per
        # distinct year and mapped onto the rows.
        shock_starts = [info['start'] for info in shocks.values()]
        shock_ends = [info['end'] for info in shocks.values()]

        def _years_to_next(year):
            gaps = [start - year for start in shock_starts if start > year]
            return min(gaps) if gaps else np.inf

        def _years_since_last(year):
            gaps = [year - end for end in shock_ends if end < year]
            return min(gaps) if gaps else np.inf

        distinct_years = df_temporal['year'].dropna().unique()
        to_next = {year: _years_to_next(year) for year in distinct_years}
        since_last = {year: _years_since_last(year) for year in distinct_years}

        # Rows without a shock ahead/behind (or without a year) get infinity,
        # which the cap at 10 below turns into "10 or more years".
        df_temporal['years_to_next_shock'] = (
            df_temporal['year'].map(to_next).astype(float).fillna(np.inf).clip(0, 10)
        )
        df_temporal['years_since_last_shock'] = (
            df_temporal['year'].map(since_last).astype(float).fillna(np.inf).clip(0, 10)
        )

        print("      Crisis proximity features")

        # 6. Decade and period effects
        df_temporal['decade'] = (df_temporal['year'] // 10) * 10
        df_temporal['post_cold_war'] = (df_temporal['year'] >= 1991).astype(int)
        df_temporal['globalization_era'] = (df_temporal['year'] >= 1995).astype(int)
        df_temporal['post_2008_crisis'] = (df_temporal['year'] >= 2009).astype(int)
        df_temporal['post_covid'] = (df_temporal['year'] >= 2021).astype(int)

        print("      Period effect indicators")

        # Columns that already existed in ``df`` (e.g. a trend computed by an
        # earlier step) are overwritten above but not counted as new here.
        new_temporal_features = [col for col in df_temporal.columns if col not in df.columns]
        print(f"\n   Created {len(new_temporal_features)} temporal features:")
        for feature in new_temporal_features[:8]:  # Show first 8
            print(f"      • {feature}")
        if len(new_temporal_features) > 8:
            print(f"      ... and {len(new_temporal_features) - 8} more")

        return df_temporal
    
    # Create temporal features
    df_with_temporal = create_temporal_features(df_with_resilience)
    print(f"\n Temporal features complete!")
    print(f"   Dataset shape: {df_with_temporal.shape}")

 CREATING TEMPORAL & CYCLICAL FEATURES
   Creating temporal dynamics...
      Trend and momentum features
      Lag features
      Convergence indicators
      Crisis proximity features
      Period effect indicators

   Created 18 temporal features:
      • gdp_per_capita_acceleration
      • trade_gdp_trend_5y
      • trade_gdp_acceleration
      • gross_investment_gdp_trend_5y
      • gross_investment_gdp_acceleration
      • gross_investment_gdp_lag1
      • gross_investment_gdp_lag2
      • unemployment_total_lag1
      ... and 10 more

 Temporal features complete!
   Dataset shape: (1292, 78)


In [15]:

# Final feature engineering summary and save
#
# Reports how many columns feature engineering added on top of `df`, the
# overall completeness of the numeric columns, and a keyword-based breakdown
# of the columns by category. Writes the engineered dataset and a feature
# catalog to CSV under data/, and sets `engineering_success` for later cells.
if dataset_loaded:
    print(" FEATURE ENGINEERING SUMMARY & SAVE")
    print("=" * 45)

    # Calculate feature engineering impact
    original_shape = df.shape
    final_shape = df_with_temporal.shape
    features_added = final_shape[1] - original_shape[1]

    print("   FEATURE ENGINEERING IMPACT:")
    print(f"   Original dataset: {original_shape}")
    print(f"   Final dataset: {final_shape}")
    print(f"   Features added: {features_added}")
    print(f"   Feature expansion: {features_added/original_shape[1]:.1%}")

    # Data quality assessment: mean of the per-column non-null shares
    numeric_cols = df_with_temporal.select_dtypes(include=[np.number]).columns
    final_completeness = df_with_temporal[numeric_cols].notna().mean().mean()

    print("   DATA QUALITY:")
    print(f"   Total numeric features: {len(numeric_cols)}")
    print(f"   Overall completeness: {final_completeness:.1%}")

    def _columns_matching(keywords):
        """Return engineered-dataset columns whose name contains any keyword."""
        return [
            col for col in df_with_temporal.columns
            if any(keyword in col for keyword in keywords)
        ]

    # Feature categories summary.
    # Categories are plain substring matches evaluated independently, so one
    # column can be listed under several categories (and under 'Original').
    feature_categories = {
        'Original': [col for col in df.columns if col in df_with_temporal.columns],
        'Complexity': _columns_matching(['_index', 'efficiency', 'balance', 'relative']),
        'Resilience': _columns_matching(['resilience', 'vulnerability', 'drawdown', 'recovery']),
        'Temporal': _columns_matching(['trend', 'lag', 'momentum', 'cycle', 'ma_']),
        'Volatility': _columns_matching(['volatility', 'stability']),
        'Period_Effects': _columns_matching(['post_', 'era', 'decade', 'years_']),
    }

    print("\n  FEATURE CATEGORIES:")
    for category, features in feature_categories.items():
        print(f"   {category}: {len(features)} features")

    # Save the engineered dataset
    df_with_temporal.to_csv('data/engineered_features_dataset.csv', index=False)
    print("\n SAVED: data/engineered_features_dataset.csv")

    # Create feature catalog: one row per (category, feature) pair
    feature_catalog = [
        {
            'feature_name': feature,
            'category': category,
            'data_type': str(df_with_temporal[feature].dtype),
            'missing_pct': df_with_temporal[feature].isna().mean() * 100,
            'description': f"{category} feature: {feature}",
        }
        for category, features in feature_categories.items()
        for feature in features
    ]

    feature_catalog_df = pd.DataFrame(feature_catalog)
    feature_catalog_df.to_csv('data/feature_catalog.csv', index=False)
    print(" SAVED: data/feature_catalog.csv")

    # Show top features by completeness
    completeness_by_feature = (
        df_with_temporal[numeric_cols].notna().mean().sort_values(ascending=False)
    )

    print("\n TOP 10 FEATURES BY COMPLETENESS:")
    for feature, completeness in completeness_by_feature.head(10).items():
        print(f"   {feature}: {completeness:.1%}")

    print("\n  FEATURES NEEDING ATTENTION (high missing data):")
    # Sort ascending so that, when the list is truncated to five entries, the
    # features with the MOST missing data are the ones reported.
    low_completeness = (
        completeness_by_feature[completeness_by_feature < 0.5].sort_values()
    )
    if len(low_completeness) > 0:
        for feature, completeness in low_completeness.head(5).items():
            print(f"   {feature}: {completeness:.1%}")
    else:
        print("   None - all features have >50% completeness!")

    print("\n  READY FOR NEXT PHASES : EDA & ADVANCED MODELING!")
    print("   Engineered dataset ready for ML pipeline")
    print(f"   {len(numeric_cols)} features for sophisticated modeling")
    print("   Rich feature set across all economic dimensions")
    print("\n  CORRECTED TARGET VARIABLES FOR MODELING:")
    print("   PRIMARY: growth_stability_target")
    print("      → Year-varying economic stability measure")
    print("      → Higher values = more stable growth patterns")
    print("   SECONDARY: economic_performance_target")
    print("      → Relative performance vs. global peers each year")
    print("   COMPOSITE: composite_resilience_target")
    print("      → Multi-dimensional resilience score")
    print("   AVOID: resilience_score, vulnerability_score")
    print("      → Static country-level metrics (not suitable for ML)")
    engineering_success = True

    # Only announce completion when the phase actually ran.
    print("\n PHASE 2 COMPLETE!")

else:
    engineering_success = False
    print(" Feature engineering skipped due to data loading issues")

 FEATURE ENGINEERING SUMMARY & SAVE
   FEATURE ENGINEERING IMPACT:
   Original dataset: (1292, 32)
   Final dataset: (1292, 78)
   Features added: 46
   Feature expansion: 143.8%
   DATA QUALITY:
   Total numeric features: 75
   Overall completeness: 91.7%

  FEATURE CATEGORIES:
   Original: 32 features
   Complexity: 7 features
   Resilience: 7 features
   Temporal: 9 features
   Volatility: 5 features
   Period_Effects: 12 features

 SAVED: data/engineered_features_dataset.csv
 SAVED: data/feature_catalog.csv

 TOP 10 FEATURES BY COMPLETENESS:
   year: 100.0%
   historical_vulnerability_score: 100.0%
   innovation_capacity_index: 100.0%
   total_shocks_experienced: 100.0%
   avg_max_drawdown: 100.0%
   avg_recovery_time: 100.0%
   avg_recovery_strength: 100.0%
   historical_resilience_score: 100.0%
   pre_shock_vulnerability: 100.0%
   urban_population_pct: 100.0%

  FEATURES NEEDING ATTENTION (high missing data):
   bank_capital_assets_ratio: 43.6%
   private_investment_gdp: 24.8%
 

In [16]:
# Target Variable Analysis - Verify the fixes worked
#
# For every column whose name contains 'target', prints summary statistics,
# the share of countries whose values vary over time, and the coefficient of
# variation of the per-country means, then rates the column's suitability as
# a modeling target using fixed thresholds.

if dataset_loaded and engineering_success:
    print(" TARGET VARIABLE ANALYSIS")
    print("=" * 50)

    target_vars = [col for col in df_with_temporal.columns if 'target' in col]

    print(f" AVAILABLE TARGETS ({len(target_vars)}):")

    for target in target_vars:
        target_data = df_with_temporal[target].dropna()

        # Nothing to report for a target with no observed values.
        if len(target_data) == 0:
            continue

        # Basic statistics.
        # Named `target_stats` (not `stats`) so the notebook-level
        # `from scipy import stats` import is not overwritten.
        target_stats = {
            'count': len(target_data),
            'mean': target_data.mean(),
            'std': target_data.std(),
            'min': target_data.min(),
            'max': target_data.max(),
            'range': target_data.max() - target_data.min()
        }

        # Temporal variation check: fraction of countries whose within-country
        # standard deviation exceeds 0.01 (single-row groups count as False).
        temporal_variation = (
            df_with_temporal.groupby('country_code')[target]
            .apply(lambda x: x.std() > 0.01 if len(x) > 1 else False)
            .mean()
        )

        # Cross-country variation check: CV of the per-country means.
        # abs() keeps the denominator positive for targets whose mean is
        # negative; the +0.01 offset guards against division by zero.
        country_means = df_with_temporal.groupby('country_code')[target].mean()
        cross_country_var = country_means.std() / (abs(country_means.mean()) + 0.01)

        print(f"\n   {target.upper()}:")
        print(f"      Statistics: μ={target_stats['mean']:.3f}, σ={target_stats['std']:.3f}, range=[{target_stats['min']:.3f}, {target_stats['max']:.3f}]")
        print(f"      Temporal variation: {temporal_variation:.1%} of countries show time variation")
        print(f"      Cross-country CV: {cross_country_var:.3f}")

        # Suitability assessment
        if temporal_variation > 0.3 and cross_country_var > 0.1 and target_stats['range'] > 0.1:
            suitability = " EXCELLENT for ML modeling"
        elif temporal_variation > 0.2 and cross_country_var > 0.05:
            suitability = " GOOD for ML modeling"
        else:
            suitability = " NOT SUITABLE for ML modeling"

        print(f"      Modeling suitability: {suitability}")

    print("\n TARGET ANALYSIS COMPLETE!")
    print("   Use 'growth_stability_target' as primary y variable")
    print("   Expect positive R² scores with proper temporal patterns")

 TARGET VARIABLE ANALYSIS
 AVAILABLE TARGETS (5):

   GROWTH_STABILITY_TARGET:
      Statistics: μ=4.658, σ=7.407, range=[0.005, 81.532]
      Temporal variation: 100.0% of countries show time variation
      Cross-country CV: 0.649
      Modeling suitability:  EXCELLENT for ML modeling

   ECONOMIC_PERFORMANCE_TARGET:
      Statistics: μ=0.513, σ=0.226, range=[0.035, 1.000]
      Temporal variation: 100.0% of countries show time variation
      Cross-country CV: 0.347
      Modeling suitability:  EXCELLENT for ML modeling

   RELATIVE_DEVELOPMENT_TARGET:
      Statistics: μ=0.961, σ=0.521, range=[0.108, 2.643]
      Temporal variation: 100.0% of countries show time variation
      Cross-country CV: 0.532
      Modeling suitability:  EXCELLENT for ML modeling

   INVESTMENT_EFFICIENCY_TARGET:
      Statistics: μ=0.191, σ=0.144, range=[-0.750, 0.984]
      Temporal variation: 100.0% of countries show time variation
      Cross-country CV: 0.245
      Modeling suitability:  EXCELLENT for