Market Efficiency Analysis for Prediction Markets
----------------------------------------------
This notebook analyzes the efficiency of prediction markets using various statistical tests to evaluate
whether these markets follow the "wisdom of crowds" hypothesis.
## 1. Introduction & Research Questions

This analysis aims to answer the following research questions:
1. Do prediction markets on Polymarket exhibit weak-form efficiency?
2. How does efficiency vary across different market types and contexts?
3. Does efficiency change over a market's lifecycle?
4. Can one market predict price movements in related markets?

Efficient markets should have the following characteristics:
- Non-stationary price series (random walk)
- Stationary return series
- No significant autocorrelation in returns
- No significant predictability through AR models
- Variance ratios close to 1



## 2. Setup & Data Loading

In [None]:
# Import statements and setup
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import acf, pacf, adfuller
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.stattools import grangercausalitytests
import warnings
from tqdm.auto import tqdm
import json
from typing import Dict, List, Optional, Union, Tuple

# Suppress warnings for cleaner output.
# NOTE: 'ignore' silences every warning category process-wide, including
# deprecation and numerical (divide-by-zero / invalid value) warnings.
warnings.filterwarnings('ignore')

# Set plotting style (matplotlib style sheet + seaborn default colour palette)
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("viridis")

# Add the src directory to the path if it isn't already there.
# NOTE(review): the import below is rooted at `src.`, which resolves from the
# directory that *contains* `src`, not from `../src` itself -- confirm this
# path entry is what the notebook actually relies on.
if '../src' not in sys.path:
    sys.path.append('../src')

# Import project utility functions (data loading helpers)
from src.utils.data_loader import load_main_dataset, load_trade_data, get_sample_market_ids, load_market_question_mapping

# Create output directory for saving results (no error if it already exists)
results_dir = 'results/knowledge_value/efficiency'
os.makedirs(results_dir, exist_ok=True)

# Module-level cache of preprocessed market data so repeated calls do not
# reload and resample the same trades; filled by preprocess_market_data,
# which keys entries on (str(market_id), resample).
market_data_cache = {}

## 3. Load and Explore the Dataset

In [None]:
# Load main dataset more efficiently
def load_data(data_path='data/cleaned_election_data.csv', verbose=True):
    """
    Load the main dataset and build the market-ID -> question mapping.

    Parameters:
    -----------
    data_path : str
        Path handed to ``load_main_dataset``.
    verbose : bool
        Whether to print progress messages.

    Returns:
    --------
    tuple
        (main_df, id_column, market_questions) where ``id_column`` is the
        name of the column used as market identifier and ``market_questions``
        maps ``str(market id)`` to the market question (``{}`` when no
        mapping could be built).
    """
    if verbose:
        print("Loading main dataset...")

    main_df = load_main_dataset(data_path)

    # Prefer the well-known identifier columns; otherwise fall back to the
    # first column of the dataset.
    id_column = next(
        (col for col in ('market_id', 'id') if col in main_df.columns),
        None,
    )
    if id_column is None:
        id_column = main_df.columns[0]
    if verbose:
        print(f"Using {id_column} as market identifier column")

    if 'question' in main_df.columns:
        # Build the mapping straight from the dataset.
        market_questions = dict(zip(main_df[id_column].astype(str), main_df['question']))
        if verbose:
            print(f"Created mapping for {len(market_questions)} markets")
    else:
        # No question column: fall back to the mapping file. This stays
        # best-effort, but no longer swallows KeyboardInterrupt/SystemExit
        # the way a bare ``except:`` did.
        try:
            market_questions = load_market_question_mapping()
        except Exception:
            market_questions = {}
            if verbose:
                print("Could not create market question mapping")
        else:
            if verbose:
                print(f"Loaded mapping from file with {len(market_questions)} markets")

    return main_df, id_column, market_questions

# Load the dataset once at notebook scope. The resulting globals are read by
# the analysis code below (analyze_multiple_markets declares
# `global main_df, id_column` and also reads `market_questions`).
main_df, id_column, market_questions = load_data()

### 3.1 Select Markets for Analysis

In [None]:
# Rank markets by traded volume when that column exists, otherwise by the
# identifier column; highest values first, one entry per market ID.
sort_column = id_column
if 'volumeNum' in main_df.columns:
    sort_column = 'volumeNum'
sample_markets = (
    main_df
    .sort_values(by=sort_column, ascending=False)[id_column]
    .unique()
)

# Keep only the leading markets for the initial pass
# (adjust the cut-off to your computational resources).
analysis_markets = sample_markets[:10]
print(f"\nSelected {len(analysis_markets)} markets for analysis")

## 4. Efficiency Tests
### 4.1 Market Data Preprocessing


In [None]:
def preprocess_market_data(market_id, trades_dir='data/trades', resample='1min', verbose=False):
    """
    Convert raw trade data to a regular time series of prices and log returns.

    Trades are sorted by timestamp, the last trade price of every ``resample``
    bucket is taken, empty buckets are forward-filled, and log returns are
    computed between consecutive buckets. Successful results are stored in
    the module-level ``market_data_cache`` keyed on
    ``(str(market_id), resample)``; failures are not cached.

    Parameters:
    -----------
    market_id : str or int
        The ID of the market to analyze
    trades_dir : str
        Path to the trades directory
    resample : str
        Frequency to resample the time series (default: '1min')
    verbose : bool
        Whether to print detailed output

    Returns:
    --------
    pd.DataFrame
        DataFrame indexed by timestamp with columns: price, log_return
        (only rows with a finite log return), or None if fewer than 30
        trades are available, no price column exists, or processing fails.
        The returned frame is the cached object itself -- callers should not
        mutate it in place.
    """
    # Check cache first
    cache_key = (str(market_id), resample)
    if cache_key in market_data_cache:
        return market_data_cache[cache_key]

    # Load trade data
    trades_df = load_trade_data(market_id, trades_dir=trades_dir)

    if trades_df is None or len(trades_df) < 30:
        if verbose:
            print(f"Insufficient trade data for market {market_id}")
        return None

    try:
        # Ensure timestamp is datetime.
        # NOTE(review): numeric epoch values would be read as nanoseconds by
        # pd.to_datetime -- confirm the unit the loader delivers.
        if not pd.api.types.is_datetime64_any_dtype(trades_df['timestamp']):
            trades_df['timestamp'] = pd.to_datetime(trades_df['timestamp'])

        # Stable sort: trades sharing a timestamp keep their original order,
        # so ``.last()`` below picks a deterministic trade for each bucket.
        trades_df = trades_df.sort_values('timestamp', kind='mergesort')

        # Find price column
        price_col = next(
            (col for col in ('price', 'price_num') if col in trades_df.columns),
            None,
        )
        if price_col is None:
            if verbose:
                print(f"No price column found for market {market_id}")
            return None

        # Ensure price is numeric (unparseable entries become NaN)
        trades_df[price_col] = pd.to_numeric(trades_df[price_col], errors='coerce')

        if verbose:
            print(f"Processing market {market_id}")
            print(f"Total trades: {len(trades_df)}")
            print(f"Time span: {trades_df['timestamp'].min()} to {trades_df['timestamp'].max()}")

        # Drop NaN prices and set timestamp as index
        trades_df = trades_df.dropna(subset=[price_col])
        trades_df = trades_df.set_index('timestamp')

        # Last traded price per bucket; buckets without trades carry the
        # previous price forward.
        price_series = trades_df[price_col].resample(resample).last()
        price_series = price_series.ffill()

        # Calculate log returns
        log_returns = np.log(price_series / price_series.shift(1))

        # A price of zero makes the ratio 0 or inf, whose log is -inf/+inf.
        # dropna() does not remove infinities, and a single one breaks every
        # downstream statistic, so treat them as missing.
        log_returns = log_returns.replace([np.inf, -np.inf], np.nan)

        # Create result DataFrame and drop rows without a usable return
        result_df = pd.DataFrame({
            'price': price_series,
            'log_return': log_returns
        }).dropna()

        # Cache the result
        market_data_cache[cache_key] = result_df

        return result_df

    except Exception as e:
        # Deliberately best-effort: one malformed market must not stop a
        # batch run. The reason is only reported in verbose mode.
        if verbose:
            print(f"Error processing market {market_id}: {str(e)}")
        return None


### 4.2 Define Efficiency Tests

In [None]:
def _two_sided_normal_pvalue(z_stat):
    """Return the two-sided p-value of a standard-normal test statistic."""
    from math import erfc, sqrt

    # P(|Z| >= |z|) = erfc(|z| / sqrt(2)); the result always lies in [0, 1].
    return erfc(abs(float(z_stat)) / sqrt(2.0))


def _adf_summary(series):
    """Run an Augmented Dickey-Fuller test on *series* and summarise it as a dict."""
    adf = adfuller(series.dropna())
    p_value = float(adf[1])
    return {
        'adf_statistic': float(adf[0]),
        'pvalue': p_value,
        'critical_values': adf[4],
        'is_stationary': p_value < 0.05,
    }


def _autocorrelation_summary(returns, nlags=10):
    """List the lags (>= 1) whose sample ACF leaves the +/-1.96/sqrt(n) band."""
    acf_values = acf(returns, nlags=nlags, fft=True)
    band = 1.96 / np.sqrt(len(returns))
    significant_lags = [
        lag for lag, value in enumerate(acf_values[1:], start=1)
        if abs(value) > band
    ]
    return {
        'acf_values': acf_values.tolist(),
        'significant_lags': significant_lags,
        'has_significant_autocorrelation': len(significant_lags) > 0,
    }


def _runs_test(returns):
    """Runs test on the sign sequence ``returns > 0`` (normal approximation)."""
    signs = (returns > 0).to_numpy()
    n = len(signs)

    # A new run starts wherever a value differs from its predecessor.
    runs = 1 + int(np.count_nonzero(signs[1:] != signs[:-1]))

    # Plain Python ints: the variance numerator grows like n**4 and would
    # overflow 64-bit NumPy integers on long series.
    n1 = int(signs.sum())
    n0 = n - n1
    if n0 == 0 or n1 == 0:
        # Only one kind of sign present: the test is undefined.
        return {'runs': runs, 'is_random': False}

    expected_runs = 1 + 2 * n1 * n0 / n
    variance = 2 * n1 * n0 * (2 * n1 * n0 - n) / (n ** 2 * (n - 1))
    if variance <= 0:
        return {'runs': runs, 'is_random': False}

    z_stat = (runs - expected_runs) / variance ** 0.5
    p_value = _two_sided_normal_pvalue(z_stat)
    return {
        'runs': runs,
        'expected_runs': expected_runs,
        'z_statistic': z_stat,
        'p_value': p_value,
        'is_random': p_value >= 0.05,
    }


def _variance_ratio_tests(returns, periods=(5, 15, 30)):
    """Variance ratios of overlapping *period*-step returns against 1-step returns."""
    vr_results = {}
    n = len(returns)
    base_var = float(returns.var())
    if not np.isfinite(base_var) or base_var <= 0:
        # Constant returns: every ratio would be 0/0.
        return vr_results

    for period in periods:
        if n < period * 10:
            continue

        # Overlapping aggregated returns
        agg_returns = returns.rolling(window=period).sum().dropna()
        if len(agg_returns) <= 1:
            continue

        var_ratio = float(agg_returns.var()) / (period * base_var)

        # Asymptotic standard error of the overlapping variance ratio
        std_error = float(np.sqrt(2 * (2 * period - 1) * (period - 1) / (3 * period * n)))
        z_stat = (var_ratio - 1) / std_error
        p_value = _two_sided_normal_pvalue(z_stat)

        vr_results[f"{period}min"] = {
            'variance_ratio': var_ratio,
            'z_statistic': z_stat,
            'p_value': p_value,
            'significant': p_value < 0.05,
            'interpretation': 'Mean Reversion' if var_ratio < 1 else 'Momentum' if var_ratio > 1 else 'Random Walk'
        }

    return vr_results


def _ar1_fit(returns):
    """Fit an AR(1) model; return (lag-1 coefficient, its p-value, fitted model)."""
    fit = AutoReg(returns, lags=1).fit()

    # Positional access on plain arrays: integer indexing of the
    # label-indexed params Series is deprecated in pandas.
    params = np.asarray(fit.params)
    pvalues = np.asarray(fit.pvalues)
    coef = float(params[1]) if len(params) > 1 else 0
    p_value = float(pvalues[1]) if len(pvalues) > 1 else 1
    return coef, p_value, fit


def _time_varying_summary(returns):
    """Compare autocorrelation, AR(1) and volatility across thirds of the sample."""
    third = len(returns) // 3
    segments = {
        'early': returns.iloc[:third],
        'middle': returns.iloc[third:2 * third],
        'late': returns.iloc[2 * third:],
    }

    period_results = {}
    for name, segment in segments.items():
        if len(segment) < 30:
            continue

        # Test for autocorrelation
        period_acf = acf(segment, nlags=5, fft=True)
        band = 1.96 / np.sqrt(len(segment))
        has_acf = bool(any(abs(value) > band for value in period_acf[1:]))

        # AR model for the segment; a failed fit counts as "not significant"
        try:
            ar_coef, ar_pval, _ = _ar1_fit(segment)
            ar_significant = ar_pval < 0.05
        except Exception:
            ar_coef = None
            ar_pval = None
            ar_significant = False

        period_results[name] = {
            'significant_acf': has_acf,
            'ar_coefficient': ar_coef,
            'ar_pvalue': ar_pval,
            'ar_significant': ar_significant,
            'volatility': float(segment.std()),
            'sample_size': len(segment)
        }

    # Compare early vs late periods
    if 'early' in period_results and 'late' in period_results:
        early = period_results['early']
        late = period_results['late']

        became_efficient = early['ar_significant'] and not late['ar_significant']
        became_inefficient = not early['ar_significant'] and late['ar_significant']

        efficiency_change = 'No Change'
        if became_efficient:
            efficiency_change = 'More Efficient'
        elif became_inefficient:
            efficiency_change = 'Less Efficient'

        volatility_ratio = late['volatility'] / early['volatility'] if early['volatility'] > 0 else 1

        period_results['comparison'] = {
            'efficiency_change': efficiency_change,
            'volatility_ratio': volatility_ratio,
            'early_more_inefficient': became_efficient,
            'late_more_inefficient': became_inefficient
        }

    return period_results


def run_efficiency_tests(market_data, verbose=False):
    """
    Run a complete suite of market efficiency tests on processed data.

    Each test group runs independently: if one fails, its key is simply
    missing from the result and (in verbose mode) the error is printed.
    Values are stored as built-in Python types where possible so the result
    can be serialised.

    Parameters:
    -----------
    market_data : pd.DataFrame
        DataFrame with price and log_return columns
    verbose : bool
        Whether to print detailed output

    Returns:
    --------
    dict
        Dictionary with test results under the keys 'adf_price',
        'adf_return', 'autocorrelation', 'runs_test', 'variance_ratio',
        'ar_model' and (for at least 90 observations) 'time_varying';
        None if market_data is None or has fewer than 30 rows.
    """
    if market_data is None or len(market_data) < 30:
        return None

    results = {}

    # ADF tests: prices (random-walk check) first, then returns
    try:
        results['adf_price'] = _adf_summary(market_data['price'])
        results['adf_return'] = _adf_summary(market_data['log_return'])
    except Exception as e:
        if verbose:
            print(f"Error in ADF tests: {str(e)}")

    # Autocorrelation tests
    try:
        results['autocorrelation'] = _autocorrelation_summary(market_data['log_return'])
    except Exception as e:
        if verbose:
            print(f"Error in autocorrelation tests: {str(e)}")

    # Runs test
    # NOTE(review): zero returns are counted as "not positive"; on a
    # forward-filled price series these may dominate -- confirm this is the
    # intended treatment.
    try:
        results['runs_test'] = _runs_test(market_data['log_return'])
    except Exception as e:
        if verbose:
            print(f"Error in runs test: {str(e)}")

    # Variance ratio test
    try:
        results['variance_ratio'] = _variance_ratio_tests(market_data['log_return'])
    except Exception as e:
        if verbose:
            print(f"Error in variance ratio test: {str(e)}")

    # AR model
    try:
        returns = market_data['log_return']
        if len(returns) > 5:
            coef, p_value, model_fit = _ar1_fit(returns)
            results['ar_model'] = {
                'ar_coefficient': coef,
                'p_value': p_value,
                'significant': p_value < 0.05,
                'aic': float(model_fit.aic),
                'bic': float(model_fit.bic)
            }
    except Exception as e:
        if verbose:
            print(f"Error in AR model: {str(e)}")

    # Time-varying efficiency (needs at least 30 observations per third)
    try:
        if len(market_data) >= 90:
            results['time_varying'] = _time_varying_summary(market_data['log_return'])
    except Exception as e:
        if verbose:
            print(f"Error in time-varying efficiency analysis: {str(e)}")

    return results

def calculate_efficiency_score(test_results):
    """
    Condense the efficiency test results into a single 0-100 score.

    Every test present in ``test_results`` is worth 20 points; the score is
    the share of available points that were earned, as a percentage. Tests
    that are absent are left out of the denominator. Returns 0 when there
    are no results or none of the scored tests is present.
    """
    if not test_results:
        return 0

    points_per_test = 20

    # (result key, predicate telling whether the test indicates efficiency)
    criteria = (
        # Non-stationary price series (random walk)
        ('adf_price', lambda outcome: not outcome['is_stationary']),
        # Stationary returns
        ('adf_return', lambda outcome: outcome['is_stationary']),
        # No significant autocorrelation
        ('autocorrelation', lambda outcome: not outcome['has_significant_autocorrelation']),
        # Random runs test
        ('runs_test', lambda outcome: outcome.get('is_random', False)),
        # No significant AR model
        ('ar_model', lambda outcome: not outcome.get('significant', True)),
    )

    earned = 0
    available = 0
    for key, indicates_efficiency in criteria:
        if key not in test_results:
            continue
        available += points_per_test
        if indicates_efficiency(test_results[key]):
            earned += points_per_test

    if available <= 0:
        return 0
    return (earned / available) * 100

## 5. Market Analysis
### 5.1 Single Market

In [None]:
def get_market_by_id_or_name(market_id_or_name, main_df, id_column, market_questions):
    """
    Find a market by its ID or name.

    Lookup order:
    1. typed ID match (numbers as given, all-digit strings converted to int),
    2. ID match after converting both sides to strings,
    3. case-insensitive literal substring match on the 'question' column,
    4. case-insensitive substring match on the ``market_questions`` values.

    Parameters:
    -----------
    market_id_or_name : str or int
        Market ID or partial name to search for
    main_df : pd.DataFrame
        Dataset with one row per market
    id_column : str
        Name of the identifier column in ``main_df``
    market_questions : dict or None
        Mapping of market ID to question text

    Returns:
    --------
    tuple
        (market_id, market_info) where market_info is the first matching row
        of ``main_df``, or (None, None) if not found
    """
    needle = str(market_id_or_name)
    if not needle.strip():
        # An empty search text would "match" every question.
        return None, None

    # 1. Typed ID match. Only strings have string methods; ints, floats and
    #    NumPy scalars are compared as they are.
    if isinstance(market_id_or_name, str):
        typed_id = int(needle) if needle.isdecimal() else None
    else:
        typed_id = market_id_or_name

    if typed_id is not None:
        market_rows = main_df[main_df[id_column] == typed_id]
        if not market_rows.empty:
            return typed_id, market_rows.iloc[0]

    # 2. String ID match
    ids_as_text = main_df[id_column].astype(str)
    market_rows = main_df[ids_as_text == needle]
    if not market_rows.empty:
        return market_id_or_name, market_rows.iloc[0]

    # 3. Partial name match. regex=False: the search text is user input and
    #    questions routinely contain '?', '(' and similar characters.
    if 'question' in main_df.columns:
        name_matches = main_df[
            main_df['question'].str.contains(needle, case=False, na=False, regex=False)
        ]
        if not name_matches.empty:
            match = name_matches.iloc[0]
            return match[id_column], match

    # 4. Partial match in market_questions values (non-text values, such as
    #    NaN, are skipped)
    lowered = needle.lower()
    for mapped_id, question in (market_questions or {}).items():
        if isinstance(question, str) and lowered in question.lower():
            market_rows = main_df[ids_as_text == str(mapped_id)]
            if not market_rows.empty:
                return mapped_id, market_rows.iloc[0]

    return None, None

def analyze_specific_market(market_id_or_name, main_df=None, id_column=None, market_questions=None):
    """
    Analyze a specific market identified by ID or name.

    Looks the market up, prints its metadata, preprocesses its trades, runs
    the efficiency tests, prints a pass/fail summary and shows the plots.

    Parameters:
    -----------
    market_id_or_name : str or int
        Market ID or name to search for
    main_df : pd.DataFrame, optional
        Market dataset. If None, the dataset, ID column and question mapping
        are all loaded with ``load_data`` (any values passed for the other
        two arguments are then replaced).
    id_column : str, optional
        Identifier column of ``main_df``. If omitted while ``main_df`` is
        given, 'market_id' or 'id' is used when present, otherwise the first
        column.
    market_questions : dict, optional
        Mapping of market ID to question; treated as empty when omitted.

    Returns:
    --------
    dict
        Market analysis results (ID, name, test results, score, class and
        available market attributes), or None if the market is not found or
        cannot be processed
    """
    if main_df is None:
        main_df, id_column, market_questions = load_data(verbose=False)
    else:
        # A caller may pass only the DataFrame; fill in the rest instead of
        # failing later on None.
        if id_column is None:
            id_column = next(
                (col for col in ('market_id', 'id') if col in main_df.columns),
                main_df.columns[0],
            )
        if market_questions is None:
            market_questions = {}

    # Find the market
    market_id, market_info = get_market_by_id_or_name(market_id_or_name, main_df, id_column, market_questions)

    if market_id is None:
        print(f"No market found matching '{market_id_or_name}'")
        return None

    # Display market information
    print("\n🔍 Market Information")
    print("-" * 50)
    print(f"Market ID: {market_id}")

    # Get the market question (from the row if present, else from the mapping)
    if 'question' in market_info:
        market_name = market_info['question']
    else:
        market_name = market_questions.get(str(market_id), f"Market {market_id}")

    print(f"Market Question: {market_name}")

    # Optional attributes: (column, label shown to the user)
    attribute_labels = [
        ('event_electionType', 'Election Type'),
        ('event_country', 'Country'),
        ('volumeNum', 'Trading Volume'),
        ('market_duration_days', 'Market Duration (days)')
    ]
    available_attributes = {
        col: market_info[col]
        for col, _ in attribute_labels
        if col in market_info and not pd.isna(market_info[col])
    }

    # Display additional information if available
    for col, label in attribute_labels:
        if col in available_attributes:
            print(f"{label}: {available_attributes[col]}")

    # Process the market data
    print("\nProcessing market data...")
    market_data = preprocess_market_data(market_id, verbose=True)

    if market_data is None:
        print("❌ Failed to process market data")
        return None

    print(f"✅ Successfully processed market data with {len(market_data)} time points")

    # Run efficiency tests
    print("\nRunning market efficiency tests...")
    test_results = run_efficiency_tests(market_data, verbose=True)

    if test_results is None:
        print("❌ Failed to run efficiency tests")
        return None

    # Calculate efficiency score
    efficiency_score = calculate_efficiency_score(test_results)

    # Determine efficiency class
    if efficiency_score >= 80:
        efficiency_class = 'Highly Efficient'
    elif efficiency_score >= 60:
        efficiency_class = 'Moderately Efficient'
    elif efficiency_score >= 40:
        efficiency_class = 'Slightly Inefficient'
    else:
        efficiency_class = 'Highly Inefficient'

    print(f"\n📊 Market Efficiency Score: {efficiency_score:.2f}/100")
    print(f"📈 Efficiency Classification: {efficiency_class}")

    # Print detailed test results
    print("\n🔬 Detailed Test Results:")

    if 'adf_price' in test_results:
        is_random_walk = not test_results['adf_price']['is_stationary']
        print(f"Random Walk Test (Non-stationary prices): {'✅ Pass' if is_random_walk else '❌ Fail'}")

    if 'adf_return' in test_results:
        is_return_stationary = test_results['adf_return']['is_stationary']
        print(f"Return Stationarity Test: {'✅ Pass' if is_return_stationary else '❌ Fail'}")

    if 'autocorrelation' in test_results:
        no_autocorr = not test_results['autocorrelation']['has_significant_autocorrelation']
        print(f"No Significant Autocorrelation: {'✅ Pass' if no_autocorr else '❌ Fail'}")

    if 'runs_test' in test_results:
        is_random = test_results['runs_test'].get('is_random', False)
        print(f"Runs Test (Randomness): {'✅ Pass' if is_random else '❌ Fail'}")

    if 'ar_model' in test_results:
        no_ar = not test_results['ar_model'].get('significant', True)
        print(f"No Significant AR Model: {'✅ Pass' if no_ar else '❌ Fail'}")

    # Create visualizations
    print("\nGenerating visualizations...")
    visualize_market(market_data, market_name, test_results)

    # Prepare final results
    results = {
        'market_id': market_id,
        'market_name': market_name,
        'test_results': test_results,
        'efficiency_score': efficiency_score,
        'efficiency_class': efficiency_class
    }

    # Add market attributes
    results.update(available_attributes)

    return results

def visualize_market(market_data, market_name, test_results=None):
    """
    Plot an overview of one market.

    Shows a 2x2 figure (price series, log returns, autocorrelation bars with
    the +/-1.96/sqrt(n) band, price histogram). When ``test_results`` holds a
    time-varying comparison, a second figure with the return volatility of
    the early, middle and late period is shown as well.
    """
    fig, axs = plt.subplots(2, 2, figsize=(16, 12))
    price_ax, return_ax = axs[0, 0], axs[0, 1]
    acf_ax, hist_ax = axs[1, 0], axs[1, 1]

    # Top-left: price series
    price_ax.plot(market_data.index, market_data['price'], linewidth=2)
    price_ax.set_title(f'Price Series: {market_name}', fontsize=14)
    price_ax.set_xlabel('Date', fontsize=12)
    price_ax.set_ylabel('Price', fontsize=12)
    price_ax.grid(True, alpha=0.3)

    # Top-right: log returns
    return_ax.plot(market_data.index, market_data['log_return'], linewidth=1, color='green')
    return_ax.set_title(f'Log Returns: {market_name}', fontsize=14)
    return_ax.set_xlabel('Date', fontsize=12)
    return_ax.set_ylabel('Log Return', fontsize=12)
    return_ax.grid(True, alpha=0.3)

    # Bottom-left: autocorrelation function, if it was computed
    if test_results and 'autocorrelation' in test_results:
        autocorr = test_results['autocorrelation']
        acf_values = autocorr['acf_values']
        significant = autocorr['has_significant_autocorrelation']

        acf_ax.bar(range(len(acf_values)), acf_values, width=0.3)

        # Zero line plus the band used for the significance decision
        ci = 1.96 / np.sqrt(len(market_data))
        acf_ax.axhline(y=0, linestyle='-', color='black', linewidth=1)
        for bound in (ci, -ci):
            acf_ax.axhline(y=bound, linestyle='--', color='red', linewidth=1, alpha=0.7)

        verdict = "❌ Significant" if significant else "✅ Not Significant"
        acf_ax.set_title(f'Autocorrelation Function: {verdict}', fontsize=14)
        acf_ax.set_xlabel('Lag', fontsize=12)
        acf_ax.set_ylabel('ACF', fontsize=12)
    else:
        acf_ax.set_title('Autocorrelation Function: Not Available', fontsize=14)

    # Bottom-right: price distribution
    hist_ax.hist(market_data['price'], bins=30, alpha=0.7, density=True)
    hist_ax.set_title(f'Price Distribution: {market_name}', fontsize=14)
    hist_ax.set_xlabel('Price', fontsize=12)
    hist_ax.set_ylabel('Density', fontsize=12)

    plt.tight_layout()
    plt.show()

    # The second figure needs the early-vs-late comparison
    has_comparison = bool(
        test_results
        and 'time_varying' in test_results
        and 'comparison' in test_results['time_varying']
    )
    if not has_comparison:
        return

    time_varying = test_results['time_varying']
    comparison = time_varying['comparison']

    plt.figure(figsize=(10, 6))

    # Volatility per period; periods that were skipped show up as NaN
    volatilities = [
        time_varying[key]['volatility'] if key in time_varying else np.nan
        for key in ('early', 'middle', 'late')
    ]
    plt.bar(['Early', 'Middle', 'Late'], volatilities, color=['blue', 'green', 'orange'])

    plt.title(f'Return Volatility by Market Period: {comparison["efficiency_change"]}', fontsize=14)
    plt.ylabel('Return Volatility', fontsize=12)
    plt.grid(axis='y', alpha=0.3)

    # Caption box with the efficiency change below the chart
    plt.figtext(0.5, 0.01, f'Efficiency Change: {comparison["efficiency_change"]}',
                ha='center', fontsize=12, bbox={"facecolor": "lightgray", "alpha": 0.5, "pad": 5})

    plt.tight_layout()
    plt.show()

### 5.2 Run Analysis on All Selected Markets

In [None]:
def analyze_multiple_markets(market_ids=None, max_markets=100, use_parallel=True, save_results=True):
    """
    Analyze multiple markets and collect one summary row per market.

    Parameters:
    -----------
    market_ids : list, optional
        List of market IDs to analyze. If None, selects by volume (or a
        random sample when no 'volumeNum' column exists).
    max_markets : int
        Maximum number of markets to analyze
    use_parallel : bool
        Whether to analyze markets concurrently (used only for more than 10
        markets). A thread pool is used: the per-market worker is a closure
        over this function's state and cannot be pickled for a process pool.
    save_results : bool
        Whether to save results to disk. Existing results in the results
        file are loaded first and their markets are skipped.

    Returns:
    --------
    pd.DataFrame
        DataFrame with market efficiency results (including any previously
        saved results that were loaded)
    """
    global main_df, id_column

    # If no specific markets are provided, select by volume
    if market_ids is None:
        if 'volumeNum' in main_df.columns:
            market_ids = main_df.sort_values('volumeNum', ascending=False)[id_column].unique()[:max_markets]
        else:
            # Fall back to random selection
            market_ids = main_df[id_column].sample(min(max_markets, len(main_df))).values
    market_ids = list(market_ids)[:max_markets]

    print(f"Analyzing {len(market_ids)} markets...")

    # Setup results storage
    results = []
    results_file = os.path.join(results_dir, 'market_efficiency_results.json')

    def to_native(value):
        """Turn NumPy scalars into built-in Python values (json cannot encode them)."""
        return value.item() if isinstance(value, np.generic) else value

    def save(stage):
        """Write all results to disk; return True on success."""
        try:
            # Serialise before opening the file: if encoding fails, the
            # previously saved file is left untouched instead of truncated.
            payload = json.dumps(results)
            with open(results_file, 'w') as f:
                f.write(payload)
            return True
        except Exception as e:
            print(f"Error saving {stage} results: {str(e)}")
            return False

    # Load any existing results
    if save_results and os.path.exists(results_file):
        try:
            with open(results_file, 'r') as f:
                existing_results = json.load(f)

            # Compare IDs as strings so e.g. NumPy ints match the JSON values
            processed_ids = {str(r.get('market_id')) for r in existing_results}
        except (OSError, ValueError, AttributeError, TypeError):
            print("Could not load existing results, starting fresh")
        else:
            # Filter out already processed markets
            market_ids = [mid for mid in market_ids if str(mid) not in processed_ids]

            results = existing_results
            print(f"Loaded {len(existing_results)} existing results, {len(market_ids)} markets remaining")

    # Helper function for single market analysis
    def analyze_market(market_id):
        try:
            # Process market data
            market_data = preprocess_market_data(market_id)
            if market_data is None or len(market_data) < 30:
                return None

            # Run efficiency tests
            test_results = run_efficiency_tests(market_data)
            if test_results is None:
                return None

            # Calculate efficiency score
            efficiency_score = calculate_efficiency_score(test_results)

            # Determine efficiency class
            if efficiency_score >= 80:
                efficiency_class = 'Highly Efficient'
            elif efficiency_score >= 60:
                efficiency_class = 'Moderately Efficient'
            elif efficiency_score >= 40:
                efficiency_class = 'Slightly Inefficient'
            else:
                efficiency_class = 'Highly Inefficient'

            # Get market metadata
            market_info = {}
            market_rows = main_df[main_df[id_column] == market_id]
            if len(market_rows) == 0:
                # Try string comparison
                market_rows = main_df[main_df[id_column].astype(str) == str(market_id)]

            if len(market_rows) > 0:
                row = market_rows.iloc[0]
                for col in ['event_electionType', 'event_country', 'volumeNum', 'market_duration_days', 'question']:
                    if col in row and not pd.isna(row[col]):
                        market_info[col] = row[col]

            # Get market name
            market_name = market_info.get('question', market_questions.get(str(market_id), f"Market {market_id}"))

            # Create result
            result = {
                'market_id': market_id,
                'market_name': market_name,
                'efficiency_score': efficiency_score,
                'efficiency_class': efficiency_class
            }

            # Add market metadata
            result.update(market_info)

            # Add key test results (avoiding large nested structures)
            if 'adf_price' in test_results:
                result['price_stationary'] = test_results['adf_price']['is_stationary']

            if 'adf_return' in test_results:
                result['return_stationary'] = test_results['adf_return']['is_stationary']

            if 'autocorrelation' in test_results:
                result['has_autocorrelation'] = test_results['autocorrelation']['has_significant_autocorrelation']

            if 'runs_test' in test_results:
                result['is_random'] = test_results['runs_test'].get('is_random', False)

            if 'ar_model' in test_results:
                result['ar_significant'] = test_results['ar_model'].get('significant', False)
                result['ar_coefficient'] = test_results['ar_model'].get('ar_coefficient', None)

            if 'time_varying' in test_results and 'comparison' in test_results['time_varying']:
                result['efficiency_change'] = test_results['time_varying']['comparison']['efficiency_change']

            # IDs, flags and metadata often arrive as NumPy scalars
            return {key: to_native(value) for key, value in result.items()}
        except Exception as e:
            print(f"Error analyzing market {market_id}: {str(e)}")
            return None

    # Analysis with progress tracking and incremental saving
    if use_parallel and len(market_ids) > 10:
        from concurrent.futures import ThreadPoolExecutor, as_completed

        # Leave one core free and never use more than 4 workers
        n_workers = max(1, min((os.cpu_count() or 1) - 1, 4))

        with ThreadPoolExecutor(max_workers=n_workers) as executor:
            # Submit jobs
            futures = {executor.submit(analyze_market, mid): mid for mid in market_ids}

            # Process results as they complete
            for i, future in enumerate(tqdm(as_completed(futures), total=len(futures), desc="Analyzing markets")):
                result = future.result()
                if result is not None:
                    results.append(result)

                # Save incrementally every 10 markets
                if save_results and (i + 1) % 10 == 0:
                    save('intermediate')
    else:
        # Serial processing
        for i, market_id in enumerate(tqdm(market_ids, desc="Analyzing markets")):
            result = analyze_market(market_id)
            if result is not None:
                results.append(result)

            # Save incrementally every 10 markets
            if save_results and (i + 1) % 10 == 0:
                save('intermediate')

    # Final save
    if save_results and save('final'):
        print(f"Saved {len(results)} results to {results_file}")

    # Convert to DataFrame
    results_df = pd.DataFrame(results)

    # Print summary
    if len(results_df) > 0:
        print("\n📊 Analysis Summary")
        print("-" * 50)
        print(f"Total markets analyzed: {len(results_df)}")
        print(f"Average efficiency score: {results_df['efficiency_score'].mean():.2f}/100")

        # Efficiency classification breakdown
        print("\nEfficiency Classification:")
        for cls, count in results_df['efficiency_class'].value_counts().items():
            print(f"  {cls}: {count} markets ({count/len(results_df)*100:.1f}%)")

    return results_df

### 5.3 Visualize Results

In [None]:
def _plot_mean_efficiency_by_group(results_df, group_col, axis_label, bar_color,
                                   filename, save_dir, min_count=5):
    """
    Draw, save and show a bar chart of the mean efficiency score per category
    
    Parameters:
    -----------
    results_df : pd.DataFrame
        Results containing `group_col` and 'efficiency_score'
    group_col : str
        Column whose categories are compared
    axis_label : str
        Category name used in the chart title and x-axis label
    bar_color : str
        Matplotlib color used for the bars
    filename : str
        File name of the PNG written inside `save_dir`
    save_dir : str
        Directory the plot is saved into
    min_count : int
        Minimum number of markets a category needs in order to be plotted
    """
    counts = results_df[group_col].value_counts()
    eligible = counts[counts >= min_count].index.tolist()
    
    # Nothing is drawn or saved when no category has enough markets
    if not eligible:
        return
    
    plt.figure(figsize=(12, 6))
    
    # Mean efficiency and sample size for every eligible category
    rows = []
    for group in eligible:
        group_df = results_df[results_df[group_col] == group]
        rows.append({
            'Group': group,
            'Average Efficiency': group_df['efficiency_score'].mean(),
            'Count': len(group_df)
        })
    
    summary = pd.DataFrame(rows).sort_values('Average Efficiency', ascending=False)
    
    bars = plt.bar(summary['Group'], summary['Average Efficiency'], color=bar_color)
    
    # Annotate each bar with the number of markets behind its average
    for bar, count in zip(bars, summary['Count']):
        plt.text(bar.get_x() + bar.get_width()/2, 
                bar.get_height() + 1, 
                f"n={count}", 
                ha='center', va='bottom', fontsize=10)
    
    overall_mean = results_df['efficiency_score'].mean()
    plt.axhline(y=overall_mean, color='red', linestyle='--', 
               label=f'Overall Average: {overall_mean:.2f}')
    
    plt.title(f'Average Efficiency Score by {axis_label}', fontsize=14)
    plt.xlabel(axis_label, fontsize=12)
    plt.ylabel('Average Efficiency Score', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.legend()
    plt.ylim(0, 100)
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    
    plt.savefig(os.path.join(save_dir, filename), dpi=300, bbox_inches='tight')
    plt.show()


def visualize_efficiency_results(results_df, save_dir=None):
    """
    Create visualizations for market efficiency results
    
    Always draws the score histogram and the classification pie chart. The
    remaining charts are drawn only when their column is present:
    'event_electionType', 'event_country', 'volumeNum', 'efficiency_change'.
    Every chart is saved as a PNG in `save_dir` and then shown.
    
    Parameters:
    -----------
    results_df : pd.DataFrame
        DataFrame with market efficiency results; must contain
        'efficiency_score' and 'efficiency_class'
    save_dir : str, optional
        Directory to save plots, if None uses results_dir
    """
    if results_df is None or len(results_df) == 0:
        print("No results to visualize")
        return
    
    if save_dir is None:
        save_dir = results_dir
    
    os.makedirs(save_dir, exist_ok=True)
    
    scores = results_df['efficiency_score']
    overall_mean = scores.mean()
    
    # 1. Efficiency Score Distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(scores, bins=20, kde=True)
    plt.axvline(x=overall_mean, color='red', linestyle='--', 
               label=f'Mean: {overall_mean:.2f}')
    plt.title('Distribution of Market Efficiency Scores', fontsize=14)
    plt.xlabel('Efficiency Score (0-100, higher = more efficient)', fontsize=12)
    plt.ylabel('Count', fontsize=12)
    plt.grid(alpha=0.3)
    plt.legend()
    
    plt.savefig(os.path.join(save_dir, 'efficiency_score_distribution.png'), dpi=300, bbox_inches='tight')
    plt.show()
    
    # 2. Efficiency Classification Pie Chart
    plt.figure(figsize=(10, 8))
    results_df['efficiency_class'].value_counts().plot.pie(autopct='%1.1f%%', 
                                                         colors=sns.color_palette("viridis", 4),
                                                         startangle=90)
    plt.title('Market Efficiency Classification', fontsize=14)
    plt.ylabel('')  # Hide ylabel
    
    plt.savefig(os.path.join(save_dir, 'efficiency_classification_pie.png'), dpi=300, bbox_inches='tight')
    plt.show()
    
    # 3. Efficiency by Market Type (if available)
    if 'event_electionType' in results_df.columns:
        _plot_mean_efficiency_by_group(
            results_df, 'event_electionType', 'Market Type', 'lightgreen',
            'efficiency_by_market_type.png', save_dir
        )
    
    # 4. Efficiency by Country (if available)
    if 'event_country' in results_df.columns:
        _plot_mean_efficiency_by_group(
            results_df, 'event_country', 'Country', 'skyblue',
            'efficiency_by_country.png', save_dir
        )
    
    # 5. Efficiency vs Volume (if available)
    if 'volumeNum' in results_df.columns:
        plt.figure(figsize=(10, 6))
        
        # Use log scale for volume
        plt.scatter(results_df['volumeNum'], scores, alpha=0.6)
        plt.xscale('log')
        
        # Add trend line. The fit is best-effort: a failure is reported but
        # must not prevent the scatter plot from being saved.
        try:
            volume = results_df['volumeNum']
            # log10 is undefined for zero/negative volume, and NaN compares
            # False against 0, so this mask also drops missing volumes
            usable = (volume > 0) & scores.notna()
            log_volume = np.log10(volume[usable].astype(float))
            usable_scores = scores[usable].astype(float)
            
            # A line needs at least two distinct x positions
            if log_volume.nunique() >= 2:
                trend = np.poly1d(np.polyfit(log_volume, usable_scores, 1))
                
                # Create x range for line (in log space)
                x_range = np.logspace(log_volume.min(), log_volume.max(), 100)
                plt.plot(x_range, trend(np.log10(x_range)), "r--", linewidth=2)
                
                # Calculate correlation
                corr = np.corrcoef(log_volume, usable_scores)[0, 1]
                plt.text(0.05, 0.95, f"Correlation: {corr:.3f}", transform=plt.gca().transAxes,
                        bbox=dict(facecolor='white', alpha=0.8))
        except (TypeError, ValueError, np.linalg.LinAlgError) as e:
            print(f"Could not fit volume trend line: {str(e)}")
        
        plt.title('Efficiency Score vs Trading Volume', fontsize=14)
        plt.xlabel('Trading Volume (log scale)', fontsize=12)
        plt.ylabel('Efficiency Score', fontsize=12)
        plt.grid(alpha=0.3)
        
        plt.savefig(os.path.join(save_dir, 'efficiency_vs_volume.png'), dpi=300, bbox_inches='tight')
        plt.show()
    
    # 6. Time-varying efficiency results
    if 'efficiency_change' in results_df.columns:
        efficiency_changes = results_df['efficiency_change'].value_counts()
        
        plt.figure(figsize=(10, 6))
        # NOTE(review): colors follow frequency order, not category meaning;
        # the category labels are not visible here, so this is left unchanged
        plt.bar(efficiency_changes.index, efficiency_changes.values, color=['green', 'gray', 'red'])
        
        # Add percentage labels
        total = len(results_df)
        for i, count in enumerate(efficiency_changes.values):
            plt.text(i, count + 0.5, f"{count/total*100:.1f}%", ha='center', fontsize=12)
        
        plt.title('Efficiency Change Over Market Lifecycle', fontsize=14)
        plt.ylabel('Number of Markets', fontsize=12)
        plt.grid(axis='y', alpha=0.3)
        
        plt.savefig(os.path.join(save_dir, 'time_varying_efficiency.png'), dpi=300, bbox_inches='tight')
        plt.show()

## 7. Cross-Market Analysis (Market Relatedness)

In [None]:
def find_related_markets(event_identifier, main_df=None, id_column=None):
    """
    Find markets that are part of the same event
    
    The identifier is first compared for equality against the first event
    column found ('event_id', 'groupId', 'group_id', 'event'). If that yields
    nothing and the identifier is a string, it is searched for as a literal,
    case-insensitive substring in 'question', 'event_name', 'eventName' and
    'name' (first column with a match wins).
    
    Parameters:
    -----------
    event_identifier : str or int
        Event ID or name to search for
    main_df : pd.DataFrame, optional
        Market metadata; loaded with load_data() when omitted
    id_column : str, optional
        Name of the market ID column in main_df; taken from load_data()
        when omitted
    
    Returns:
    --------
    list
        List of market IDs in the same event (empty if nothing is found)
    """
    if main_df is None or id_column is None:
        loaded_df, loaded_id_column, _ = load_data(verbose=False)
        if main_df is None:
            main_df = loaded_df
        if id_column is None:
            # NOTE(review): assumes the loader's ID column name also applies
            # to a caller-supplied frame; its presence is checked below
            id_column = loaded_id_column
    
    # Try to find the event ID column
    event_col = next(
        (col for col in ('event_id', 'groupId', 'group_id', 'event') if col in main_df.columns),
        None
    )
    
    if event_col is None:
        print("Could not find an event identifier column")
        return []
    
    if id_column not in main_df.columns:
        print(f"Could not find market ID column '{id_column}'")
        return []
    
    # If an event ID is provided directly
    event_markets = main_df[main_df[event_col] == event_identifier]
    
    # If it's a string, try searching event/question text
    if len(event_markets) == 0 and isinstance(event_identifier, str):
        for col in ('question', 'event_name', 'eventName', 'name'):
            if col not in main_df.columns:
                continue
            # regex=False: the identifier is user text, so characters such as
            # '?', '(' or '+' must match literally instead of as a pattern
            matches = main_df[col].str.contains(
                event_identifier, case=False, na=False, regex=False
            )
            event_markets = main_df[matches]
            if len(event_markets) > 0:
                break
    
    if len(event_markets) == 0:
        print(f"No markets found for event '{event_identifier}'")
        return []
    
    print(f"Found {len(event_markets)} markets in event '{event_identifier}'")
    return event_markets[id_column].unique().tolist()

def _min_granger_pvalue(effect_series, cause_series, max_lag):
    """
    Smallest 'ssr_chi2test' p-value over lags 1..max_lag for cause -> effect
    
    grangercausalitytests checks whether the SECOND column helps predict the
    FIRST, hence the (effect, cause) column order.
    """
    outcome = grangercausalitytests(
        pd.concat([effect_series, cause_series], axis=1), 
        maxlag=max_lag, 
        verbose=False
    )
    return min(res[0]['ssr_chi2test'][1] for res in outcome.values())


def analyze_cross_market_predictability(market_ids, max_lag=3, verbose=False, alpha=0.05):
    """
    Test for Granger causality between related markets
    
    Each market is preprocessed at 5-minute resolution, every unordered pair
    is aligned on its shared timestamps, and Granger causality is tested in
    both directions on the 'price' column. The p-value reported per direction
    is the minimum across all tested lags.
    
    Parameters:
    -----------
    market_ids : list
        List of market IDs to analyze
    max_lag : int
        Maximum lag for Granger causality test
    verbose : bool
        Whether to print detailed output
    alpha : float
        Significance threshold applied to the minimum p-value
        
    Returns:
    --------
    list or None
        List of causality result dicts, one per tested market pair (may be
        empty). None when fewer than 2 markets are given or have enough data.
    """
    from itertools import combinations
    
    if len(market_ids) < 2:
        print("Need at least 2 markets for cross-market analysis")
        return None
    
    # Process each market data
    market_data = {}
    for market_id in tqdm(market_ids, desc="Processing markets", disable=not verbose):
        data = preprocess_market_data(market_id, resample='5min')  # Use wider intervals for cross-market
        
        if data is not None and len(data) > max_lag + 5:
            market_data[market_id] = {
                'data': data,
                'name': market_questions.get(str(market_id), f"Market {market_id}")
            }
    
    if len(market_data) < 2:
        print("Insufficient data for cross-market analysis")
        return None
    
    print(f"Analyzing relationships between {len(market_data)} markets...")
    
    # NOTE(review): the tests below run on price levels. Granger tests assume
    # stationary inputs, and taking the minimum p-value over lags is not
    # corrected for multiple testing. Consider returns -- TODO confirm intent.
    causality_results = []
    
    # combinations() yields each unordered pair once, skipping self-pairs
    for (market_i, data_i), (market_j, data_j) in combinations(market_data.items(), 2):
        # Align time series
        common_index = data_i['data'].index.intersection(data_j['data'].index)
        if len(common_index) <= max_lag + 5:
            if verbose:
                print(f"Insufficient overlapping data for {market_i} and {market_j}")
            continue
        
        series_i = data_i['data'].loc[common_index, 'price']
        series_j = data_j['data'].loc[common_index, 'price']
        
        try:
            min_pvalue_ij = _min_granger_pvalue(series_j, series_i, max_lag)  # i -> j
            min_pvalue_ji = _min_granger_pvalue(series_i, series_j, max_lag)  # j -> i
        except Exception as e:
            # A failed pair is skipped so the remaining pairs are still tested
            if verbose:
                print(f"Error in Granger causality test between {market_i} and {market_j}: {str(e)}")
            continue
        
        i_causes_j = min_pvalue_ij < alpha
        j_causes_i = min_pvalue_ji < alpha
        
        if i_causes_j and j_causes_i:
            relationship = 'Bidirectional'
        elif i_causes_j:
            relationship = f"{data_i['name']} -> {data_j['name']}"
        elif j_causes_i:
            relationship = f"{data_j['name']} -> {data_i['name']}"
        else:
            relationship = 'No relationship'
        
        causality_results.append({
            'market_i_id': market_i,
            'market_j_id': market_j,
            'market_i_name': data_i['name'],
            'market_j_name': data_j['name'],
            'i_causes_j_pvalue': min_pvalue_ij,
            'j_causes_i_pvalue': min_pvalue_ji,
            'i_causes_j': i_causes_j,
            'j_causes_i': j_causes_i,
            'bidirectional': i_causes_j and j_causes_i,
            'relationship': relationship
        })
        
        if verbose:
            print(f"Relationship between {data_i['name']} and {data_j['name']}: {relationship}")
    
    # Print summary
    tested = len(causality_results)
    print("\nCross-market analysis complete:")
    print(f"  - Tested {tested} market pairs")
    
    # Percentages are undefined when every pair was skipped
    if tested == 0:
        print("  - No market pairs could be tested")
        return causality_results
    
    significant_count = sum(1 for r in causality_results if r['i_causes_j'] or r['j_causes_i'])
    bidirectional_count = sum(1 for r in causality_results if r['bidirectional'])
    
    print(f"  - Found {significant_count} significant relationships ({significant_count/tested*100:.1f}%)")
    print(f"  - Found {bidirectional_count} bidirectional relationships ({bidirectional_count/tested*100:.1f}%)")
    
    return causality_results

def visualize_cross_market_results(causality_results, save_dir=None):
    """
    Visualize cross-market predictability results
    
    Draws a bar chart of relationship types and, when NetworkX can be
    imported, a directed graph with one edge per significant direction.
    Both plots are saved as PNGs in `save_dir` and then shown.
    
    Parameters:
    -----------
    causality_results : list
        List of causality results from analyze_cross_market_predictability
    save_dir : str, optional
        Directory to save plots, if None uses results_dir
    """
    if causality_results is None or len(causality_results) == 0:
        print("No results to visualize")
        return
    
    if save_dir is None:
        save_dir = results_dir
    
    os.makedirs(save_dir, exist_ok=True)
    
    # Convert to DataFrame
    results_df = pd.DataFrame(causality_results)
    
    # 1. Relationship type counts
    plt.figure(figsize=(10, 6))
    relationship_counts = results_df['relationship'].apply(
        lambda x: 'Bidirectional' if 'Bidirectional' in x else 'Unidirectional' if '->' in x else 'No relationship'
    ).value_counts()
    
    # value_counts() orders by frequency, so colors are looked up per category
    # to keep each relationship type on the same color in every run
    category_colors = {
        'Bidirectional': 'green',
        'Unidirectional': 'orange',
        'No relationship': 'gray'
    }
    bar_colors = [category_colors[category] for category in relationship_counts.index]
    plt.bar(relationship_counts.index, relationship_counts.values, color=bar_colors)
    
    # Add percentage labels
    total = len(results_df)
    for i, count in enumerate(relationship_counts.values):
        plt.text(i, count + 0.5, f"{count/total*100:.1f}%", ha='center', fontsize=12)
    
    plt.title('Cross-Market Relationships', fontsize=14)
    plt.ylabel('Number of Market Pairs', fontsize=12)
    plt.grid(axis='y', alpha=0.3)
    
    plt.savefig(os.path.join(save_dir, 'cross_market_relationships.png'), dpi=300, bbox_inches='tight')
    plt.show()
    
    # 2. Create network visualization if NetworkX is available
    try:
        import networkx as nx
    except ImportError:
        print("NetworkX not available, skipping network visualization")
        return
    
    # The network plot stays best-effort, but real failures are reported as
    # such instead of being mislabelled as a missing dependency
    try:
        # Create directed graph
        G = nx.DiGraph()
        
        # Add nodes
        all_markets = set()
        for result in causality_results:
            all_markets.add((result['market_i_id'], result['market_i_name']))
            all_markets.add((result['market_j_id'], result['market_j_name']))
        
        for market_id, market_name in all_markets:
            G.add_node(market_id, name=market_name)
        
        # Add edges; a smaller p-value gives a heavier (thicker) edge
        for result in causality_results:
            if result['i_causes_j']:
                G.add_edge(result['market_i_id'], result['market_j_id'], 
                          weight=1-result['i_causes_j_pvalue'])
            
            if result['j_causes_i']:
                G.add_edge(result['market_j_id'], result['market_i_id'],
                          weight=1-result['j_causes_i_pvalue'])
        
        # Create plot
        plt.figure(figsize=(12, 10))
        
        # Use spring layout
        pos = nx.spring_layout(G, k=0.5, iterations=100)
        
        # Draw nodes
        nx.draw_networkx_nodes(G, pos, node_size=500, node_color='skyblue', alpha=0.8)
        
        # Draw edges
        weights = [d['weight']*3 for _, _, d in G.edges(data=True)]  # Scale weights for visibility
        nx.draw_networkx_edges(G, pos, width=weights, alpha=0.5, 
                              arrows=True, arrowsize=15)
        
        # Label each node with the last word of its name; fall back to the
        # node ID when the name has no words
        labels = {}
        for node in G.nodes():
            words = str(G.nodes[node]['name']).split()
            labels[node] = words[-1] if words else str(node)
        nx.draw_networkx_labels(G, pos, labels=labels, font_size=10)
        
        plt.title('Cross-Market Information Flow Network', fontsize=16)
        plt.axis('off')
        
        plt.savefig(os.path.join(save_dir, 'cross_market_network.png'), dpi=300, bbox_inches='tight')
        plt.show()
    except Exception as e:
        print(f"Error creating network visualization: {str(e)}")

## 8. User Interface Functions

In [None]:
def select_and_analyze_market(market_identifier=None):
    """
    UI function for the notebook to select and analyze a specific market
    
    Reads the module-level `main_df`, `id_column` and `market_questions`
    when it has to present a selection list.
    
    Parameters:
    -----------
    market_identifier : str or int, optional
        Market ID or name to analyze. If None, lists up to 10 markets
        (highest 'volumeNum' first when that column exists) and prompts
        for a choice.
    
    Returns:
    --------
    The result of analyze_specific_market for the chosen market, or None
    when the prompt is answered with an empty line.
    """
    if market_identifier is None:
        # List top markets by volume for selection
        if 'volumeNum' in main_df.columns:
            top_markets = main_df.sort_values('volumeNum', ascending=False).head(10)
        else:
            top_markets = main_df.head(10)
        
        print("Please select a market by entering its ID or name, or choose from these top markets:")
        for i, (_, row) in enumerate(top_markets.iterrows()):
            market_id = row[id_column]
            market_name = row['question'] if 'question' in row else market_questions.get(str(market_id), f"Market {market_id}")
            print(f"{i+1}. ID: {market_id} - {market_name}")
        
        # Let the user input their choice
        choice = input("Enter market ID, name, or number from list: ").strip()
        
        if not choice:
            print("No market selected")
            return None
        
        # The lower bound matters: "0" would otherwise index iloc[-1] and
        # silently pick the last listed market
        if choice.isdigit() and 1 <= int(choice) <= len(top_markets):
            # User selected by number from the list; the ID must be stored in
            # market_identifier, which is what gets analyzed below
            market_identifier = top_markets.iloc[int(choice) - 1][id_column]
        else:
            # User entered ID or name
            market_identifier = choice
    
    # Analyze the selected market
    return analyze_specific_market(market_identifier)

def run_comprehensive_analysis(num_markets=100, use_parallel=True):
    """
    Notebook entry point: analyze a batch of markets and plot the outcome
    
    Parameters:
    -----------
    num_markets : int
        Number of markets to analyze (forwarded as `max_markets`)
    use_parallel : bool
        Whether to use parallel processing
    
    Returns:
    --------
    Whatever analyze_multiple_markets returned, unchanged
    """
    efficiency_df = analyze_multiple_markets(
        max_markets=num_markets,
        use_parallel=use_parallel,
    )
    
    # Plots are only produced when there is at least one result row
    has_results = efficiency_df is not None and len(efficiency_df) > 0
    if has_results:
        visualize_efficiency_results(efficiency_df)
    
    return efficiency_df

def analyze_event_markets(event_identifier):
    """
    Notebook entry point: study cross-market relationships inside one event
    
    Parameters:
    -----------
    event_identifier : str or int
        Event ID or name to analyze
    
    Returns:
    --------
    The output of analyze_cross_market_predictability, or None when fewer
    than two related markets are found
    """
    candidates = find_related_markets(event_identifier)
    
    # A pairwise analysis needs at least two markets
    enough_markets = bool(candidates) and len(candidates) >= 2
    if not enough_markets:
        print("Insufficient related markets found for analysis")
        return None
    
    causality = analyze_cross_market_predictability(candidates, verbose=True)
    
    # Skip plotting when the analysis produced nothing
    if causality:
        visualize_cross_market_results(causality)
    
    return causality