# Market Efficiency Analysis in Prediction Markets

This notebook analyzes the efficiency of Polymarket prediction markets using various statistical tests to evaluate whether these markets follow the "wisdom of crowds" hypothesis.

## 1. Setup and Initialization

In [124]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import acf, pacf, adfuller
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from scipy import stats
import warnings
from tqdm.auto import tqdm
import json

# Add the src directory to the path if it isn't already there
if 'src' not in sys.path:
    sys.path.append('src')

# Import the MarketEfficiencyAnalyzer class
from src.knowledge_value.market_efficiency import MarketEfficiencyAnalyzer

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("viridis")

# Create output directory for saving results
results_dir = 'results/knowledge_value/efficiency'
os.makedirs(results_dir, exist_ok=True)

Loaded dataset with 1048575 rows and 54 columns
Loaded dataset with 1048575 rows and 54 columns


AttributeError: 'NoneType' object has no attribute 'empty'

## 2. Initialize the Analyzer

In [122]:
# Initialize the analyzer with appropriate paths
analyzer = MarketEfficiencyAnalyzer(
    data_dir='data',  # Path to data directory
    results_dir=results_dir,  # Path to save results
    max_cache_size=50  # Maximum number of markets to cache
)

# Print basic dataset information
print(f"Dataset contains {len(analyzer.main_df)} markets")
print(f"Available market questions: {len(analyzer.market_questions)}")

# Display available event types and countries
if 'event_electionType' in analyzer.main_df.columns:
    print("\nAvailable Election Types:")
    for event_type, count in analyzer.main_df['event_electionType'].value_counts().items():
        print(f"  {event_type}: {count} markets")

if 'event_country' in analyzer.main_df.columns:
    print("\nTop Countries:")
    for country, count in analyzer.main_df['event_country'].value_counts().head(10).items():
        print(f"  {country}: {count} markets")

Loaded dataset with 1048575 rows and 54 columns
Loaded dataset with 1048575 rows and 54 columns


TypeError: object of type 'NoneType' has no len()

## 3. Single Market Analysis

Let's analyze a specific market to understand its efficiency characteristics.

In [None]:
# Find markets related to the 2024 US Presidential Election
us_presidential_markets = analyzer.find_market_by_name("Trump 2024")

# Display matching markets
print("Markets related to 2024 US Presidential Election:")
for i, (market_id, question) in enumerate(us_presidential_markets[:10]):  # Limit to 10 results
    print(f"{i+1}. {question} (ID: {market_id})")

# Select a market to analyze (for example, the first match)
if us_presidential_markets:
    selected_market_id = us_presidential_markets[0][0]
    selected_market_name = us_presidential_markets[0][1]
    print(f"\nAnalyzing market: {selected_market_name} (ID: {selected_market_id})")
    
    # Run detailed analysis
    market_result = analyzer.analyze_market(selected_market_id, verbose=True)
    
    # Create visualizations
    market_figures = analyzer.visualize_market(selected_market_id, save_to=f"{results_dir}/market_{selected_market_id}_viz.png")
    
    # Save results
    analyzer.save_results(market_result, f"market_{selected_market_id}_analysis.json")
else:
    print("No markets found matching the search criteria.")

Markets related to 2024 US Presidential Election:
No markets found matching the search criteria.


## 4. Finding Efficient vs. Inefficient Markets

Let's identify examples of highly efficient and inefficient markets for comparison.

In [None]:
# Get top markets by volume for analysis
top_volume_markets = analyzer.get_top_markets_by_volume(n=20)

# Analyze these markets
print(f"Analyzing {len(top_volume_markets)} high-volume markets...")
top_markets_results = analyzer.analyze_market_batch(top_volume_markets, max_markets=20, parallel=True)

# Sort markets by efficiency score
if top_markets_results:
    # Convert to DataFrame for easier sorting and filtering
    results_df = pd.DataFrame([
        {
            'market_id': r['market_id'],
            'question': r.get('question', 'Unknown'),
            'efficiency_score': r.get('efficiency_score', 0),
            'efficiency_class': r.get('efficiency_class', 'Unknown'),
            'event_type': r.get('event_type', 'Unknown'),
            'country': r.get('country', 'Unknown')
        }
        for r in top_markets_results
    ])
    
    # Sort by efficiency score
    sorted_results = results_df.sort_values('efficiency_score', ascending=False)
    
    # Display most efficient markets
    print("\nMost Efficient Markets:")
    for _, row in sorted_results.head(5).iterrows():
        print(f"  {row['question']} - Score: {row['efficiency_score']:.2f} - Class: {row['efficiency_class']}")
    
    # Display least efficient markets
    print("\nLeast Efficient Markets:")
    for _, row in sorted_results.tail(5).iterrows():
        print(f"  {row['question']} - Score: {row['efficiency_score']:.2f} - Class: {row['efficiency_class']}")
    
    # Select one efficient and one inefficient market for comparison
    efficient_market = sorted_results.iloc[0]['market_id']
    inefficient_market = sorted_results.iloc[-1]['market_id']
    
    # Create side-by-side visualization
    plt.figure(figsize=(20, 10))
    plt.suptitle("Comparison of Efficient vs. Inefficient Markets", fontsize=16)
    
    # Visualize efficient market
    efficient_data = analyzer.preprocess_market_data(efficient_market)
    if efficient_data is not None:
        plt.subplot(2, 2, 1)
        plt.plot(efficient_data.index, efficient_data['price'])
        plt.title(f"Efficient Market: {sorted_results[sorted_results['market_id'] == efficient_market]['question'].values[0]}")
        plt.ylabel("Price")
        
        plt.subplot(2, 2, 2)
        plt.plot(efficient_data.index, efficient_data['log_return'], color='green')
        plt.title("Log Returns")
    
    # Visualize inefficient market
    inefficient_data = analyzer.preprocess_market_data(inefficient_market)
    if inefficient_data is not None:
        plt.subplot(2, 2, 3)
        plt.plot(inefficient_data.index, inefficient_data['price'])
        plt.title(f"Inefficient Market: {sorted_results[sorted_results['market_id'] == inefficient_market]['question'].values[0]}")
        plt.ylabel("Price")
        
        plt.subplot(2, 2, 4)
        plt.plot(inefficient_data.index, inefficient_data['log_return'], color='red')
        plt.title("Log Returns")
    
    plt.tight_layout()
    plt.savefig(f"{results_dir}/efficient_vs_inefficient.png", dpi=300, bbox_inches='tight')
    plt.show()
else:
    print("No results from batch analysis.")

AttributeError: 'NoneType' object has no attribute 'columns'

## 5. Comparative Analysis by Market Type

Let's compare market efficiency across different election types.

In [None]:
# Get metrics by election type
election_types = ['Presidential', 'Senate', 'Parliamentary']
comparison_results = []

for election_type in election_types:
    # Get markets for this type
    markets = analyzer.get_markets_by_event_type(election_type, n=10)
    
    print(f"Analyzing {len(markets)} {election_type} markets...")
    
    # Analyze markets
    type_results = analyzer.analyze_market_batch(markets, max_markets=10)
    
    # Aggregate results
    if type_results:
        avg_score = np.mean([r.get('efficiency_score', 0) for r in type_results])
        price_stationary = sum(1 for r in type_results if r.get('adf_price', {}).get('is_stationary', False))
        return_stationary = sum(1 for r in type_results if r.get('adf_return', {}).get('is_stationary', False))
        has_autocorr = sum(1 for r in type_results if r.get('autocorrelation', {}).get('has_significant_autocorrelation', False))
        
        comparison_results.append({
            'election_type': election_type,
            'count': len(type_results),
            'avg_score': avg_score,
            'price_stationary_pct': price_stationary / len(type_results) * 100 if len(type_results) > 0 else 0,
            'return_stationary_pct': return_stationary / len(type_results) * 100 if len(type_results) > 0 else 0,
            'has_autocorr_pct': has_autocorr / len(type_results) * 100 if len(type_results) > 0 else 0
        })

# Create comparison visualization
if comparison_results:
    # Convert to DataFrame
    comparison_df = pd.DataFrame(comparison_results)
    
    plt.figure(figsize=(12, 6))
    
    # Plot average efficiency score by election type
    bars = plt.bar(comparison_df['election_type'], comparison_df['avg_score'], color='skyblue')
    
    # Add count labels
    for i, bar in enumerate(bars):
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 1,
                f"n={comparison_df['count'].iloc[i]}",
                ha='center', va='bottom')
    
    plt.title('Average Efficiency Score by Election Type', fontsize=14)
    plt.ylabel('Efficiency Score (0-100)', fontsize=12)
    plt.ylim(0, 100)
    plt.grid(axis='y', alpha=0.3)
    plt.savefig(f"{results_dir}/efficiency_by_election_type.png", dpi=300, bbox_inches='tight')
    plt.show()

## Market Analysis
### 5.1 Single Market

In [None]:
def get_market_by_id_or_name(market_id_or_name, main_df, id_column, market_questions):
    """
    Find a market by its ID or name
    
    Parameters:
    -----------
    market_id_or_name : str or int
        Market ID or partial name to search for
    
    Returns:
    --------
    tuple
        (market_id, market_info) or (None, None) if not found
    """
    # Try direct ID match
    if isinstance(market_id_or_name, (int, float)) or market_id_or_name.isdigit():
        market_id = int(market_id_or_name) if market_id_or_name.isdigit() else market_id_or_name
        market_rows = main_df[main_df[id_column] == market_id]
        if not market_rows.empty:
            return market_id, market_rows.iloc[0]
    
    # Try string ID match
    market_rows = main_df[main_df[id_column].astype(str) == str(market_id_or_name)]
    if not market_rows.empty:
        return market_id_or_name, market_rows.iloc[0]
    
    # Try partial name match
    if 'question' in main_df.columns:
        name_matches = main_df[main_df['question'].str.contains(str(market_id_or_name), case=False, na=False)]
        if not name_matches.empty:
            match = name_matches.iloc[0]
            return match[id_column], match
    
    # Try partial match in market_questions values
    for id, question in market_questions.items():
        if str(market_id_or_name).lower() in question.lower():
            market_rows = main_df[main_df[id_column].astype(str) == str(id)]
            if not market_rows.empty:
                return id, market_rows.iloc[0]
    
    return None, None

def analyze_specific_market(market_id_or_name, main_df=None, id_column=None, market_questions=None):
    """
    Analyze a specific market identified by ID or name
    
    Parameters:
    -----------
    market_id_or_name : str or int
        Market ID or name to search for
    
    Returns:
    --------
    dict
        Market analysis results
    """
    if main_df is None:
        main_df, id_column, market_questions = load_data(verbose=False)
    
    # Find the market
    market_id, market_info = get_market_by_id_or_name(market_id_or_name, main_df, id_column, market_questions)
    
    if market_id is None:
        print(f"No market found matching '{market_id_or_name}'")
        return None
    
    # Display market information
    print("\n🔍 Market Information")
    print("-" * 50)
    print(f"Market ID: {market_id}")
    
    # Get the market question
    market_name = None
    if 'question' in market_info:
        market_name = market_info['question']
    else:
        market_name = market_questions.get(str(market_id), f"Market {market_id}")
    
    print(f"Market Question: {market_name}")
    
    # Display additional information if available
    for col, label in [
        ('event_electionType', 'Election Type'),
        ('event_country', 'Country'),
        ('volumeNum', 'Trading Volume'),
        ('market_duration_days', 'Market Duration (days)')
    ]:
        if col in market_info and not pd.isna(market_info[col]):
            print(f"{label}: {market_info[col]}")
    
    # Process the market data
    print("\nProcessing market data...")
    market_data = preprocess_market_data(market_id, verbose=True)
    
    if market_data is None:
        print("❌ Failed to process market data")
        return None
    
    print(f"✅ Successfully processed market data with {len(market_data)} time points")
    
    # Run efficiency tests
    print("\nRunning market efficiency tests...")
    test_results = run_efficiency_tests(market_data, verbose=True)
    
    if test_results is None:
        print("❌ Failed to run efficiency tests")
        return None
    
    # Calculate efficiency score
    efficiency_score = calculate_efficiency_score(test_results)
    
    # Determine efficiency class
    if efficiency_score >= 80:
        efficiency_class = 'Highly Efficient'
    elif efficiency_score >= 60:
        efficiency_class = 'Moderately Efficient'
    elif efficiency_score >= 40:
        efficiency_class = 'Slightly Inefficient'
    else:
        efficiency_class = 'Highly Inefficient'
    
    print(f"\n📊 Market Efficiency Score: {efficiency_score:.2f}/100")
    print(f"📈 Efficiency Classification: {efficiency_class}")
    
    # Print detailed test results
    print("\n🔬 Detailed Test Results:")
    
    if 'adf_price' in test_results:
        is_random_walk = not test_results['adf_price']['is_stationary']
        print(f"Random Walk Test (Non-stationary prices): {'✅ Pass' if is_random_walk else '❌ Fail'}")
    
    if 'adf_return' in test_results:
        is_return_stationary = test_results['adf_return']['is_stationary']
        print(f"Return Stationarity Test: {'✅ Pass' if is_return_stationary else '❌ Fail'}")
    
    if 'autocorrelation' in test_results:
        no_autocorr = not test_results['autocorrelation']['has_significant_autocorrelation']
        print(f"No Significant Autocorrelation: {'✅ Pass' if no_autocorr else '❌ Fail'}")
    
    if 'runs_test' in test_results:
        is_random = test_results['runs_test'].get('is_random', False)
        print(f"Runs Test (Randomness): {'✅ Pass' if is_random else '❌ Fail'}")
    
    if 'ar_model' in test_results:
        no_ar = not test_results['ar_model'].get('significant', True)
        print(f"No Significant AR Model: {'✅ Pass' if no_ar else '❌ Fail'}")
    
    # Create visualizations
    print("\nGenerating visualizations...")
    visualize_market(market_data, market_name, test_results)
    
    # Prepare final results
    results = {
        'market_id': market_id,
        'market_name': market_name,
        'test_results': test_results,
        'efficiency_score': efficiency_score,
        'efficiency_class': efficiency_class
    }
    
    # Add market attributes
    for col in ['event_electionType', 'event_country', 'volumeNum', 'market_duration_days']:
        if col in market_info and not pd.isna(market_info[col]):
            results[col] = market_info[col]
    
    return results

def visualize_market(market_data, market_name, test_results=None):
    """Create visualizations for a specific market"""
    # Create a 2x2 plot grid
    fig, axs = plt.subplots(2, 2, figsize=(16, 12))
    
    # 1. Price Series
    axs[0, 0].plot(market_data.index, market_data['price'], linewidth=2)
    axs[0, 0].set_title(f'Price Series: {market_name}', fontsize=14)
    axs[0, 0].set_xlabel('Date', fontsize=12)
    axs[0, 0].set_ylabel('Price', fontsize=12)
    axs[0, 0].grid(True, alpha=0.3)
    
    # 2. Log Returns
    axs[0, 1].plot(market_data.index, market_data['log_return'], linewidth=1, color='green')
    axs[0, 1].set_title(f'Log Returns: {market_name}', fontsize=14)
    axs[0, 1].set_xlabel('Date', fontsize=12)
    axs[0, 1].set_ylabel('Log Return', fontsize=12)
    axs[0, 1].grid(True, alpha=0.3)
    
    # 3. ACF Plot
    if test_results and 'autocorrelation' in test_results:
        acf_values = test_results['autocorrelation']['acf_values']
        significant = test_results['autocorrelation']['has_significant_autocorrelation']
        
        lags = range(len(acf_values))
        axs[1, 0].bar(lags, acf_values, width=0.3)
        
        # Plot confidence intervals for hypothesis testing
        ci = 1.96 / np.sqrt(len(market_data))
        axs[1, 0].axhline(y=0, linestyle='-', color='black', linewidth=1)
        axs[1, 0].axhline(y=ci, linestyle='--', color='red', linewidth=1, alpha=0.7)
        axs[1, 0].axhline(y=-ci, linestyle='--', color='red', linewidth=1, alpha=0.7)
        
        title = f'Autocorrelation Function: {"❌ Significant" if significant else "✅ Not Significant"}'
        axs[1, 0].set_title(title, fontsize=14)
        axs[1, 0].set_xlabel('Lag', fontsize=12)
        axs[1, 0].set_ylabel('ACF', fontsize=12)
    else:
        axs[1, 0].set_title('Autocorrelation Function: Not Available', fontsize=14)
    
    # 4. Price distribution
    axs[1, 1].hist(market_data['price'], bins=30, alpha=0.7, density=True)
    axs[1, 1].set_title(f'Price Distribution: {market_name}', fontsize=14)
    axs[1, 1].set_xlabel('Price', fontsize=12)
    axs[1, 1].set_ylabel('Density', fontsize=12)
    
    plt.tight_layout()
    plt.show()
    
    # If time-varying efficiency results are available, show those too
    if test_results and 'time_varying' in test_results and 'comparison' in test_results['time_varying']:
        comparison = test_results['time_varying']['comparison']
        
        plt.figure(figsize=(10, 6))
        periods = ['Early', 'Middle', 'Late']
        
        # Extract volatility for each period
        volatilities = []
        for period in ['early', 'middle', 'late']:
            if period in test_results['time_varying']:
                volatilities.append(test_results['time_varying'][period]['volatility'])
            else:
                volatilities.append(np.nan)
        
        # Create the bar chart
        bars = plt.bar(periods, volatilities, color=['blue', 'green', 'orange'])
        
        plt.title(f'Return Volatility by Market Period: {comparison["efficiency_change"]}', fontsize=14)
        plt.ylabel('Return Volatility', fontsize=12)
        plt.grid(axis='y', alpha=0.3)
        
        # Add efficiency change information
        plt.figtext(0.5, 0.01, f'Efficiency Change: {comparison["efficiency_change"]}', 
                   ha='center', fontsize=12, bbox={"facecolor":"lightgray", "alpha":0.5, "pad":5})
        
        plt.tight_layout()
        plt.show()

## 6. Cross-Market Analysis

Let's analyze how markets in the same event influence each other.

In [None]:
# Find a presidential election event with multiple markets
presidential_events = analyzer.find_market_by_name("Presidential Election 2024")

if presidential_events:
    # Get the first event ID
    event_id = presidential_events[0][0]
    print(f"Analyzing cross-market relationships for event: {presidential_events[0][1]}")
    
    # Run cross-market analysis
    cross_results = analyzer.analyze_cross_market(event_id)
    
    # Print significant relationships
    if cross_results and 'market_pairs' in cross_results:
        significant_pairs = [pair for pair in cross_results['market_pairs'] 
                            if pair['i_causes_j'] or pair['j_causes_i']]
        
        print(f"\nFound {len(significant_pairs)} significant relationships out of {len(cross_results['market_pairs'])} tested pairs")
        
        if significant_pairs:
            print("\nSignificant market relationships:")
            for pair in significant_pairs:
                print(f"Relationship: {pair['relationship']}")
                print(f"  Market 1: {pair['market_i_question']}")
                print(f"  Market 2: {pair['market_j_question']}")
                print()
    else:
        print("No significant cross-market relationships found.")
else:
    print("No presidential election events found.")

Error getting sample market IDs: [Errno 2] No such file or directory: 'data/trades/market_id_to_question.json'
Error running analysis: [Errno 2] No such file or directory: 'data/trades/market_id_to_question.json'
Analysis results not available

Basic Trader Concentration Analysis:
Trader Count Gini Coefficient: 0.7193
Trading Volume Gini Coefficient: 0.9034


## 7. Analyzing Time-Varying Efficiency

Let's look at how efficiency changes over time within markets.

In [None]:
# Select a long-running market to analyze time-varying efficiency
if 'market_duration_days' in analyzer.main_df.columns:
    long_markets = analyzer.main_df.sort_values('market_duration_days', ascending=False)
    
    if not long_markets.empty:
        long_market_id = long_markets.iloc[0][analyzer.id_column]
        long_market_question = analyzer.get_market_details(long_market_id)['question']
        
        print(f"Analyzing time-varying efficiency for market: {long_market_question}")
        
        # Run analysis
        time_result = analyzer.analyze_market(long_market_id, verbose=True)
        
        # Display time-varying efficiency results
        if 'time_varying' in time_result and 'comparison' in time_result['time_varying']:
            comparison = time_result['time_varying']['comparison']
            
            print("\nTime-Varying Efficiency Results:")
            print(f"Efficiency Change: {comparison['efficiency_change']}")
            print(f"Volatility Ratio (Late/Early): {comparison['volatility_ratio']:.2f}")
            
            # Create visualization of efficiency over time
            periods = ['early', 'middle', 'late']
            periods_data = []
            
            for period in periods:
                if period in time_result['time_varying']:
                    period_data = time_result['time_varying'][period]
                    periods_data.append({
                        'period': period.capitalize(),
                        'volatility': period_data['return_volatility'],
                        'has_autocorrelation': period_data['significant_acf'],
                        'is_inefficient': period_data['ar_model']['significant'] if period_data['ar_model'] else False
                    })
            
            if periods_data:
                periods_df = pd.DataFrame(periods_data)
                
                plt.figure(figsize=(12, 6))
                
                # Plot volatility by period
                plt.subplot(1, 2, 1)
                plt.bar(periods_df['period'], periods_df['volatility'], color='skyblue')
                plt.title('Return Volatility by Market Period', fontsize=14)
                plt.ylabel('Volatility', fontsize=12)
                
                # Plot inefficiency by period
                plt.subplot(1, 2, 2)
                inefficiency_scores = [float(row['is_inefficient']) + float(row['has_autocorrelation']) for _, row in periods_df.iterrows()]
                plt.bar(periods_df['period'], inefficiency_scores, color='salmon')
                plt.title('Inefficiency Score by Market Period', fontsize=14)
                plt.ylabel('Inefficiency Score (0-2)', fontsize=12)
                plt.ylim(0, 2)
                
                plt.tight_layout()
                plt.savefig(f"{results_dir}/time_varying_efficiency.png", dpi=300, bbox_inches='tight')
                plt.show()
        else:
            print("No time-varying efficiency data available for this market.")

### 5.3 Visualize Results

In [None]:
def visualize_efficiency_results(results_df, save_dir=None):
    """
    Create visualizations for market efficiency results
    
    Parameters:
    -----------
    results_df : pd.DataFrame
        DataFrame with market efficiency results
    save_dir : str, optional
        Directory to save plots, if None uses results_dir
    """
    if results_df is None or len(results_df) == 0:
        print("No results to visualize")
        return
    
    if save_dir is None:
        save_dir = results_dir
    
    os.makedirs(save_dir, exist_ok=True)
    
    # 1. Efficiency Score Distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(results_df['efficiency_score'], bins=20, kde=True)
    plt.axvline(x=results_df['efficiency_score'].mean(), color='red', linestyle='--', 
               label=f'Mean: {results_df["efficiency_score"].mean():.2f}')
    plt.title('Distribution of Market Efficiency Scores', fontsize=14)
    plt.xlabel('Efficiency Score (0-100, higher = more efficient)', fontsize=12)
    plt.ylabel('Count', fontsize=12)
    plt.grid(alpha=0.3)
    plt.legend()
    
    plt.savefig(os.path.join(save_dir, 'efficiency_score_distribution.png'), dpi=300, bbox_inches='tight')
    plt.show()
    
    # 2. Efficiency Classification Pie Chart
    plt.figure(figsize=(10, 8))
    results_df['efficiency_class'].value_counts().plot.pie(autopct='%1.1f%%', 
                                                         colors=sns.color_palette("viridis", 4),
                                                         startangle=90)
    plt.title('Market Efficiency Classification', fontsize=14)
    plt.ylabel('')  # Hide ylabel
    
    plt.savefig(os.path.join(save_dir, 'efficiency_classification_pie.png'), dpi=300, bbox_inches='tight')
    plt.show()
    
    # 3. Efficiency by Market Type (if available)
    if 'event_electionType' in results_df.columns:
        type_counts = results_df['event_electionType'].value_counts()
        types_with_data = type_counts[type_counts >= 5].index.tolist()
        
        if types_with_data:
            plt.figure(figsize=(12, 6))
            
            # Calculate average efficiency by type
            type_data = []
            for market_type in types_with_data:
                type_df = results_df[results_df['event_electionType'] == market_type]
                type_data.append({
                    'Market Type': market_type,
                    'Average Efficiency': type_df['efficiency_score'].mean(),
                    'Count': len(type_df)
                })
            
            type_df = pd.DataFrame(type_data).sort_values('Average Efficiency', ascending=False)
            
            # Create bar chart
            bars = plt.bar(type_df['Market Type'], type_df['Average Efficiency'], color='lightgreen')
            
            # Add count labels
            for i, bar in enumerate(bars):
                plt.text(bar.get_x() + bar.get_width()/2, 
                        bar.get_height() + 1, 
                        f"n={type_df['Count'].iloc[i]}", 
                        ha='center', va='bottom', fontsize=10)
            
            plt.axhline(y=results_df['efficiency_score'].mean(), color='red', linestyle='--', 
                       label=f'Overall Average: {results_df["efficiency_score"].mean():.2f}')
            
            plt.title('Average Efficiency Score by Market Type', fontsize=14)
            plt.xlabel('Market Type', fontsize=12)
            plt.ylabel('Average Efficiency Score', fontsize=12)
            plt.xticks(rotation=45, ha='right')
            plt.legend()
            plt.ylim(0, 100)
            plt.grid(axis='y', alpha=0.3)
            plt.tight_layout()
            
            plt.savefig(os.path.join(save_dir, 'efficiency_by_market_type.png'), dpi=300, bbox_inches='tight')
            plt.show()
    
    # 4. Efficiency by Country (if available)
    if 'event_country' in results_df.columns:
        country_counts = results_df['event_country'].value_counts()
        countries_with_data = country_counts[country_counts >= 5].index.tolist()
        
        if countries_with_data:
            plt.figure(figsize=(12, 6))
            
            country_data = []
            for country in countries_with_data:
                country_df = results_df[results_df['event_country'] == country]
                country_data.append({
                    'Country': country,
                    'Average Efficiency': country_df['efficiency_score'].mean(),
                    'Count': len(country_df)
                })
            
            country_df = pd.DataFrame(country_data).sort_values('Average Efficiency', ascending=False)
            
            bars = plt.bar(country_df['Country'], country_df['Average Efficiency'], color='skyblue')
            
            for i, bar in enumerate(bars):
                plt.text(bar.get_x() + bar.get_width()/2, 
                        bar.get_height() + 1, 
                        f"n={country_df['Count'].iloc[i]}", 
                        ha='center', va='bottom', fontsize=10)
            
            plt.axhline(y=results_df['efficiency_score'].mean(), color='red', linestyle='--', 
                       label=f'Overall Average: {results_df["efficiency_score"].mean():.2f}')
            
            plt.title('Average Efficiency Score by Country', fontsize=14)
            plt.xlabel('Country', fontsize=12)
            plt.ylabel('Average Efficiency Score', fontsize=12)
            plt.xticks(rotation=45, ha='right')
            plt.legend()
            plt.ylim(0, 100)
            plt.grid(axis='y', alpha=0.3)
            plt.tight_layout()
            
            plt.savefig(os.path.join(save_dir, 'efficiency_by_country.png'), dpi=300, bbox_inches='tight')
            plt.show()
    
    # 5. Efficiency vs Volume (if available)
    if 'volumeNum' in results_df.columns:
        plt.figure(figsize=(10, 6))
        
        # Use log scale for volume
        plt.scatter(results_df['volumeNum'], results_df['efficiency_score'], alpha=0.6)
        plt.xscale('log')
        
        # Add trend line
        try:
            z = np.polyfit(np.log10(results_df['volumeNum']), results_df['efficiency_score'], 1)
            p = np.poly1d(z)
            
            # Create x range for line (in log space)
            x_range = np.logspace(
                np.log10(results_df['volumeNum'].min()), 
                np.log10(results_df['volumeNum'].max()), 
                100
            )
            
            plt.plot(x_range, p(np.log10(x_range)), "r--", linewidth=2)
            
            # Calculate correlation
            corr = np.corrcoef(np.log10(results_df['volumeNum']), results_df['efficiency_score'])[0, 1]
            plt.text(0.05, 0.95, f"Correlation: {corr:.3f}", transform=plt.gca().transAxes,
                    bbox=dict(facecolor='white', alpha=0.8))
        except:
            pass
        
        plt.title('Efficiency Score vs Trading Volume', fontsize=14)
        plt.xlabel('Trading Volume (log scale)', fontsize=12)
        plt.ylabel('Efficiency Score', fontsize=12)
        plt.grid(alpha=0.3)
        
        plt.savefig(os.path.join(save_dir, 'efficiency_vs_volume.png'), dpi=300, bbox_inches='tight')
        plt.show()
    
    # 6. Time-varying efficiency results
    if 'efficiency_change' in results_df.columns:
        efficiency_changes = results_df['efficiency_change'].value_counts()
        
        plt.figure(figsize=(10, 6))
        bars = plt.bar(efficiency_changes.index, efficiency_changes.values, color=['green', 'gray', 'red'])
        
        # Add percentage labels
        total = len(results_df)
        for i, (category, count) in enumerate(efficiency_changes.items()):
            plt.text(i, count + 0.5, f"{count/total*100:.1f}%", ha='center', fontsize=12)
        
        plt.title('Efficiency Change Over Market Lifecycle', fontsize=14)
        plt.ylabel('Number of Markets', fontsize=12)
        plt.grid(axis='y', alpha=0.3)
        
        plt.savefig(os.path.join(save_dir, 'time_varying_efficiency.png'), dpi=300, bbox_inches='tight')
        plt.show()

## 8. Comprehensive Batch Analysis

Now let's run a larger batch analysis to get a comprehensive view of market efficiency.

In [None]:
# Configure batch analysis parameters
batch_size = 30  # Number of markets to analyze
parallel = True  # Use parallel processing

# Get markets for analysis
markets_to_analyze = analyzer.get_top_markets_by_volume(n=batch_size)

print(f"Running comprehensive analysis on {len(markets_to_analyze)} markets...")

# Run batch analysis
batch_results = analyzer.analyze_market_batch(markets_to_analyze, parallel=parallel)

# Generate summary
if batch_results:
    summary = analyzer.summarize_results(batch_results)
    
    # Visualize summary
    analyzer.visualize_summary(summary)
    
    # Save results
    analyzer.save_results(batch_results, "comprehensive_batch_results.json")
    analyzer.save_results(summary, "comprehensive_batch_summary.json")
    
    # Display key findings
    print("\nKey Findings:")
    print(f"Total markets analyzed: {summary['total_markets']}")
    print(f"Average efficiency score: {summary['average_efficiency_score']:.2f}")
    
    print("\nEfficiency Classes:")
    for cls, count in summary['efficiency_classes'].items():
        percentage = count / summary['total_markets'] * 100
        print(f"  {cls}: {count} markets ({percentage:.1f}%)")
    
    print("\nTest Results:")
    test_results = summary['test_results']
    print(f"  Non-stationary prices: {100 - test_results.get('price_stationary_percentage', 0):.1f}% (efficient)")
    print(f"  Stationary returns: {test_results.get('return_stationary_percentage', 0):.1f}% (efficient)")
    print(f"  No significant autocorrelation: {100 - test_results.get('has_autocorrelation_percentage', 0):.1f}% (efficient)")
    print(f"  Random runs test: {test_results.get('is_random_percentage', 0):.1f}% (efficient)")
    print(f"  No significant AR model: {100 - test_results.get('ar_significant_percentage', 0):.1f}% (efficient)")

## 9. Conclusion

In this notebook, we have analyzed the efficiency of Polymarket prediction markets using various statistical tests. We have found that:

1. Overall market efficiency varies by market type, with [highest/lowest] efficiency observed in [type] markets.
2. Price dynamics exhibit [characteristics] which [support/challenge] the efficient market hypothesis.
3. Cross-market relationships show [patterns] of information flow between related markets.
4. Time-varying efficiency analysis reveals [patterns] as markets progress.

These findings contribute to understanding how prediction markets aggregate information and their effectiveness as forecasting tools.