In [None]:
# Market Efficiency Analysis
# -------------------------
# This notebook provides tools for analyzing the efficiency of prediction markets.

import os
import sys
sys.path.append(os.path.abspath('.'))
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import acf, pacf, adfuller
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import warnings
from tqdm.notebook import tqdm

# Set plotting style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("viridis")

# Suppress warnings
warnings.filterwarnings('ignore')

# Make the `src` package importable.
# `from src.utils... import ...` requires the directory that CONTAINS `src`
# to be on sys.path; putting '../src' itself on the path does not make the
# name `src` resolvable. Look for a `src` directory in the working directory
# and in its parent, and add whichever directory holds it.
for candidate_root in (os.path.abspath('.'), os.path.abspath('..')):
    holds_src_package = os.path.isdir(os.path.join(candidate_root, 'src'))
    if holds_src_package and candidate_root not in sys.path:
        sys.path.append(candidate_root)

# Keep the original entry as well so modules that are imported directly from
# inside `src` (without the `src.` prefix) continue to resolve.
if '../src' not in sys.path:
    sys.path.append('../src')

# Import utility functions
from src.utils.data_loader import load_main_dataset, load_trade_data, get_sample_market_ids

# Read the cleaned election dataset from disk
print("Loading main dataset...")
main_df = load_main_dataset('data/cleaned_election_data.csv')
n_rows, n_cols = main_df.shape[0], main_df.shape[1]
print(f"Loaded dataset with {n_rows} rows and {n_cols} columns")

# Show which columns are available
print("\nColumn names in the dataset:")
available_columns = main_df.columns.tolist()
print(available_columns)

# Pick the ID column: prefer 'market_id', then 'id', otherwise fall back to
# the first column of the frame (and say so)
id_column = next(
    (name for name in ('market_id', 'id') if name in main_df.columns),
    None,
)
if id_column is None:
    id_column = main_df.columns[0]
    print(f"Using {id_column} as the ID column")

# Preview the first rows
print("\nSample data:")
display(main_df.head())

# Choose up to 10 distinct market IDs, ranked by 'volumeNum' when that column
# exists and by the ID column itself otherwise (descending in both cases)
sort_column = id_column
if 'volumeNum' in main_df.columns:
    sort_column = 'volumeNum'
ranked_df = main_df.sort_values(sort_column, ascending=False)
sample_markets = ranked_df[id_column].unique()[:10]
print(f"\nSelected {len(sample_markets)} markets for analysis")
print(sample_markets)


ModuleNotFoundError: No module named 'src'

In [None]:

# Function to preprocess market data
def preprocess_market_data(market_id, resample='1min'):
    """
    Convert raw trade data to a regular time series of prices and log returns.

    Parameters:
    -----------
    market_id : str
        The ID of the market to analyze
    resample : str
        Frequency to resample the time series (default: '1min')

    Returns:
    --------
    pd.DataFrame or None
        DataFrame indexed by timestamp with columns: price, log_return.
        Rows whose log return is undefined (first row, non-positive prices)
        are removed. None is returned when fewer than 30 trades are
        available or a required column (timestamp / price) is missing.
    """
    # Load trade data for the specific market
    trades_df = load_trade_data(market_id, trades_dir="data/trades")

    if trades_df is None or len(trades_df) < 30:
        print(f"Insufficient trade data for market {market_id}")
        return None

    if 'timestamp' not in trades_df.columns:
        print(f"No timestamp column found for market {market_id}")
        return None

    # Locate the price column ('price' takes precedence over 'price_num')
    if 'price' in trades_df.columns:
        price_source = 'price'
    elif 'price_num' in trades_df.columns:
        price_source = 'price_num'
    else:
        print(f"No price column found for market {market_id}")
        return None

    # Work on a private two-column copy so the frame handed back by the
    # loader is never modified by this function
    ticks = trades_df[['timestamp', price_source]].copy()
    ticks.columns = ['timestamp', 'price']

    # Ensure timestamp is a datetime type
    if not pd.api.types.is_datetime64_any_dtype(ticks['timestamp']):
        ticks['timestamp'] = pd.to_datetime(ticks['timestamp'])

    # Ensure price is numeric; unparseable values become NaN
    ticks['price'] = pd.to_numeric(ticks['price'], errors='coerce')

    # Drop rows that cannot be placed on the time axis or have no price
    ticks = ticks.dropna(subset=['timestamp', 'price'])

    # Stable sort: trades sharing a timestamp keep their original order, so
    # the "last trade in the bin" picked below is deterministic
    ticks = ticks.sort_values('timestamp', kind='stable')

    # Resample to regular intervals, carrying the last price forward through
    # intervals without trades
    price_series = ticks.set_index('timestamp')['price'].resample(resample).last()
    price_series = price_series.ffill()

    # Calculate log returns. A zero price makes the ratio 0 or infinite and
    # the log +/-inf, which dropna() would NOT remove; map those to NaN so
    # they are discarded together with the other undefined returns.
    with np.errstate(divide='ignore', invalid='ignore'):
        log_returns = np.log(price_series / price_series.shift(1))
    log_returns = log_returns.replace([np.inf, -np.inf], np.nan)

    result_df = pd.DataFrame({
        'price': price_series,
        'log_return': log_returns
    })

    # Drop rows with NaN (first interval and any undefined return)
    result_df = result_df.dropna()

    return result_df

# Test the preprocessing function: walk the sample markets in order and keep
# the first one that can be preprocessed. Handles an empty sample list and
# produces the same table and plots whichever market ends up being used.
test_market_id = None
market_data = None

for attempt, candidate_id in enumerate(sample_markets):
    test_market_id = candidate_id
    if attempt == 0:
        print(f"\nTesting preprocessing on market {test_market_id}")
    else:
        print("Failed to process market data. Let's try another market.")
        print(f"Trying market {test_market_id}")
    market_data = preprocess_market_data(test_market_id)
    if market_data is not None:
        break

if market_data is None:
    # Either the sample list was empty or every candidate was rejected
    print("No sample market could be preprocessed.")
else:
    print(f"Successfully processed market data with {len(market_data)} rows")
    display(market_data.head())

    # One figure per series: price first, then log returns
    series_to_plot = (
        ('price', 'Price', f'Price Series for Market {test_market_id}'),
        ('log_return', 'Log Return', f'Log Return Series for Market {test_market_id}'),
    )
    for column_name, y_label, plot_title in series_to_plot:
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(market_data.index, market_data[column_name])
        ax.set_title(plot_title)
        ax.set_xlabel('Date')
        ax.set_ylabel(y_label)
        ax.grid(True, alpha=0.3)
        plt.show()

# Function to run autocorrelation tests
def run_autocorrelation_tests(returns, lags=(60, 360, 1440)):
    """
    Run ACF/PACF tests on return series and plot them.

    Parameters:
    -----------
    returns : pd.Series
        Series of log returns
    lags : sequence of int
        Lag periods to test (in minutes). Each lag is capped so that it stays
        strictly below half the number of observations, because statsmodels
        refuses to compute partial autocorrelations beyond 50% of the sample.

    Returns:
    --------
    dict
        Maps "<effective_lag>min" to {'acf': [...], 'pacf': [...]}.
        Lags whose capped value is below 5 are skipped; requested lags that
        collapse onto the same capped value are computed and plotted once.
    """
    results = {}

    # Largest lag that is strictly below 50% of the sample size
    max_usable_lag = len(returns) // 2 - 1

    for lag in lags:
        effective_lag = min(lag, max_usable_lag)

        if effective_lag < 5:  # Skip if too few observations
            continue

        lag_key = f"{effective_lag}min"
        if lag_key in results:
            # A previous lag was capped to the same value; nothing new to do
            continue

        # Calculate ACF and PACF
        acf_values = acf(returns, nlags=effective_lag, fft=True)
        pacf_values = pacf(returns, nlags=effective_lag)

        results[lag_key] = {
            'acf': acf_values.tolist(),
            'pacf': pacf_values.tolist()
        }

        # Create plots
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
        plot_acf(returns, lags=effective_lag, ax=ax1, title=f'ACF - Lag {lag_key}')
        plot_pacf(returns, lags=effective_lag, ax=ax2, title=f'PACF - Lag {lag_key}')
        fig.tight_layout()
        plt.show()
        # Release the figure so repeated calls do not accumulate open figures
        plt.close(fig)

    return results

# Function to run ADF test
def run_adf_test(series, series_type='price'):
    """
    Apply the Augmented Dickey-Fuller unit-root test to a time series.

    Parameters:
    -----------
    series : pd.Series
        Time series to test; missing values are dropped before testing
    series_type : str
        Label for the kind of series ('price' or 'return'). It is accepted
        for the caller's convenience and does not influence the computation.

    Returns:
    --------
    dict
        Keys: 'adf_statistic', 'pvalue', 'critical_values', 'is_stationary'
    """
    cleaned = series.dropna()

    # adfuller returns a tuple; positions 0, 1 and 4 hold the test statistic,
    # the p-value and the critical values respectively
    statistic, p_value, _, _, critical_values, *_ = adfuller(cleaned)

    return {
        'adf_statistic': statistic,
        'pvalue': p_value,
        'critical_values': critical_values,
        # Reject the unit-root hypothesis when the p-value is below 0.05
        'is_stationary': p_value < 0.05,
    }

# Exercise the autocorrelation and ADF helpers on the preprocessed market;
# more than 60 rows are required before any test is attempted
enough_observations = market_data is not None and len(market_data) > 60

if not enough_observations:
    print("Not enough data to run time series tests")
else:
    print("\nRunning autocorrelation tests...")
    acf_results = run_autocorrelation_tests(market_data['log_return'], lags=[30, 60])

    print("\nRunning ADF test on price series...")
    adf_price = run_adf_test(market_data['price'], 'price')
    print(
        "ADF test on price series:",
        f"ADF Statistic: {adf_price['adf_statistic']:.4f}",
        f"p-value: {adf_price['pvalue']:.4f}",
        f"Is stationary: {adf_price['is_stationary']}",
        sep="\n",
    )

    print("\nRunning ADF test on return series...")
    adf_return = run_adf_test(market_data['log_return'], 'return')
    print(
        "ADF test on return series:",
        f"ADF Statistic: {adf_return['adf_statistic']:.4f}",
        f"p-value: {adf_return['pvalue']:.4f}",
        f"Is stationary: {adf_return['is_stationary']}",
        sep="\n",
    )

# Function to run a simplified efficiency analysis on a market
def analyze_market_efficiency(market_id):
    """
    Run a simplified market efficiency analysis on a single market.

    Parameters:
    -----------
    market_id : str
        ID of the market to analyze

    Returns:
    --------
    dict or None
        Dictionary with efficiency results (ADF tests on price and return,
        summary statistics, an autocorrelation flag and AR(1) estimates).
        None when the market has fewer than 30 usable observations or when
        the ADF test rejects the series as invalid input, so that one
        degenerate market does not abort a batch run.
    """
    result = {'market_id': market_id}

    # Preprocess market data
    market_data = preprocess_market_data(market_id)
    if market_data is None or len(market_data) < 30:
        return None

    prices = market_data['price']
    returns = market_data['log_return']

    # Run ADF tests. statsmodels raises ValueError for series it cannot test
    # (for example a constant series); skip such markets explicitly.
    try:
        result['adf_price'] = run_adf_test(prices, 'price')
        result['adf_return'] = run_adf_test(returns, 'return')
    except ValueError as e:
        print(f"Error running ADF test for market {market_id}: {e}")
        return None

    # Calculate summary statistics
    result['price_mean'] = prices.mean()
    result['price_std'] = prices.std()
    result['price_range'] = prices.max() - prices.min()
    result['return_mean'] = returns.mean()
    result['return_std'] = returns.std()

    # Check for autocorrelation (simplified): flag the market when any of the
    # first 10 autocorrelations exceeds the 1.96 / sqrt(n) band
    acf_values = acf(returns, nlags=10, fft=True)
    significance_band = 1.96 / np.sqrt(len(market_data))
    result['significant_autocorrelation'] = bool(
        np.any(np.abs(acf_values[1:]) > significance_band)
    )

    # Fit AR(1) model. The defaults below are what is reported when the fit
    # fails or yields no lag coefficient, so the keys are always present.
    ar1_coefficient, ar1_pvalue = 0, 1
    try:
        model_fit = AutoReg(returns, lags=1).fit()

        # Convert to plain arrays before indexing by position: params/pvalues
        # are label-indexed when the input is a Series, and integer lookups
        # on such a Series are not reliably positional.
        params = np.asarray(model_fit.params)
        pvalues = np.asarray(model_fit.pvalues)

        fitted_coefficient = float(params[1]) if len(params) > 1 else 0
        fitted_pvalue = float(pvalues[1]) if len(pvalues) > 1 else 1
    except Exception as e:
        print(f"Error fitting AR model for market {market_id}: {e}")
    else:
        ar1_coefficient, ar1_pvalue = fitted_coefficient, fitted_pvalue

    result['ar1_coefficient'] = ar1_coefficient
    result['ar1_pvalue'] = ar1_pvalue
    result['ar1_significant'] = bool(ar1_pvalue < 0.05)

    return result

# Run the analysis on sample markets
print("\nRunning simplified efficiency analysis on sample markets...")

# Only the first 5 markets are analysed to keep this test run quick; markets
# for which the analysis returned nothing are left out of the results
analysis_outcomes = (
    analyze_market_efficiency(candidate_id) for candidate_id in tqdm(sample_markets[:5])
)
efficiency_results = [outcome for outcome in analysis_outcomes if outcome]

if not efficiency_results:
    print("No efficiency results available")
else:
    # Tabulate and display the per-market results
    efficiency_df = pd.DataFrame(efficiency_results)
    print("\nEfficiency analysis results:")
    display(efficiency_df)

    n_markets = len(efficiency_df)
    price_is_stationary = efficiency_df['adf_price'].apply(lambda adf: adf['is_stationary'])
    return_is_stationary = efficiency_df['adf_return'].apply(lambda adf: adf['is_stationary'])

    # Counts of markets exhibiting each property
    efficiency_summary = {
        'markets_analyzed': n_markets,
        'price_non_stationary': sum(~price_is_stationary),
        'return_stationary': sum(return_is_stationary),
        'significant_autocorrelation': sum(efficiency_df['significant_autocorrelation']),
        'significant_ar1': sum(efficiency_df['ar1_significant'])
    }

    print("\nSummary of efficiency metrics:")
    for metric_name in efficiency_summary:
        print(f"{metric_name}: {efficiency_summary[metric_name]}")

    # Express each count as a share of the analysed markets
    if n_markets > 0:
        print("\nPercentages:")
        percentage_rows = (
            ("Non-stationary price series", 'price_non_stationary'),
            ("Stationary return series", 'return_stationary'),
            ("Significant autocorrelation", 'significant_autocorrelation'),
            ("Significant AR(1) coefficient", 'significant_ar1'),
        )
        for description, metric_name in percentage_rows:
            share = efficiency_summary[metric_name] / n_markets * 100
            print(f"{description}: {share:.1f}%")