# 1. Configuration

In [172]:
# Standard library imports
import os
import sys
import json
from datetime import datetime
from typing import Dict, List, Optional, Union, Tuple

# Data processing imports
import numpy as np
import pandas as pd

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning imports
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Optional imports - handle gracefully if not available
try:
    from scipy import stats
    SCIPY_AVAILABLE = True
except ImportError:
    SCIPY_AVAILABLE = False

# Add parent directory to path for importing utilities
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir) if current_dir.endswith('notebooks') else current_dir
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Import utility functions from the project
try:
    from src.utils.data_loader import (
        load_main_dataset, 
        load_trade_data, 
        get_token_ids_for_market,
        find_token_id_file
    )
except ImportError:
    print("Warning: Could not import data_loader utilities. Some functions may not work.")

# Set plotting style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("viridis")

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [173]:
# Market Selection Configuration
MARKET_SELECTION = {
    'by_name': [
        "Will Donald Trump win the 2024 US Presidential Election?",
        "Will Kamala Harris win the 2024 US Presidential Election?"
    ],
    'by_id': [],  # Add specific market IDs here if needed
    'top_n_by_volume': 0,  # Set to a number > 0 to analyze top N markets by volume
    'min_volume': 0,  # Minimum volume threshold
    'date_range': None,  # Set to (start_date, end_date) to filter by date
}

# Analysis Configuration
ANALYSIS_CONFIG = {
    'run_trader_distribution': True,
    'run_whale_identification': True,
    'run_trading_inequality': True,
    'run_trader_classification': True,
    'run_market_dynamics': True,
    'whale_threshold': 0.01,  # Top 1% traders by volume are considered whales
    'trader_clusters': 5,  # Number of trader clusters for classification
    'save_results': True,  # Whether to save results to files
    'results_dir': 'results/trader_analysis',
    'generate_plots': True  # Whether to generate plots
}

# 2. Data Loading

In [174]:
def load_market_data(market_config):
    """
    Load market data based on configuration
    
    Parameters:
    -----------
    market_config : dict
        Dictionary with market selection parameters
        
    Returns:
    --------
    tuple
        (market_data, trade_data) - DataFrames with market and trade data
    """
    print("Loading main dataset...")
    market_data = load_main_dataset('data/cleaned_election_data.csv')
    
    if market_data is None:
        print("Failed to load market data")
        return None, None
    
    # Filter markets based on configuration
    selected_markets = market_data.copy()
    
    # Filter by name if specified
    if market_config['by_name'] and len(market_config['by_name']) > 0:
        selected_markets = selected_markets[selected_markets['question'].isin(market_config['by_name'])]
        print(f"Selected {len(selected_markets)} markets by name")
    
    # Filter by ID if specified
    if market_config['by_id'] and len(market_config['by_id']) > 0:
        id_filter = selected_markets['id'].isin(market_config['by_id'])
        if len(selected_markets) > 0:
            selected_markets = selected_markets[id_filter]
        else:
            selected_markets = market_data[id_filter]
        print(f"Selected {len(selected_markets)} markets by ID")
    
    # Filter by top N by volume
    if market_config['top_n_by_volume'] > 0:
        if 'volumeNum' in market_data.columns:
            top_markets = market_data.sort_values('volumeNum', ascending=False).head(
                market_config['top_n_by_volume'])
            
            if len(selected_markets) > 0:
                # Intersect with already selected markets
                selected_markets = selected_markets[selected_markets['id'].isin(top_markets['id'])]
            else:
                selected_markets = top_markets
                
            print(f"Selected {len(selected_markets)} top markets by volume")
    
    # Apply minimum volume filter if specified
    if market_config['min_volume'] > 0 and 'volumeNum' in market_data.columns:
        volume_filter = selected_markets['volumeNum'] >= market_config['min_volume']
        selected_markets = selected_markets[volume_filter]
        print(f"Selected {len(selected_markets)} markets with minimum volume {market_config['min_volume']}")
    
    # Apply date range filter if specified
    if market_config['date_range'] is not None and len(market_config['date_range']) == 2:
        start_date, end_date = market_config['date_range']
        if 'market_start_date' in selected_markets.columns and 'market_end_date' in selected_markets.columns:
            date_filter = (
                (selected_markets['market_start_date'] >= start_date) & 
                (selected_markets['market_end_date'] <= end_date)
            )
            selected_markets = selected_markets[date_filter]
            print(f"Selected {len(selected_markets)} markets in date range {start_date} to {end_date}")
    
    # If no markets were selected, use default selection
    if len(selected_markets) == 0:
        print("No markets matched selection criteria. Using default selection.")
        if market_config['top_n_by_volume'] > 0:
            selected_markets = market_data.sort_values('volumeNum', ascending=False).head(
                market_config['top_n_by_volume'])
        else:
            selected_markets = market_data.head(2)  # Default to first 2 markets
    
    print(f"Final selection: {len(selected_markets)} markets")
    
    # Display selected markets
    if len(selected_markets) > 0:
        print("\nSelected Markets:")
        for i, (idx, row) in enumerate(selected_markets.iterrows()):
            market_name = row['question'] if 'question' in row else f"Market {row['id']}"
            print(f"{i+1}. {market_name} (ID: {row['id']})")
    
    # Load trade data for selected markets
    market_ids = selected_markets['id'].tolist()
    trade_data = load_trade_data_for_analysis(market_ids=market_ids)
    
    return selected_markets, trade_data

# 3. Data Processing

In [175]:
def preprocess_trade_data(trade_data):
    """
    Preprocess trade data for analysis
    
    Parameters:
    -----------
    trade_data : pd.DataFrame
        Raw trade data
        
    Returns:
    --------
    pd.DataFrame
        Cleaned and preprocessed trade data
    """
    print("\n" + "="*80)
    print("DATA PREPROCESSING")
    print("="*80)
    
    if trade_data is None or len(trade_data) == 0:
        print("No trade data to preprocess")
        return None
    
    # Create a copy to avoid modifying the original
    df = trade_data.copy()
    
    # 1. Handle missing values
    initial_rows = len(df)
    print(f"Initial rows: {initial_rows:,}")
    
    # Check for missing values in key columns
    missing_values = df.isnull().sum()
    print("\nMissing values in key columns:")
    for col, missing in missing_values.items():
        if missing > 0:
            print(f"  {col}: {missing:,} ({missing/len(df)*100:.1f}%)")
    
    # Drop rows with missing critical values
    critical_columns = ['trader_id']
    if any(col in df.columns for col in critical_columns):
        present_critical = [col for col in critical_columns if col in df.columns]
        df = df.dropna(subset=present_critical)
        print(f"Rows after dropping missing critical values: {len(df):,}")
    
    # 2. Handle timestamps
    if 'timestamp' in df.columns:
        if not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
            print("Converting timestamps to datetime...")
            try:
                df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
                df = df.dropna(subset=['timestamp'])
                print(f"Converted {len(df):,} timestamps")
            except Exception as e:
                print(f"Error converting timestamps: {e}")
                # Create a sequential index if conversion fails
                print("Creating sequential timestamps instead")
                df = df.sort_index()
                df['timestamp'] = pd.Series(range(len(df)))
        
        # Sort by timestamp
        df = df.sort_values('timestamp')
    
    # 3. Normalize trader IDs
    # Ensure consistent trader ID format
    if 'maker_id' in df.columns and 'trader_id' not in df.columns:
        df['trader_id'] = df['maker_id']
        print("Created trader_id from maker_id")
    elif 'maker' in df.columns and 'trader_id' not in df.columns:
        df['trader_id'] = df['maker']
        print("Created trader_id from maker column")
    
    # Check if there's a trader_id column now
    if 'trader_id' not in df.columns:
        print("Warning: No trader_id column available")
    else:
        # Convert trader_id to string type for consistency
        df['trader_id'] = df['trader_id'].astype(str)
        unique_traders = df['trader_id'].nunique()
        print(f"Unique traders identified: {unique_traders:,}")
    
    # 4. Normalize trade amounts
    # Check if we need to scale trade amounts
    if 'trade_amount' in df.columns:
        # Check if values are extremely large (likely in base units)
        median_value = df['trade_amount'].median()
        
        if median_value > 10000:  # Threshold suggesting base units
            scaling_factor = 1e6  # Standard scaling for USDC/USD
            print(f"Scaling trade_amount by factor of {scaling_factor:,.0f}")
            
            # Store original values
            df['trade_amount_original'] = df['trade_amount']
            
            # Scale values
            df['trade_amount'] = df['trade_amount'] / scaling_factor
    elif 'size' in df.columns and 'trade_amount' not in df.columns:
        # Convert size to numeric if needed
        df['size'] = pd.to_numeric(df['size'], errors='coerce')
        
        # Check if values are extremely large
        median_value = df['size'].median()
        
        if median_value > 10000:  # Threshold suggesting base units
            scaling_factor = 1e6  # Standard scaling for USDC/USD
            print(f"Creating trade_amount from size with scaling factor of {scaling_factor:,.0f}")
            
            # Create scaled trade_amount
            df['trade_amount'] = df['size'] / scaling_factor
        else:
            # Use size directly
            print("Creating trade_amount from size (no scaling needed)")
            df['trade_amount'] = df['size']
    else:
        print("Warning: No trade_amount or size column available")
        # Create a default trade_amount column if needed
        df['trade_amount'] = 1.0
        print("Created default trade_amount column with value 1.0")
    
    # 5. Add price change column if price exists
    if 'price' in df.columns:
        # Convert price to numeric
        df['price'] = pd.to_numeric(df['price'], errors='coerce')
        
        # Calculate price changes
        df['price_change'] = df['price'].diff()
        
        # Calculate summary statistics
        print("\nPrice statistics:")
        print(f"  Min: {df['price'].min():.6f}")
        print(f"  Max: {df['price'].max():.6f}")
        print(f"  Mean: {df['price'].mean():.6f}")
        print(f"  Std Dev: {df['price'].std():.6f}")
    
    # Print summary of preprocessing
    print("\nPreprocessing complete:")
    print(f"Initial rows: {initial_rows:,}")
    print(f"Final rows: {len(df):,}")
    print(f"Dropped rows: {initial_rows - len(df):,} ({(initial_rows - len(df))/initial_rows*100:.1f}%)")
    
    return df

# 4. Analysis Functions

## a. Trader Distribution

In [176]:
def analyze_trader_distribution(trade_data, config, save_prefix='trader_distribution'):
    """
    Analyze trader distribution patterns
    
    Parameters:
    -----------
    trade_data : pd.DataFrame
        DataFrame with trade data
    config : dict
        Analysis configuration
    save_prefix : str
        Prefix for saved files
        
    Returns:
    --------
    dict
        Dictionary with analysis results
    """
    print("\n" + "="*80)
    print("TRADER DISTRIBUTION ANALYSIS")
    print("="*80)
    
    if trade_data is None or len(trade_data) == 0:
        print("No trade data available for analysis")
        return None
    
    results_dir = config['results_dir']
    os.makedirs(results_dir, exist_ok=True)
    
    # Calculate trader-level metrics
    trader_metrics = trade_data.groupby('trader_id').agg({
        'trade_amount': ['sum', 'mean', 'count'],
        'price': ['mean', 'std'] if 'price' in trade_data.columns else None
    })
    
    # Flatten column names
    trader_metrics.columns = [
        f"{col[0]}_{col[1]}" if col[1] else col[0] 
        for col in trader_metrics.columns
    ]
    
    # Reset index to make trader_id a column
    trader_metrics = trader_metrics.reset_index()
    
    # Calculate key statistics
    total_traders = len(trader_metrics)
    total_volume = trader_metrics['trade_amount_sum'].sum()
    avg_trades_per_trader = trader_metrics['trade_amount_count'].mean()
    median_trades_per_trader = trader_metrics['trade_amount_count'].median()
    
    print(f"Total traders: {total_traders:,}")
    print(f"Total volume: {total_volume:,.2f}")
    print(f"Average trades per trader: {avg_trades_per_trader:.2f}")
    print(f"Median trades per trader: {median_trades_per_trader:.0f}")
    
    # Create visualizations if enabled
    if config['generate_plots']:
        # 1. Trade count distribution
        plt.figure(figsize=(12, 8))
        
        # Use log scale for better visualization
        log_counts = np.log10(trader_metrics['trade_amount_count'] + 1)  # +1 to handle zeros
        
        plt.hist(log_counts, bins=50, alpha=0.7, color='skyblue')
        plt.title('Trader Activity Distribution (Log Scale)')
        plt.xlabel('Log10(Number of Trades)')
        plt.ylabel('Number of Traders')
        plt.grid(alpha=0.3)
        
        # Save plot
        plt.tight_layout()
        plt.savefig(os.path.join(results_dir, f"{save_prefix}_activity.png"), dpi=300)
        plt.close()
        
        # 2. Volume distribution
        plt.figure(figsize=(12, 8))
        
        # Use log scale for better visualization
        log_volumes = np.log10(trader_metrics['trade_amount_sum'] + 1)  # +1 to handle zeros
        
        plt.hist(log_volumes, bins=50, alpha=0.7, color='green')
        plt.title('Trader Volume Distribution (Log Scale)')
        plt.xlabel('Log10(Trading Volume)')
        plt.ylabel('Number of Traders')
        plt.grid(alpha=0.3)
        
        # Save plot
        plt.tight_layout()
        plt.savefig(os.path.join(results_dir, f"{save_prefix}_volume.png"), dpi=300)
        plt.close()
    
    # Create summary statistics for return
    deciles = [10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99]
    volume_percentiles = {
        f"{p}th_percentile": trader_metrics['trade_amount_sum'].quantile(p/100)
        for p in deciles
    }
    
    activity_percentiles = {
        f"{p}th_percentile": trader_metrics['trade_amount_count'].quantile(p/100)
        for p in deciles
    }
    
    summary = {
        'total_traders': total_traders,
        'total_volume': total_volume,
        'avg_trades_per_trader': avg_trades_per_trader,
        'median_trades_per_trader': median_trades_per_trader,
        'volume_percentiles': volume_percentiles,
        'activity_percentiles': activity_percentiles
    }
    
    # Save summary if enabled
    if config['save_results']:
        with open(os.path.join(results_dir, f"{save_prefix}_summary.json"), 'w') as f:
            json.dump(summary, f, indent=2, default=str)
    
    return summary, trader_metrics


## b. Whale Identification


In [177]:
def identify_whales(trade_data, config, save_prefix='whale_identification'):
    """
    Identify whale traders based on specified criteria
    
    Parameters:
    -----------
    trade_data : pd.DataFrame
        DataFrame with trade data
    config : dict
        Analysis configuration
    save_prefix : str
        Prefix for saved files
        
    Returns:
    --------
    tuple
        (whale_ids, whale_results) - List of whale IDs and analysis results
    """
    print("\n" + "="*80)
    print("WHALE TRADER IDENTIFICATION")
    print("="*80)
    
    if trade_data is None or len(trade_data) == 0:
        print("No trade data available for analysis")
        return [], {}
    
    # Ensure we have the necessary columns
    if 'trader_id' not in trade_data.columns or 'trade_amount' not in trade_data.columns:
        print("Error: Missing required columns (trader_id, trade_amount)")
        return [], {}
    
    # Get configuration parameters
    threshold = config['whale_threshold']
    results_dir = config['results_dir']
    generate_plots = config['generate_plots']
    
    os.makedirs(results_dir, exist_ok=True)
    
    # Group trades by trader and calculate total volume
    trader_volumes = trade_data.groupby('trader_id')['trade_amount'].sum().sort_values(ascending=False)
    
    # Calculate total volume
    total_volume = trader_volumes.sum()
    total_traders = len(trader_volumes)
    
    print(f"Total traders: {total_traders:,}")
    print(f"Total volume: {total_volume:,.2f}")
    
    # Create cumulative volume percentages
    cumulative_volumes = trader_volumes.cumsum()
    cumulative_percentages = cumulative_volumes / total_volume * 100
    
    # Create DataFrame for analysis
    trader_analysis = pd.DataFrame({
        'trader_id': trader_volumes.index,
        'volume': trader_volumes.values,
        'cumulative_volume': cumulative_volumes.values,
        'volume_pct': trader_volumes.values / total_volume * 100,
        'cumulative_pct': cumulative_percentages.values
    })
    
    # Calculate Gini coefficient
    gini = calculate_gini(trader_volumes.values)
    print(f"Volume concentration (Gini coefficient): {gini:.4f}")
    
    # Define percentile thresholds to evaluate
    percentile_thresholds = [0.001, 0.01, 0.05, 0.1]
    
    # Calculate metrics for each threshold
    threshold_metrics = []
    for pct in percentile_thresholds:
        num_whales = max(1, int(total_traders * pct))
        whale_volume = trader_volumes.iloc[:num_whales].sum()
        whale_volume_pct = whale_volume / total_volume * 100
        
        # Store metrics
        threshold_metrics.append({
            'threshold': pct,
            'threshold_label': f"Top {pct*100:.1f}%",
            'num_whales': num_whales,
            'whale_volume': float(whale_volume),
            'whale_volume_pct': float(whale_volume_pct),
            'trader_pct': float(num_whales / total_traders * 100)
        })
        
        print(f"Top {pct*100:.1f}% definition ({num_whales:,} traders): {whale_volume_pct:.2f}% of volume")
    
    # Calculate volume coverage thresholds
    volume_thresholds = [50, 75, 90, 95]
    coverage_metrics = []
    
    for pct in volume_thresholds:
        # Find traders needed to reach this volume percentage
        traders_needed = sum(cumulative_percentages < pct) + 1
        traders_needed = min(traders_needed, len(trader_volumes))
        
        # Get the actual volume percentage
        actual_pct = cumulative_percentages.iloc[traders_needed-1] if traders_needed <= len(cumulative_percentages) else 100
        
        coverage_metrics.append({
            'volume_threshold': pct,
            'threshold_label': f"{pct}% Volume",
            'num_traders': int(traders_needed),
            'actual_volume_pct': float(actual_pct),
            'trader_pct': float(traders_needed / total_traders * 100)
        })
        
        print(f"Traders needed for {pct}% volume: {traders_needed:,} ({traders_needed/total_traders*100:.4f}% of all traders)")
    
    # Create visualizations
    if generate_plots:
        # Create figure for combined plots
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
        
        # 1. Lorenz curve
        ax1.plot(np.linspace(0, 100, len(trader_volumes)), 
                 np.insert(cumulative_percentages.values, 0, 0), 
                 'b-', linewidth=2, label='Volume distribution')
        ax1.plot([0, 100], [0, 100], 'k--', label='Perfect equality')
        ax1.fill_between(np.linspace(0, 100, len(trader_volumes)), 
                          np.insert(cumulative_percentages.values, 0, 0), 
                          np.linspace(0, 100, len(trader_volumes)+1), 
                          alpha=0.2)
        
        # Add key percentiles
        for p in [90, 95, 99, 99.9]:
            # Calculate index for this percentile
            idx = min(int(total_traders * (100-p)/100), len(trader_volumes)-1)
            if idx >= 0:
                # Get x and y coordinates
                x = idx / total_traders * 100
                y = cumulative_percentages.iloc[idx] if idx < len(cumulative_percentages) else 100
                
                # Add reference lines
                ax1.plot([x, x], [0, y], 'r--', alpha=0.5)
                ax1.plot([0, x], [y, y], 'r--', alpha=0.5)
                
                # Add label
                ax1.text(x + 1, 10 + (p-90)*3, f'Top {100-p}%', fontsize=10)
        
        ax1.set_title(f'Trading Volume Distribution (Gini: {gini:.4f})')
        ax1.set_xlabel('Cumulative % of Traders')
        ax1.set_ylabel('Cumulative % of Volume')
        ax1.grid(alpha=0.3)
        ax1.legend()
        
        # 2. Whale definition comparison
        percent_definitions = pd.DataFrame(threshold_metrics)
        
        # Plot bars for percentage of traders vs percentage of volume
        bar_width = 0.35
        x = np.arange(len(percent_definitions))
        
        ax2.bar(x - bar_width/2, percent_definitions['trader_pct'], 
               bar_width, label='% of Traders', color='skyblue')
        ax2.bar(x + bar_width/2, percent_definitions['whale_volume_pct'], 
               bar_width, label='% of Volume', color='orange')
        
        # Set x-axis labels
        ax2.set_xticks(x)
        ax2.set_xticklabels(percent_definitions['threshold_label'])
        
        # Add value labels on bars
        for i, v in enumerate(percent_definitions['trader_pct']):
            ax2.text(i - bar_width/2, v + 1, f"{v:.2f}%", ha='center', fontsize=9)
        
        for i, v in enumerate(percent_definitions['whale_volume_pct']):
            ax2.text(i + bar_width/2, v + 1, f"{v:.2f}%", ha='center', fontsize=9)
        
        ax2.set_title('Whale Definitions Comparison')
        ax2.set_ylabel('Percentage')
        ax2.set_ylim(0, 100)
        ax2.grid(axis='y', alpha=0.3)
        ax2.legend()
        
        plt.tight_layout()
        plt.savefig(os.path.join(results_dir, f"{save_prefix}_analysis.png"), dpi=300)
        plt.close()
        
        print(f"Whale definition analysis visualizations saved to {save_prefix}_analysis.png")
    
    # Use the specified threshold
    num_whales = max(1, int(total_traders * threshold))
    whale_ids = trader_volumes.head(num_whales).index.tolist()
    
    print(f"\nUsing top {threshold*100:.1f}% definition: {num_whales:,} whales")
    print(f"Selected whale threshold volume: {trader_volumes.iloc[num_whales-1] if num_whales <= len(trader_volumes) else 0:.2f}")
    
    # Save results if enabled
    if config['save_results']:
        results = {
            'gini_coefficient': gini,
            'threshold_used': threshold,
            'num_whales': num_whales,
            'threshold_metrics': threshold_metrics,
            'coverage_metrics': coverage_metrics,
            'whale_volume_percentage': trader_volumes.head(num_whales).sum() / total_volume * 100
        }
        
        with open(os.path.join(results_dir, f"{save_prefix}_results.json"), 'w') as f:
            json.dump(results, f, indent=2)
    
    # Return whale IDs and analysis results
    return whale_ids, {
        'trader_analysis': trader_analysis,
        'threshold_metrics': threshold_metrics,
        'coverage_metrics': coverage_metrics,
        'gini': gini,
        'selected_threshold': threshold,
        'selected_num_whales': num_whales
    }

## c. Gini coefficient

In [178]:

def calculate_gini(values):
    """
    Calculate Gini coefficient for an array of values
    """
    # Handle edge cases
    if len(values) <= 1 or np.sum(values) == 0:
        return 0
    
    # Sort values
    sorted_values = np.sort(values)
    n = len(sorted_values)
    
    # Calculate cumulative sum
    cumsum = np.cumsum(sorted_values)
    
    # Calculate Gini coefficient using the formula
    return (n + 1 - 2 * np.sum((n + 1 - np.arange(1, n+1)) * sorted_values) / np.sum(sorted_values)) / n


## d. Trader Classification

In [179]:
def classify_traders(trade_data, config, save_prefix='trader_classification'):
    """
    Classify traders into different types based on behavior patterns
    
    Parameters:
    -----------
    trade_data : pd.DataFrame
        DataFrame with trade data
    config : dict
        Analysis configuration
    save_prefix : str
        Prefix for saved files
        
    Returns:
    --------
    dict
        Dictionary with classification results
    """
    print("\n" + "="*80)
    print("TRADER CLASSIFICATION ANALYSIS")
    print("="*80)
    
    if trade_data is None or len(trade_data) == 0:
        print("No trade data available for analysis")
        return None
    
    # Configuration parameters
    n_clusters = config['trader_clusters']
    results_dir = config['results_dir']
    generate_plots = config['generate_plots']
    
    os.makedirs(results_dir, exist_ok=True)
    
    print("Calculating trader features...")
    
    # Group by trader_id and calculate features
    trader_features = []
    
    # For each trader, calculate features
    for trader_id in trade_data['trader_id'].unique():
        # Get all trades for this trader
        trader_trades = trade_data[trade_data['trader_id'] == trader_id]
        
        # Skip traders with too few trades
        if len(trader_trades) < 3:
            continue
            
        # Basic activity metrics
        trade_count = len(trader_trades)
        
        # Trade size metrics
        if 'trade_amount' in trader_trades.columns:
            avg_trade_size = trader_trades['trade_amount'].mean()
            total_volume = trader_trades['trade_amount'].sum()
            trade_size_volatility = trader_trades['trade_amount'].std() / avg_trade_size if avg_trade_size > 0 else 0
        else:
            avg_trade_size = np.nan
            total_volume = np.nan
            trade_size_volatility = np.nan
        
        # Trader diversity (market participation)
        if 'market_id' in trader_trades.columns:
            market_count = trader_trades['market_id'].nunique()
            market_concentration = (trader_trades.groupby('market_id').size() / trade_count).max()
        else:
            market_count = 1
            market_concentration = 1.0
            
        # Trading direction bias
        if 'side' in trader_trades.columns:
            buy_count = (trader_trades['side'] == 'buy').sum()
            sell_count = (trader_trades['side'] == 'sell').sum()
            if buy_count + sell_count > 0:
                buy_ratio = buy_count / (buy_count + sell_count)
            else:
                buy_ratio = 0.5
        else:
            buy_ratio = 0.5
            
        # Store features
        trader_features.append({
            'trader_id': trader_id,
            'trade_count': trade_count,
            'avg_trade_size': avg_trade_size,
            'total_volume': total_volume,
            'trade_size_volatility': trade_size_volatility,
            'market_count': market_count,
            'market_concentration': market_concentration,
            'buy_ratio': buy_ratio
        })
    
    if not trader_features:
        print("No trader features calculated")
        return None
        
    # Create DataFrame
    trader_df = pd.DataFrame(trader_features)
    print(f"Calculated features for {len(trader_df)} traders")
    
    # Select features for clustering
    features_for_clustering = [
        'trade_count', 'avg_trade_size', 'market_concentration', 
        'buy_ratio', 'trade_size_volatility'
    ]
    
    # Filter to available features and remove any with all NaN values
    available_features = []
    for f in features_for_clustering:
        if f in trader_df.columns and not trader_df[f].isna().all():
            available_features.append(f)
    
    print(f"Using {len(available_features)} features for clustering: {available_features}")
    
    if len(available_features) < 2:
        print("Insufficient features for clustering")
        return None
        
    # Handle missing values
    X = trader_df[available_features].copy()
    X = X.fillna(X.mean())
    
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    print("Performing clustering...")
    
    # Apply K-means clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    trader_df['cluster'] = kmeans.fit_predict(X_scaled)
    
    # Calculate cluster profiles
    cluster_profiles = trader_df.groupby('cluster')[available_features].mean()
    cluster_sizes = trader_df['cluster'].value_counts().sort_index()
    cluster_profiles['size'] = cluster_sizes.values
    cluster_profiles['percentage'] = 100 * cluster_sizes / cluster_sizes.sum()
    
    # Interpret clusters
    cluster_names = {}
    for cluster_id in range(n_clusters):
        profile = cluster_profiles.loc[cluster_id]
        
        # Calculate z-scores for this cluster compared to others
        z_scores = {}
        for feature in available_features:
            feature_mean = cluster_profiles[feature].mean()
            feature_std = cluster_profiles[feature].std()
            if feature_std > 0:
                z_scores[feature] = (profile[feature] - feature_mean) / feature_std
            else:
                z_scores[feature] = 0
                
        # Determine cluster type based on most extreme z-scores
        top_feature = max(z_scores.items(), key=lambda x: abs(x[1]))
        
        if top_feature[0] == 'trade_count' and top_feature[1] > 1:
            name = "High Frequency Traders"
        elif top_feature[0] == 'avg_trade_size' and top_feature[1] > 1:
            name = "Whale Traders"
        elif top_feature[0] == 'market_concentration' and top_feature[1] > 1:
            name = "Market Specialists" 
        elif top_feature[0] == 'buy_ratio':
            if top_feature[1] > 1:
                name = "Bullish Traders"
            else:
                name = "Bearish Traders"
        elif top_feature[0] == 'trade_size_volatility' and top_feature[1] > 1:
            name = "Opportunistic Traders"
        else:
            name = "Balanced Traders"
            
        cluster_names[cluster_id] = name
    
    # Add names to profiles
    cluster_profiles['type'] = [cluster_names[i] for i in cluster_profiles.index]
    
    # Create visualizations if enabled
    if generate_plots:
        # Create PCA visualization
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X_scaled)
        
        # Create visualization DataFrame
        viz_df = pd.DataFrame({
            'PC1': X_pca[:, 0],
            'PC2': X_pca[:, 1],
            'Cluster': trader_df['cluster'],
            'Type': trader_df['cluster'].map(cluster_names)
        })
        
        # Calculate explained variance
        explained_variance = pca.explained_variance_ratio_
        
        # Create PCA plot
        plt.figure(figsize=(10, 8))
        sns.scatterplot(data=viz_df, x='PC1', y='PC2', hue='Type', palette='viridis', s=50, alpha=0.7)
        plt.title('Trader Types - PCA Visualization')
        plt.xlabel(f'PC1 ({explained_variance[0]:.1%} variance)')
        plt.ylabel(f'PC2 ({explained_variance[1]:.1%} variance)')
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.savefig(os.path.join(results_dir, f"{save_prefix}_pca.png"), dpi=300, bbox_inches='tight')
        plt.close()
        
        # Create bar chart of cluster sizes
        plt.figure(figsize=(12, 6))
        
        # Sort by size
        sorted_profiles = cluster_profiles.sort_values('size', ascending=False)
        
        # Create bar chart
        plt.bar(
            range(len(sorted_profiles)), 
            sorted_profiles['size'],
            tick_label=[f"{cluster_names[i]}\\n({sorted_profiles.loc[i, 'percentage']:.1f}%)" 
                       for i in sorted_profiles.index]
        )
        
        plt.title('Trader Type Distribution')
        plt.ylabel('Number of Traders')
        plt.tight_layout()
        plt.savefig(os.path.join(results_dir, f"{save_prefix}_distribution.png"), dpi=300, bbox_inches='tight')
        plt.close()
    
    # Calculate feature importance
    feature_importance = {}
    for i, feature in enumerate(available_features):
        feature_importance[feature] = np.abs(kmeans.cluster_centers_).mean(axis=0)[i]
    
    # Save results if enabled
    if config['save_results']:
        results = {
            'cluster_profiles': cluster_profiles.to_dict(),
            'cluster_names': cluster_names,
            'feature_importance': feature_importance,
            'trader_counts': cluster_sizes.to_dict()
        }
        
        with open(os.path.join(results_dir, f"{save_prefix}_results.json"), 'w') as f:
            json.dump(results, f, indent=2, default=str)
    
    # Add trader type to original dataframe
    trader_df['trader_type'] = trader_df['cluster'].map(cluster_names)
    
    # Print summary statistics by trader type
    print("\nTrader Type Summary:")
    for cluster_id, name in cluster_names.items():
        cluster_size = cluster_sizes[cluster_id]
        cluster_pct = cluster_size / len(trader_df) * 100
        print(f"  {name}: {cluster_size:,} traders ({cluster_pct:.1f}%)")
    
    # Print feature importance
    print("\nFeature Importance:")
    sorted_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
    for feature, importance in sorted_features:
        print(f"  {feature}: {importance:.4f}")
    
    return {
        'trader_features': trader_df,
        'cluster_profiles': cluster_profiles,
        'cluster_names': cluster_names,
        'feature_importance': feature_importance
    }

## e. Market Dynamics

In [180]:
def analyze_market_dynamics(trade_data, whale_ids, market_data, config, save_prefix='market_dynamics'):
    """
    Analyze market dynamics including whale impact and price movements
    
    Parameters:
    -----------
    trade_data : pd.DataFrame
        DataFrame with trade data
    whale_ids : list
        List of whale trader IDs
    market_data : pd.DataFrame
        DataFrame with market metadata
    config : dict
        Analysis configuration
    save_prefix : str
        Prefix for saved files
        
    Returns:
    --------
    dict
        Dictionary with market dynamics analysis results
    """
    print("\n" + "="*80)
    print("MARKET DYNAMICS ANALYSIS")
    print("="*80)
    
    if trade_data is None or len(trade_data) == 0:
        print("No trade data available for analysis")
        return None
    
    if not whale_ids:
        print("No whale traders identified for analysis")
        return None
    
    results_dir = config['results_dir']
    generate_plots = config['generate_plots']
    
    os.makedirs(results_dir, exist_ok=True)
    
    # Clean the data
    df = trade_data.copy()
    
    # Add whale indicator
    df['is_whale'] = df['trader_id'].isin(whale_ids)
    
    # Ensure price is numeric
    if 'price' in df.columns:
        df['price'] = pd.to_numeric(df['price'], errors='coerce')
    
    # Split whale and non-whale trades
    whale_trades = df[df['is_whale']]
    non_whale_trades = df[~df['is_whale']]
    
    print(f"Analyzing {len(whale_trades)} whale trades and {len(non_whale_trades)} non-whale trades")
    
    # 1. Market-level price impact analysis
    if 'market_id' in df.columns and 'price' in df.columns:
        market_impacts = []
        
        for market_id, market_df in df.groupby('market_id'):
            # Skip markets with too few trades
            if len(market_df) < 10:
                continue
                
            # Sort by timestamp
            if 'timestamp' in market_df.columns:
                if not pd.api.types.is_datetime64_any_dtype(market_df['timestamp']):
                    try:
                        market_df['timestamp'] = pd.to_datetime(market_df['timestamp'], errors='coerce')
                        market_df = market_df.dropna(subset=['timestamp'])
                    except:
                        # Create a sequential timestamp if conversion fails
                        market_df = market_df.sort_index()
                        market_df['timestamp'] = pd.Series(range(len(market_df)))
                
                market_df = market_df.sort_values('timestamp')
            
            # Calculate price changes
            market_df['price_change'] = market_df['price'].diff()
            
            # Skip markets with no price changes
            if market_df['price_change'].isna().all() or market_df['price_change'].abs().sum() == 0:
                continue
            
            # Separate whale and non-whale trades
            market_whale_trades = market_df[market_df['is_whale']]
            market_non_whale_trades = market_df[~market_df['is_whale']]
            
            # Skip markets with no whale trades
            if len(market_whale_trades) == 0:
                continue
                
            # Calculate average metrics
            whale_avg_change = market_whale_trades['price_change'].mean()
            non_whale_avg_change = market_non_whale_trades['price_change'].mean()
            
            # Calculate directional impact
            whale_pos_pct = (market_whale_trades['price_change'] > 0).mean() * 100
            whale_neg_pct = (market_whale_trades['price_change'] < 0).mean() * 100
            non_whale_pos_pct = (market_non_whale_trades['price_change'] > 0).mean() * 100
            non_whale_neg_pct = (market_non_whale_trades['price_change'] < 0).mean() * 100
            
            # Get market name if available
            market_name = market_data.loc[market_data['id'] == market_id, 'question'].iloc[0] if 'question' in market_data.columns else f"Market {market_id}"
            
            market_impacts.append({
                'market_id': market_id,
                'market_name': market_name,
                'total_trades': len(market_df),
                'whale_trades': len(market_whale_trades),
                'non_whale_trades': len(market_non_whale_trades),
                'whale_trade_pct': len(market_whale_trades) / len(market_df) * 100,
                'whale_avg_change': float(whale_avg_change),
                'non_whale_avg_change': float(non_whale_avg_change),
                'whale_pos_pct': float(whale_pos_pct),
                'whale_neg_pct': float(whale_neg_pct),
                'non_whale_pos_pct': float(non_whale_pos_pct),
                'non_whale_neg_pct': float(non_whale_neg_pct)
            })
        
        # Create markets DataFrame
        if market_impacts:
            markets_df = pd.DataFrame(market_impacts)
            
            # Calculate weighted averages
            weighted_whale_impact = np.average(
                markets_df['whale_avg_change'],
                weights=markets_df['whale_trades']
            )
            
            weighted_non_whale_impact = np.average(
                markets_df['non_whale_avg_change'],
                weights=markets_df['non_whale_trades']
            )
            
            print(f"\nWeighted average whale price impact: {weighted_whale_impact:.6f}")
            print(f"Weighted average non-whale price impact: {weighted_non_whale_impact:.6f}")
            
            # Calculate impact ratio if possible
            if weighted_non_whale_impact != 0:
                impact_ratio = weighted_whale_impact / weighted_non_whale_impact
                print(f"Impact ratio (whale/non-whale): {impact_ratio:.4f}")
            else:
                impact_ratio = None
                print("Impact ratio cannot be calculated (division by zero)")
            
            # Create visualizations
            if generate_plots:
                plt.figure(figsize=(15, 12))
                
                # 1. Market-by-market comparison
                plt.subplot(2, 1, 1)
                
                # Sort markets by whale impact
                sorted_markets = markets_df.sort_values('whale_avg_change')
                
                # Plot whale vs non-whale impact by market
                plt.scatter(range(len(sorted_markets)), sorted_markets['whale_avg_change'], 
                           label='Whale impact', alpha=0.7, s=50, color='blue')
                plt.scatter(range(len(sorted_markets)), sorted_markets['non_whale_avg_change'], 
                           label='Non-whale impact', alpha=0.7, s=50, color='orange')
                
                plt.axhline(y=0, color='r', linestyle='--')
                plt.title('Price Impact by Market')
                plt.xlabel('Markets (sorted by whale impact)')
                plt.ylabel('Average Price Change')
                plt.legend()
                plt.grid(alpha=0.3)
                
                # 2. Direction comparison
                plt.subplot(2, 1, 2)
                
                # Calculate average positive/negative percentages
                avg_whale_pos = markets_df['whale_pos_pct'].mean()
                avg_whale_neg = markets_df['whale_neg_pct'].mean()
                avg_nonwhale_pos = markets_df['non_whale_pos_pct'].mean()
                avg_nonwhale_neg = markets_df['non_whale_neg_pct'].mean()
                
                # Plot directional impact
                labels = ['Whale', 'Non-whale']
                pos_values = [avg_whale_pos, avg_nonwhale_pos]
                neg_values = [avg_whale_neg, avg_nonwhale_neg]
                neutral_values = [100 - avg_whale_pos - avg_whale_neg, 
                                 100 - avg_nonwhale_pos - avg_nonwhale_neg]
                
                width = 0.35
                x = np.arange(len(labels))
                
                plt.bar(x, pos_values, width, label='Positive impact', color='green')
                plt.bar(x, neg_values, width, bottom=pos_values, label='Negative impact', color='red')
                plt.bar(x, neutral_values, width, 
                       bottom=[pos_values[i] + neg_values[i] for i in range(len(pos_values))], 
                       label='Neutral', color='gray')
                
                plt.title('Direction of Price Impact')
                plt.ylabel('Percentage of Trades')
                plt.xlabel('Trader Type')
                plt.xticks(x, labels)
                plt.legend()
                
                plt.tight_layout()
                plt.savefig(os.path.join(results_dir, f"{save_prefix}_price_impact.png"), dpi=300)
                plt.close()
                
                print(f"Market price impact visualization saved to {save_prefix}_price_impact.png")
            
            # Save results if enabled
            if config['save_results']:
                results = {
                    'market_impacts': market_impacts,
                    'weighted_whale_impact': float(weighted_whale_impact),
                    'weighted_non_whale_impact': float(weighted_non_whale_impact),
                    'impact_ratio': float(impact_ratio) if impact_ratio is not None else None,
                    'avg_whale_positive_pct': float(avg_whale_pos),
                    'avg_whale_negative_pct': float(avg_whale_neg),
                    'avg_nonwhale_positive_pct': float(avg_nonwhale_pos),
                    'avg_nonwhale_negative_pct': float(avg_nonwhale_neg)
                }
                
                with open(os.path.join(results_dir, f"{save_prefix}_results.json"), 'w') as f:
                    json.dump(results, f, indent=2)
            
            return {
                'market_impacts': market_impacts,
                'weighted_whale_impact': float(weighted_whale_impact),
                'weighted_non_whale_impact': float(weighted_non_whale_impact),
                'impact_ratio': float(impact_ratio) if impact_ratio is not None else None,
                'direction_metrics': {
                    'whale_positive_pct': float(avg_whale_pos),
                    'whale_negative_pct': float(avg_whale_neg),
                    'non_whale_positive_pct': float(avg_nonwhale_pos),
                    'non_whale_negative_pct': float(avg_nonwhale_neg)
                }
            }
    
    # If market-level analysis was not possible, do overall analysis
    print("Analyzing overall price changes...")
    
    # Calculate metrics
    whale_avg_change = whale_trades['price_change'].mean() if 'price_change' in whale_trades.columns else None
    whale_median_change = whale_trades['price_change'].median() if 'price_change' in whale_trades.columns else None
    whale_std_change = whale_trades['price_change'].std() if 'price_change' in whale_trades.columns else None
    
    non_whale_avg_change = non_whale_trades['price_change'].mean() if 'price_change' in non_whale_trades.columns else None
    non_whale_median_change = non_whale_trades['price_change'].median() if 'price_change' in non_whale_trades.columns else None
    non_whale_std_change = non_whale_trades['price_change'].std() if 'price_change' in non_whale_trades.columns else None
    
    if all(x is not None for x in [whale_avg_change, non_whale_avg_change]):
        print(f"\nWhale trades average price change: {whale_avg_change:.6f}")
        print(f"Non-whale trades average price change: {non_whale_avg_change:.6f}")
    else:
        print("Price change metrics not available")
        
    return {
        'overall_metrics': {
            'whale_avg_change': float(whale_avg_change) if whale_avg_change is not None else None,
            'whale_median_change': float(whale_median_change) if whale_median_change is not None else None,
            'whale_std_change': float(whale_std_change) if whale_std_change is not None else None,
            'non_whale_avg_change': float(non_whale_avg_change) if non_whale_avg_change is not None else None,
            'non_whale_median_change': float(non_whale_median_change) if non_whale_median_change is not None else None,
            'non_whale_std_change': float(non_whale_std_change) if non_whale_std_change is not None else None
        }
    }

## f. Lorenz Curve

In [181]:
def create_lorenz_curve_visualization(whale_analysis_df):
    """
    Create Lorenz curve visualization for trader volume distribution
    
    Parameters:
    -----------
    whale_analysis_df : pd.DataFrame
        DataFrame with trader volume analysis
    """
    plt.figure(figsize=(10, 8))
    
    # Sort traders by volume
    df = whale_analysis_df.sort_values('volume')
    
    # Calculate cumulative percentages
    df['trader_pct'] = np.arange(1, len(df) + 1) / len(df) * 100
    df['volume_pct'] = df['volume'].cumsum() / df['volume'].sum() * 100
    
    # Plot Lorenz curve
    plt.plot(df['trader_pct'], df['volume_pct'], label='Trading volume distribution')
    
    # Plot line of equality
    plt.plot([0, 100], [0, 100], 'k--', label='Perfect equality')
    
    # Fill the area representing the Gini coefficient
    plt.fill_between(df['trader_pct'], df['trader_pct'], df['volume_pct'], alpha=0.2)
    
    # Calculate Gini coefficient
    gini = 1 - np.trapz(df['volume_pct'], df['trader_pct']) / 5000  # Area under perfect equality is 5000 (100*100/2)
    
    # Add key percentiles
    percentiles = [90, 95, 99, 99.9]
    for p in percentiles:
        threshold_idx = int(len(df) * (100 - p) / 100)
        if threshold_idx < len(df):
            x = df['trader_pct'].iloc[threshold_idx]
            y = df['volume_pct'].iloc[threshold_idx]
            plt.plot([x, x], [0, y], 'r--', alpha=0.5)
            plt.plot([0, x], [y, y], 'r--', alpha=0.5)
            plt.text(x + 1, y - 5, f'Top {100-p}%', fontsize=10)
    
    plt.title(f'Trading Volume Distribution (Gini Coefficient: {gini:.4f})')
    plt.xlabel('Cumulative % of Traders')
    plt.ylabel('Cumulative % of Volume')
    plt.grid(alpha=0.3)
    plt.legend()
    
    plt.tight_layout()
    plt.savefig('trading_volume_lorenz.png', dpi=300)
    plt.close()
    
    print(f"Lorenz curve visualization saved as trading_volume_lorenz.png")
    return gini

## g. Market Accuracy

In [182]:

def analyze_market_accuracy_by_whale_activity(trades_df, market_data, whale_ids):
    """
    Analyze how whale activity correlates with market prediction accuracy
    
    Parameters:
    -----------
    trades_df : pd.DataFrame
        DataFrame with trade data
    market_data : pd.DataFrame
        DataFrame with market-level data including accuracy metrics
    whale_ids : list
        List of whale trader IDs
    
    Returns:
    --------
    dict
        Dictionary with accuracy analysis results
    """
    print("Analyzing market accuracy by whale activity...")
    
    # Verify we have required columns in market_data
    required_cols = ['market_id', 'brier_score']
    missing_cols = [col for col in required_cols if col not in market_data.columns]
    
    if missing_cols:
        print(f"Error: Missing required columns in market_data: {missing_cols}")
        print("Required columns: market_id, brier_score (or other accuracy metric)")
        return None
    
    # Clean trades data
    df = trades_df.copy()
    df['is_whale'] = df['trader_id'].isin(whale_ids)
    
    # Calculate whale activity metrics per market
    market_metrics = []
    
    for market_id, market_trades in df.groupby('market_id'):
        whale_trades = market_trades[market_trades['is_whale']]
        non_whale_trades = market_trades[~market_trades['is_whale']]
        
        # Skip markets with too few trades
        if len(market_trades) < 10:
            continue
            
        metrics = {
            'market_id': market_id,
            'total_trades': len(market_trades),
            'whale_trades': len(whale_trades),
            'non_whale_trades': len(non_whale_trades),
            'whale_ratio': len(whale_trades) / len(market_trades) if len(market_trades) > 0 else 0,
            'unique_whales': whale_trades['trader_id'].nunique(),
            'unique_non_whales': non_whale_trades['trader_id'].nunique(),
            'whale_volume': whale_trades['trade_amount'].sum() if 'trade_amount' in whale_trades else 0,
            'non_whale_volume': non_whale_trades['trade_amount'].sum() if 'trade_amount' in non_whale_trades else 0,
            'whale_volume_ratio': (
                whale_trades['trade_amount'].sum() / market_trades['trade_amount'].sum() 
                if 'trade_amount' in market_trades and market_trades['trade_amount'].sum() > 0 
                else 0
            ),
        }
        
        market_metrics.append(metrics)
    
    # Convert to DataFrame
    metrics_df = pd.DataFrame(market_metrics)
    
    # Merge with market accuracy data
    merged_df = metrics_df.merge(
        market_data[['market_id', 'brier_score']], 
        on='market_id', 
        how='inner'
    )
    
    if len(merged_df) == 0:
        print("Error: Could not merge trade metrics with market accuracy data")
        return None
    
    # Calculate correlations between whale activity and accuracy
    whale_metrics = ['whale_ratio', 'unique_whales', 'whale_volume_ratio']
    correlations = {}
    
    for metric in whale_metrics:
        if metric in merged_df.columns:
            corr = merged_df[metric].corr(merged_df['brier_score'])
            correlations[f"{metric}_correlation"] = corr
            print(f"Correlation between {metric} and Brier score: {corr:.4f}")
    
    # Group by whale activity level
    merged_df['whale_activity_quantile'] = pd.qcut(
        merged_df['whale_ratio'], 
        q=4, 
        labels=['Low', 'Medium-Low', 'Medium-High', 'High']
    )
    
    # Calculate average accuracy by whale activity
    accuracy_by_activity = merged_df.groupby('whale_activity_quantile')['brier_score'].mean()
    
    print("\nAverage Brier score by whale activity level:")
    for activity, score in accuracy_by_activity.items():
        print(f"{activity} whale activity: {score:.4f}")
    
    return {
        'market_metrics': merged_df.to_dict('records'),
        'correlations': correlations,
        'accuracy_by_activity': accuracy_by_activity.to_dict()
    }


## h. Time

In [183]:
def analyze_trader_behavior_over_time(trades_df, whale_ids, time_unit='D'):
    """
    Analyze how trading patterns evolve over time for whales vs non-whales
    
    Parameters:
    -----------
    trades_df : pd.DataFrame
        DataFrame with trade data
    whale_ids : list
        List of whale trader IDs
    time_unit : str
        Time unit for aggregation ('D' for day, 'W' for week, etc.)
        
    Returns:
    --------
    dict
        Dictionary with temporal analysis results
    """
    print(f"Analyzing trader behavior over time (unit: {time_unit})...")
    
    # Clean the data
    df = trades_df.copy()
    
    # Ensure we have timestamps
    if 'timestamp' not in df.columns:
        print("Error: No timestamp column available")
        return None
        
    # Convert timestamp to datetime if needed
    if not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
        try:
            df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
            df = df.dropna(subset=['timestamp'])
            print(f"Converted {len(df)} timestamps to datetime format")
        except Exception as e:
            print(f"Error converting timestamps: {e}")
            return None
    
    # Add whale indicator
    df['is_whale'] = df['trader_id'].isin(whale_ids)
    
    # Ensure price is numeric
    if 'price' in df.columns:
        df['price'] = pd.to_numeric(df['price'], errors='coerce')
    
    # Group by date and trader type
    df['date'] = df['timestamp'].dt.floor(time_unit)
    
    # Calculate metrics by date and trader type
    time_metrics = []
    
    for (date, is_whale), group in df.groupby(['date', 'is_whale']):
        # Skip groups with no trades
        if len(group) == 0:
            continue
            
        metrics = {
            'date': date,
            'is_whale': is_whale,
            'trader_type': 'Whale' if is_whale else 'Non-whale',
            'trade_count': len(group),
            'unique_traders': group['trader_id'].nunique(),
            'volume': group['trade_amount'].sum(),
            'avg_trade_size': group['trade_amount'].mean()
        }
        
        # Add price metrics if available
        if 'price' in group.columns:
            metrics.update({
                'avg_price': group['price'].mean(),
                'price_std': group['price'].std(),
                'price_range': group['price'].max() - group['price'].min() if len(group) > 1 else 0
            })
        
        time_metrics.append(metrics)
    
    # Convert to DataFrame
    time_df = pd.DataFrame(time_metrics)
    
    # Check if we have enough data
    if len(time_df) < 2:
        print("Not enough temporal data to analyze")
        return None
    
    # Create visualization
    plt.figure(figsize=(15, 12))
    
    # 1. Activity over time (top)
    plt.subplot(3, 1, 1)
    
    # Create pivot table for activity
    pivot_activity = time_df.pivot_table(
        index='date', columns='trader_type', values='trade_count'
    ).fillna(0)
    
    # Plot activity
    pivot_activity.plot(ax=plt.gca())
    plt.title('Trading Activity Over Time')
    plt.ylabel('Number of Trades')
    plt.grid(alpha=0.3)
    
    # 2. Volume over time (middle)
    plt.subplot(3, 1, 2)
    
    # Create pivot table for volume
    pivot_volume = time_df.pivot_table(
        index='date', columns='trader_type', values='volume'
    ).fillna(0)
    
    # Plot volume
    pivot_volume.plot(ax=plt.gca())
    plt.title('Trading Volume Over Time')
    plt.ylabel('Volume')
    plt.grid(alpha=0.3)
    
    # 3. Trader participation over time (bottom)
    plt.subplot(3, 1, 3)
    
    # Create pivot table for unique traders
    pivot_traders = time_df.pivot_table(
        index='date', columns='trader_type', values='unique_traders'
    ).fillna(0)
    
    # Plot trader counts
    pivot_traders.plot(ax=plt.gca())
    plt.title('Trader Participation Over Time')
    plt.ylabel('Number of Unique Traders')
    plt.grid(alpha=0.3)
    
    plt.tight_layout()
    plt.savefig('trader_behavior_over_time.png', dpi=300)
    plt.close()
    
    print("Temporal analysis visualization saved to trader_behavior_over_time.png")
    
    # Calculate correlation between whale and non-whale activity
    if all(col in pivot_activity.columns for col in ['Whale', 'Non-whale']):
        activity_correlation = pivot_activity['Whale'].corr(pivot_activity['Non-whale'])
        print(f"Correlation between whale and non-whale activity: {activity_correlation:.4f}")
        
        # Calculate lead-lag relationship with a 1-period lag
        whale_lead_corr = pivot_activity['Whale'].shift(1).corr(pivot_activity['Non-whale'])
        nonwhale_lead_corr = pivot_activity['Non-whale'].shift(1).corr(pivot_activity['Whale'])
        
        if whale_lead_corr > nonwhale_lead_corr:
            print(f"Whales appear to lead non-whale activity (correlation: {whale_lead_corr:.4f})")
        else:
            print(f"Non-whales appear to lead whale activity (correlation: {nonwhale_lead_corr:.4f})")
    
    # Calculate trends
    if len(pivot_activity) >= 5:
        # Calculate whale activity trend
        if 'Whale' in pivot_activity.columns:
            whale_trend = pivot_activity['Whale'].rolling(min(5, len(pivot_activity))).mean()
            whale_trend_direction = np.sign(whale_trend.diff().mean())
            print(f"Whale activity trend: {'Increasing' if whale_trend_direction > 0 else 'Decreasing'}")
        
        # Calculate non-whale activity trend
        if 'Non-whale' in pivot_activity.columns:
            nonwhale_trend = pivot_activity['Non-whale'].rolling(min(5, len(pivot_activity))).mean()
            nonwhale_trend_direction = np.sign(nonwhale_trend.diff().mean())
            print(f"Non-whale activity trend: {'Increasing' if nonwhale_trend_direction > 0 else 'Decreasing'}")
    
    return {
        'time_data': time_df.to_dict('records'),
        'activity_correlation': activity_correlation if 'activity_correlation' in locals() else None,
        'whale_lead_correlation': whale_lead_corr if 'whale_lead_corr' in locals() else None,
        'nonwhale_lead_correlation': nonwhale_lead_corr if 'nonwhale_lead_corr' in locals() else None,
        'whale_trend_direction': whale_trend_direction if 'whale_trend_direction' in locals() else None,
        'nonwhale_trend_direction': nonwhale_trend_direction if 'nonwhale_trend_direction' in locals() else None
    }

## i. Price Impact

In [184]:
def analyze_whale_impact(trades_df, whale_ids):
    """
    Analyze the impact of whale trades on market prices
    
    Parameters:
    -----------
    trades_df : pd.DataFrame
        DataFrame with trade-level data
    whale_ids : list
        List of whale trader IDs
    
    Returns:
    --------
    dict
        Dictionary with whale impact analysis results
    """
    print("Analyzing whale trade impact...")
    
    # Verify whale_ids is not None and not empty
    if whale_ids is None or len(whale_ids) == 0:
        print("Error: No whale trader IDs provided")
        return None
    
    # Make a copy of the data
    df = trades_df.copy()
    
    # Check for required columns
    required_cols = ['trader_id', 'price']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        print(f"Error: Missing required columns: {missing_cols}")
        return None
    
    # Clean price data
    df['price'] = pd.to_numeric(df['price'], errors='coerce')
    df = df.dropna(subset=['price'])
    
    # Convert timestamp to datetime if needed
    if 'timestamp' in df.columns:
        if not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
            try:
                df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
                df = df.dropna(subset=['timestamp'])
                print(f"Converted {len(df)} valid timestamps")
            except Exception as e:
                print(f"Warning: Could not convert timestamps to datetime: {e}")
                # Create sequential index as timestamp substitute
                df = df.sort_index().reset_index(drop=True)
                df['timestamp'] = df.index
    else:
        print("No timestamp column found. Creating sequential index.")
        df = df.reset_index(drop=True)
        df['timestamp'] = df.index
    
    # Sort by timestamp
    df = df.sort_values('timestamp')
    
    # Add whale indicator
    df['is_whale'] = df['trader_id'].isin(whale_ids)
    
    # Separate whale and non-whale trades
    whale_trades = df[df['is_whale']]
    non_whale_trades = df[~df['is_whale']]
    
    print(f"Found {len(whale_trades)} whale trades and {len(non_whale_trades)} non-whale trades")
    
    # Calculate price changes
    df['price_change'] = df['price'].diff()
    
    # Calculate impact by market if market_id is available
    if 'market_id' in df.columns:
        print("Analyzing price impact by market...")
        market_impacts = []
        
        for market_id, market_df in df.groupby('market_id'):
            # Skip markets with too few trades
            if len(market_df) < 10:
                continue
                
            # Sort by timestamp
            market_df = market_df.sort_values('timestamp')
            
            # Calculate price changes
            market_df['price_change'] = market_df['price'].diff()
            
            # Separate whale and non-whale trades
            market_whale_trades = market_df[market_df['is_whale']]
            market_non_whale_trades = market_df[~market_df['is_whale']]
            
            # Skip markets with no whale trades
            if len(market_whale_trades) == 0:
                continue
                
            # Calculate average metrics
            whale_avg_change = market_whale_trades['price_change'].mean()
            non_whale_avg_change = market_non_whale_trades['price_change'].mean()
            
            # Calculate directional impact
            whale_pos_pct = (market_whale_trades['price_change'] > 0).mean() * 100
            whale_neg_pct = (market_whale_trades['price_change'] < 0).mean() * 100
            non_whale_pos_pct = (market_non_whale_trades['price_change'] > 0).mean() * 100
            non_whale_neg_pct = (market_non_whale_trades['price_change'] < 0).mean() * 100
            
            market_impacts.append({
                'market_id': market_id,
                'total_trades': len(market_df),
                'whale_trades': len(market_whale_trades),
                'non_whale_trades': len(market_non_whale_trades),
                'whale_avg_change': whale_avg_change,
                'non_whale_avg_change': non_whale_avg_change,
                'whale_pos_pct': whale_pos_pct,
                'whale_neg_pct': whale_neg_pct,
                'non_whale_pos_pct': non_whale_pos_pct,
                'non_whale_neg_pct': non_whale_neg_pct
            })
        
        # Create markets DataFrame
        if market_impacts:
            markets_df = pd.DataFrame(market_impacts)
            
            # Calculate weighted averages
            weighted_whale_impact = np.average(
                markets_df['whale_avg_change'].fillna(0),
                weights=markets_df['whale_trades']
            )
            
            weighted_non_whale_impact = np.average(
                markets_df['non_whale_avg_change'].fillna(0),
                weights=markets_df['non_whale_trades']
            )
            
            print(f"\nWeighted average whale price impact: {weighted_whale_impact:.6f}")
            print(f"Weighted average non-whale price impact: {weighted_non_whale_impact:.6f}")
            
            # Calculate impact ratio if possible
            if weighted_non_whale_impact != 0:
                impact_ratio = weighted_whale_impact / weighted_non_whale_impact
                print(f"Impact ratio (whale/non-whale): {impact_ratio:.4f}")
            else:
                impact_ratio = None
                print("Impact ratio cannot be calculated (division by zero)")
            
            # Create visualization
            plt.figure(figsize=(15, 12))
            
            # 1. Market-by-market comparison
            plt.subplot(2, 1, 1)
            
            # Sort markets by whale impact
            sorted_markets = markets_df.sort_values('whale_avg_change')
            
            # Plot whale vs non-whale impact by market
            plt.scatter(range(len(sorted_markets)), sorted_markets['whale_avg_change'], 
                       label='Whale impact', alpha=0.7, s=50, color='blue')
            plt.scatter(range(len(sorted_markets)), sorted_markets['non_whale_avg_change'], 
                       label='Non-whale impact', alpha=0.7, s=50, color='orange')
            
            plt.axhline(y=0, color='r', linestyle='--')
            plt.title('Price Impact by Market')
            plt.xlabel('Markets (sorted by whale impact)')
            plt.ylabel('Average Price Change')
            plt.legend()
            plt.grid(alpha=0.3)
            
            # 2. Direction comparison
            plt.subplot(2, 1, 2)
            
            # Calculate average positive/negative percentages
            avg_whale_pos = markets_df['whale_pos_pct'].mean()
            avg_whale_neg = markets_df['whale_neg_pct'].mean()
            avg_nonwhale_pos = markets_df['non_whale_pos_pct'].mean()
            avg_nonwhale_neg = markets_df['non_whale_neg_pct'].mean()
            
            # Plot directional impact
            labels = ['Whale', 'Non-whale']
            pos_values = [avg_whale_pos, avg_nonwhale_pos]
            neg_values = [avg_whale_neg, avg_nonwhale_neg]
            neutral_values = [100 - avg_whale_pos - avg_whale_neg, 
                             100 - avg_nonwhale_pos - avg_nonwhale_neg]
            
            width = 0.35
            x = np.arange(len(labels))
            
            plt.bar(x, pos_values, width, label='Positive impact', color='green')
            plt.bar(x, neg_values, width, bottom=pos_values, label='Negative impact', color='red')
            plt.bar(x, neutral_values, width, 
                   bottom=[pos_values[i] + neg_values[i] for i in range(len(pos_values))], 
                   label='Neutral', color='gray')
            
            plt.title('Direction of Price Impact')
            plt.ylabel('Percentage of Trades')
            plt.xlabel('Trader Type')
            plt.xticks(x, labels)
            plt.legend()
            
            plt.tight_layout()
            plt.savefig('whale_impact_analysis.png', dpi=300)
            plt.close()
            
            print("Whale impact analysis visualization saved to whale_impact_analysis.png")
            
            return {
                'market_impacts': markets_df.to_dict('records'),
                'weighted_whale_impact': weighted_whale_impact,
                'weighted_non_whale_impact': weighted_non_whale_impact,
                'impact_ratio': impact_ratio,
                'direction_metrics': {
                    'whale_positive_pct': avg_whale_pos,
                    'whale_negative_pct': avg_whale_neg,
                    'non_whale_positive_pct': avg_nonwhale_pos,
                    'non_whale_negative_pct': avg_nonwhale_neg
                }
            }
    
    # If market_id not available, perform overall analysis
    print("Analyzing overall price changes...")
    
    # Calculate metrics
    whale_avg_change = whale_trades['price_change'].mean()
    whale_median_change = whale_trades['price_change'].median()
    whale_std_change = whale_trades['price_change'].std()
    
    non_whale_avg_change = non_whale_trades['price_change'].mean()
    non_whale_median_change = non_whale_trades['price_change'].median()
    non_whale_std_change = non_whale_trades['price_change'].std()
    
    print(f"\nWhale trades average price change: {whale_avg_change:.6f}")
    print(f"Non-whale trades average price change: {non_whale_avg_change:.6f}")
    
    # Calculate direction metrics
    whale_pos_pct = (whale_trades['price_change'] > 0).mean() * 100
    whale_neg_pct = (whale_trades['price_change'] < 0).mean() * 100
    non_whale_pos_pct = (non_whale_trades['price_change'] > 0).mean() * 100
    non_whale_neg_pct = (non_whale_trades['price_change'] < 0).mean() * 100
    
    print(f"Whale trades causing price increases: {whale_pos_pct:.2f}%")
    print(f"Whale trades causing price decreases: {whale_neg_pct:.2f}%")
    
    # Calculate following behavior
    df['next_is_whale'] = df['is_whale'].shift(-1)
    df['prev_is_whale'] = df['is_whale'].shift(1)
    
    # Calculate price direction
    df['price_direction'] = np.sign(df['price_change'])
    df['next_price_direction'] = df['price_direction'].shift(-1)
    df['prev_price_direction'] = df['price_direction'].shift(1)
    
    # Calculate how often non-whales follow whale direction
    whale_followed = df[(df['prev_is_whale']) & (~df['is_whale']) & 
                      (df['price_direction'] == df['prev_price_direction'])]
    whale_trades_with_followers = df[df['prev_is_whale'] & ~df['is_whale']]
    
    if len(whale_trades_with_followers) > 0:
        following_ratio = len(whale_followed) / len(whale_trades_with_followers)
        print(f"Non-whale traders follow whale price direction: {following_ratio:.2%} of the time")
    else:
        following_ratio = None
        print("Could not calculate following ratio")
    
    # Create visualization
    plt.figure(figsize=(15, 10))
    
    # 1. Price change distribution
    plt.subplot(2, 1, 1)
    
    # Calculate bins for histogram
    bin_width = max(whale_std_change, non_whale_std_change) / 5
    bins = np.arange(
        min(whale_trades['price_change'].min(), non_whale_trades['price_change'].min()) - bin_width,
        max(whale_trades['price_change'].max(), non_whale_trades['price_change'].max()) + bin_width,
        bin_width
    )
    
    # Plot histograms
    plt.hist(whale_trades['price_change'].dropna(), bins=bins, alpha=0.5, 
             label=f'Whale trades (mean={whale_avg_change:.6f})', color='blue')
    plt.hist(non_whale_trades['price_change'].dropna(), bins=bins, alpha=0.5, 
             label=f'Non-whale trades (mean={non_whale_avg_change:.6f})', color='orange')
    
    plt.axvline(x=0, color='r', linestyle='--')
    plt.axvline(x=whale_avg_change, color='blue', linestyle='-')
    plt.axvline(x=non_whale_avg_change, color='orange', linestyle='-')
    
    plt.title('Distribution of Price Changes')
    plt.xlabel('Price Change')
    plt.ylabel('Frequency')
    plt.legend()
    plt.grid(alpha=0.3)
    
    # 2. Direction comparison
    plt.subplot(2, 1, 2)
    
    # Plot directional impact
    labels = ['Whale', 'Non-whale']
    pos_values = [whale_pos_pct, non_whale_pos_pct]
    neg_values = [whale_neg_pct, non_whale_neg_pct]
    neutral_values = [100 - whale_pos_pct - whale_neg_pct, 
                     100 - non_whale_pos_pct - non_whale_neg_pct]
    
    width = 0.35
    x = np.arange(len(labels))
    
    plt.bar(x, pos_values, width, label='Positive impact', color='green')
    plt.bar(x, neg_values, width, bottom=pos_values, label='Negative impact', color='red')
    plt.bar(x, neutral_values, width, 
           bottom=[pos_values[i] + neg_values[i] for i in range(len(pos_values))], 
           label='Neutral', color='gray')
    
    # Add value labels
    for i, v in enumerate(pos_values):
        plt.text(i, v/2, f"{v:.1f}%", ha='center', color='white', fontweight='bold')
    
    for i, v in enumerate(neg_values):
        plt.text(i, pos_values[i] + v/2, f"{v:.1f}%", ha='center', color='white', fontweight='bold')
    
    plt.title('Direction of Price Impact')
    plt.ylabel('Percentage of Trades')
    plt.xlabel('Trader Type')
    plt.xticks(x, labels)
    plt.legend()
    
    plt.tight_layout()
    plt.savefig('whale_impact_analysis.png', dpi=300)
    plt.close()
    
    print("Whale impact analysis visualization saved to whale_impact_analysis.png")
    
    return {
        'whale_impact': {
            'avg_change': whale_avg_change,
            'median_change': whale_median_change,
            'std_change': whale_std_change,
            'positive_pct': whale_pos_pct,
            'negative_pct': whale_neg_pct
        },
        'non_whale_impact': {
            'avg_change': non_whale_avg_change,
            'median_change': non_whale_median_change,
            'std_change': non_whale_std_change,
            'positive_pct': non_whale_pos_pct,
            'negative_pct': non_whale_neg_pct
        },
        'following_ratio': following_ratio
    }

# 5. Results Summary

In [185]:
def generate_summary_report(all_results, results_dir):
    """
    Generate a summary report of all analysis results
    
    Parameters:
    -----------
    all_results : dict
        Dictionary with analysis results
    results_dir : str
        Directory to save the report
    """
    print("\n" + "="*80)
    print("GENERATING SUMMARY REPORT")
    print("="*80)
    
    # Create report dictionary
    report = {
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'summary': {}
    }
    
    # Add trader distribution summary
    if 'trader_distribution' in all_results:
        dist = all_results['trader_distribution']
        report['summary']['trader_distribution'] = {
            'total_traders': dist.get('total_traders'),
            'total_volume': dist.get('total_volume'),
            'avg_trades_per_trader': dist.get('avg_trades_per_trader'),
            'median_trades_per_trader': dist.get('median_trades_per_trader')
        }
    
    # Add whale identification summary
    if 'whale_identification' in all_results:
        whale = all_results['whale_identification']
        report['summary']['whale_identification'] = {
            'gini': whale.get('gini'),
            'whale_threshold': whale.get('selected_threshold'),
            'num_whales': whale.get('selected_num_whales')
        }
    
    # Add trading inequality summary
    if 'trading_inequality' in all_results:
        ineq = all_results['trading_inequality']
        report['summary']['trading_inequality'] = {
            'gini': ineq.get('gini'),
            'top_1pct_volume_share': ineq.get('percentile_data', {}).get(99, {}).get('volume_share'),
            'power_law_exponent': ineq.get('power_law', {}).get('exponent')
        }
    
    # Add trader classification summary
    if 'trader_classification' in all_results and all_results['trader_classification']:
        class_results = all_results['trader_classification']
        if 'cluster_profiles' in class_results:
            profiles = class_results.get('cluster_profiles')
            report['summary']['trader_classification'] = {
                'num_clusters': len(profiles),
                'cluster_types': class_results.get('cluster_names'),
                'feature_importance': class_results.get('feature_importance')
            }
    
    # Add market dynamics summary
    if 'market_dynamics' in all_results:
        dynamics = all_results['market_dynamics']
        if 'weighted_whale_impact' in dynamics:
            report['summary']['market_dynamics'] = {
                'weighted_whale_impact': dynamics.get('weighted_whale_impact'),
                'weighted_non_whale_impact': dynamics.get('weighted_non_whale_impact'),
                'impact_ratio': dynamics.get('impact_ratio'),
                'direction_metrics': dynamics.get('direction_metrics')
            }
        elif 'overall_metrics' in dynamics:
            report['summary']['market_dynamics'] = dynamics.get('overall_metrics')
    
    # Save report as JSON
    with open(os.path.join(results_dir, 'analysis_summary.json'), 'w') as f:
        json.dump(report, f, indent=2, default=str)
    
    # Generate text report for quick reference
    with open(os.path.join(results_dir, 'analysis_summary.txt'), 'w') as f:
        f.write("="*80 + "\n")
        f.write("TRADER ANALYSIS SUMMARY REPORT\n")
        f.write("="*80 + "\n")
        f.write(f"Generated: {report['timestamp']}\n\n")
        
        # Add trader distribution
        if 'trader_distribution' in report['summary']:
            dist = report['summary']['trader_distribution']
            f.write("-"*30 + "\n")
            f.write("TRADER DISTRIBUTION\n")
            f.write("-"*30 + "\n")
            f.write(f"Total Traders: {dist.get('total_traders'):,}\n")
            f.write(f"Total Volume: {dist.get('total_volume'):,.2f}\n")
            f.write(f"Avg Trades per Trader: {dist.get('avg_trades_per_trader'):.2f}\n")
            f.write(f"Median Trades per Trader: {dist.get('median_trades_per_trader'):.0f}\n\n")
        
        # Add whale identification
        if 'whale_identification' in report['summary']:
            whale = report['summary']['whale_identification']
            f.write("-"*30 + "\n")
            f.write("WHALE IDENTIFICATION\n")
            f.write("-"*30 + "\n")
            f.write(f"Gini Coefficient: {whale.get('gini'):.4f}\n")
            f.write(f"Whale Threshold: Top {whale.get('whale_threshold')*100:.1f}%\n")
            f.write(f"Number of Whales: {whale.get('num_whales'):,}\n\n")
        
        # Add trading inequality
        if 'trading_inequality' in report['summary']:
            ineq = report['summary']['trading_inequality']
            f.write("-"*30 + "\n")
            f.write("TRADING INEQUALITY\n")
            f.write("-"*30 + "\n")
            f.write(f"Gini Coefficient: {ineq.get('gini'):.4f}\n")
            f.write(f"Top 1% Volume Share: {ineq.get('top_1pct_volume_share'):.2f}%\n")
            if ineq.get('power_law_exponent'):
                f.write(f"Power Law Exponent: {ineq.get('power_law_exponent'):.4f}\n\n")
        
        # Add trader classification
        if 'trader_classification' in report['summary']:
            class_results = report['summary']['trader_classification']
            f.write("-"*30 + "\n")
            f.write("TRADER CLASSIFICATION\n")
            f.write("-"*30 + "\n")
            f.write(f"Number of Trader Types: {class_results.get('num_clusters')}\n")
            f.write("Trader Types Identified:\n")
            for cluster_id, name in class_results.get('cluster_types', {}).items():
                f.write(f"  - {name}\n")
            f.write("\nMost Important Features:\n")
            sorted_features = sorted(class_results.get('feature_importance', {}).items(), 
                                     key=lambda x: x[1], reverse=True)
            for feature, importance in sorted_features[:3]:
                f.write(f"  - {feature}: {importance:.4f}\n")
            f.write("\n")
        
        # Add market dynamics
        if 'market_dynamics' in report['summary']:
            dynamics = report['summary']['market_dynamics']
            f.write("-"*30 + "\n")
            f.write("MARKET DYNAMICS\n")
            f.write("-"*30 + "\n")
            if 'weighted_whale_impact' in dynamics:
                f.write(f"Whale Price Impact: {dynamics.get('weighted_whale_impact'):.6f}\n")
                f.write(f"Non-Whale Price Impact: {dynamics.get('weighted_non_whale_impact'):.6f}\n")
                if dynamics.get('impact_ratio'):
                    f.write(f"Impact Ratio: {dynamics.get('impact_ratio'):.4f}\n")
                if 'direction_metrics' in dynamics:
                    dir_metrics = dynamics.get('direction_metrics', {})
                    f.write("\nPrice Direction:\n")
                    f.write(f"  Whale Positive: {dir_metrics.get('whale_positive_pct', 0):.2f}%\n")
                    f.write(f"  Whale Negative: {dir_metrics.get('whale_negative_pct', 0):.2f}%\n")
                    f.write(f"  Non-Whale Positive: {dir_metrics.get('non_whale_positive_pct', 0):.2f}%\n")
                    f.write(f"  Non-Whale Negative: {dir_metrics.get('non_whale_negative_pct', 0):.2f}%\n")
            elif 'whale_avg_change' in dynamics:
                f.write(f"Whale Avg Change: {dynamics.get('whale_avg_change'):.6f}\n")
                f.write(f"Non-Whale Avg Change: {dynamics.get('non_whale_avg_change'):.6f}\n")
            
    print(f"Analysis summary saved to {os.path.join(results_dir, 'analysis_summary.txt')}")
    print(f"Full JSON results saved to {os.path.join(results_dir, 'analysis_summary.json')}")

In [186]:
os.makedirs(ANALYSIS_CONFIG['results_dir'], exist_ok=True)

if results:
    print("\n" + "="*80)
    print("ANALYSIS COMPLETE")
    print("="*80)
    print(f"Results saved to {ANALYSIS_CONFIG['results_dir']}")
    
    # Display key metrics if available
    if 'trader_distribution' in results:
        print(f"\nTotal traders analyzed: {results['trader_distribution'].get('total_traders', 'N/A'):,}")
    
    if 'whale_identification' in results:
        print(f"Gini coefficient: {results['whale_identification'].get('gini', 'N/A'):.4f}")
        print(f"Whales identified: {results['whale_identification'].get('selected_num_whales', 'N/A'):,}")
    
    if 'trader_classification' in results and results['trader_classification']:
        print("\nTrader types identified:")
        for cluster_id, name in results['trader_classification'].get('cluster_names', {}).items():
            print(f"  - {name}")
else:
    print("Analysis failed or no results available")

NameError: name 'results' is not defined