# Analyzing Orderbooks Before Large Returns - Squid Ink Round 2

This notebook analyzes the state of the orderbook right before large returns (large one-step moves in the mid price) for Squid Ink in Round 2. The goal is to identify potential predictive patterns in the orderbook that might signal upcoming large price movements.

In [None]:
# Import necessary libraries
import glob
import os
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Apply the optional seaborn theme FIRST. sns.set() rewrites matplotlib's
# global rcParams, so explicit overrides have to be applied afterwards or
# they risk being replaced by the theme's own values.
try:
    import seaborn as sns
    sns.set(style="whitegrid")
    print("Using Seaborn for plot styling")
except ImportError:
    # Seaborn is optional; fall back to plain matplotlib styling.
    print("Seaborn not available, using matplotlib default styling")

# Configure plots to be larger and more readable (set last so these values win)
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 12

## Section 1: Load Price Data and Calculate Returns

First, we need to load the price data for Squid Ink and calculate returns to identify periods with large price movements.

In [None]:
def _day_from_price_file(path):
    """Extract the integer day from a ``prices_round_<r>_day_<d>.csv`` filename.

    Returns ``float('inf')`` when the name does not end in ``_day_<int>.csv`` so
    that such files sort after every recognised one.
    """
    match = re.search(r'_day_(-?\d+)\.csv$', os.path.basename(path))
    return int(match.group(1)) if match else float('inf')


def load_price_data(round_num, product='SQUID_INK'):
    """
    Load price data for a specific round and product.

    Every ``prices_round_<round_num>_day_*.csv`` file found under the round's
    folder is read (``;``-separated) and concatenated in ascending day order,
    then filtered down to the rows whose ``product`` column equals ``product``.

    Parameters:
        round_num (int): Round number
        product (str): Product name (default: 'SQUID_INK')

    Returns:
        pd.DataFrame: DataFrame containing price data for ``product``. An empty
        DataFrame is returned (after printing a message) when the data
        directory or the round's CSV files cannot be found.

    Raises:
        KeyError: if the loaded CSVs have no ``product`` column.
    """
    # Path to data directory - try multiple possible locations
    # (relative to the current working directory).
    possible_data_paths = [
        '../../../Prosperity 3 Data',
        '../../../../Prosperity 3 Data',
        '../../../../../Prosperity 3 Data',
        'Prosperity 3 Data'
    ]

    # Use the first candidate that exists
    data_path = next((path for path in possible_data_paths if os.path.exists(path)), None)
    if data_path is None:
        print("Could not find data directory")
        return pd.DataFrame()
    print(f"Found data directory at {data_path}")

    # List all CSV files for the round. glob returns matches in arbitrary
    # order, so sort them by the day number in the filename (numerically, so
    # that e.g. day -2 precedes day -1 and day 2 precedes day 10); the path is
    # a tie-breaker that keeps the order fully deterministic.
    file_pattern = os.path.join(data_path, f'Round {round_num}/prices_round_{round_num}_day_*.csv')
    files = sorted(glob.glob(file_pattern), key=lambda f: (_day_from_price_file(f), f))

    if not files:
        print(f"No files found matching pattern: {file_pattern}")
        return pd.DataFrame()

    # Load every daily file
    dfs = []
    for file in files:
        print(f"Loading {file}...")
        dfs.append(pd.read_csv(file, sep=';'))

    # Concatenate all dataframes into one frame with a fresh 0..n-1 index
    all_data = pd.concat(dfs, ignore_index=True)

    # Filter for the specified product; copy so callers can add columns freely
    product_data = all_data[all_data['product'] == product].copy()
    print(f"Successfully loaded price data with {len(product_data)} rows")

    return product_data

In [None]:
# Load Squid Ink price data for Round 2
squid_data = load_price_data(2, 'SQUID_INK')

# load_price_data reports a missing data directory or missing files by
# returning an empty DataFrame. Stop here with a clear message instead of
# letting a later cell fail on a missing column or a division by zero.
if squid_data.empty:
    raise RuntimeError(
        "No SQUID_INK price data was loaded for Round 2 - check the messages "
        "printed above and the location of the data directory."
    )

# Display the first few rows
squid_data.head()

In [None]:
# List the column labels of the loaded frame; the bare last expression is
# what the cell displays.
print("Columns in the price data:")
available_columns = squid_data.columns
available_columns

In [None]:
# Calculate mid price as the midpoint of the best ask and best bid
squid_data['mid_price'] = (squid_data['ask_price_1'] + squid_data['bid_price_1']) / 2

# Sort chronologically to ensure proper return calculation. The loader
# concatenates one file per day, so sorting on 'timestamp' alone can interleave
# rows from different days and make pct_change compare unrelated observations.
# NOTE(review): assumes the CSVs carry a 'day' column and that timestamps
# restart within each day - confirm against the data. Without a 'day' column
# this falls back to the timestamp-only ordering.
sort_keys = ['day', 'timestamp'] if 'day' in squid_data.columns else ['timestamp']
squid_data = squid_data.sort_values(sort_keys)

# Calculate returns: relative change of the mid price from the previous row
squid_data['returns'] = squid_data['mid_price'].pct_change()

# Calculate absolute returns (magnitude only, direction discarded)
squid_data['abs_returns'] = squid_data['returns'].abs()

# Display summary statistics of returns
print("Summary statistics of returns:")
squid_data['returns'].describe()

In [None]:
# Histogram of the row-to-row returns; NaN entries are dropped before binning.
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(squid_data['returns'].dropna(), bins=100, alpha=0.7)
ax.set_title('Distribution of Squid Ink Returns - Round 2')
ax.set_xlabel('Returns')
ax.set_ylabel('Frequency')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# A "large" return is one whose magnitude reaches the 99th percentile of all
# absolute returns (i.e. roughly the top 1%).
large_return_threshold = squid_data['abs_returns'].quantile(0.99)
print(f"Large return threshold (99th percentile): {large_return_threshold:.6f}")

# Boolean mask of rows at or above the threshold (NaN returns compare False),
# then the matching index labels and their timestamps.
is_large_return = squid_data['abs_returns'] >= large_return_threshold
large_return_indices = squid_data.index[is_large_return.to_numpy()]
large_return_timestamps = squid_data.loc[large_return_indices, 'timestamp']

# Report how many events were flagged and their share of all rows
n_large_events = len(large_return_timestamps)
print(f"Number of large return events: {n_large_events}")
print(f"Percentage of all observations: {n_large_events / len(squid_data) * 100:.2f}%")

In [None]:
# Plot mid price and highlight large return events
fig, ax = plt.subplots(figsize=(14, 7))

# The frame is built from several per-day files. Drawing it as a single line
# against 'timestamp' would join points from different days with spurious
# segments, so draw one line per day when a 'day' column is available.
# NOTE(review): assumes 'day' identifies the source day of each row and that
# timestamps restart within each day - confirm against the data.
if 'day' in squid_data.columns and squid_data['day'].nunique() > 1:
    for day, day_rows in squid_data.groupby('day'):
        day_rows = day_rows.sort_values('timestamp')
        ax.plot(day_rows['timestamp'], day_rows['mid_price'], alpha=0.7, label=f'Day {day}')
    ax.legend(title='Day')
else:
    ax.plot(squid_data['timestamp'], squid_data['mid_price'], alpha=0.7)

# Mark every large-return row at its own timestamp / mid price; zorder lifts
# the markers above the price lines so they are not drawn underneath them.
ax.scatter(large_return_timestamps,
           squid_data.loc[large_return_indices, 'mid_price'],
           color='red', alpha=0.7, s=30, zorder=3)
ax.set_title('Squid Ink Mid Price with Large Return Events Highlighted - Round 2')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Mid Price')
ax.grid(True)
fig.tight_layout()
plt.show()