# Blockhouse Work Trial Task - Order Book Analysis

This notebook analyzes the temporary price impact of limit orders using real market data.

## Task Overview
- Analyze order book data for CRWV, FROG, and SOUN
- Calculate temporary price impact function g_s(X)
- Answer questions about mathematical modeling
- Provide insights for optimal trading strategies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import glob
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set up plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline

## 1. Data Loading and Exploration

In [None]:
# Load a sample of data to understand structure
def load_sample_data(symbol, num_files=2):
    """Load a small, reproducible sample of one symbol's CSV files.

    Looks for files named ``{symbol}_*.csv`` inside a folder named ``symbol``
    (relative to the current working directory), reads the first
    ``num_files`` of them in sorted filename order, and tags every row with a
    ``date`` column taken from the second underscore-separated piece of the
    file stem.

    Args:
        symbol: Ticker; used both as the folder name and the filename prefix.
        num_files: Maximum number of files to load.

    Returns:
        One DataFrame with all successfully loaded files concatenated (index
        reset), or ``None`` when no file could be loaded.
    """
    symbol_folder = Path(symbol)
    # glob() yields entries in arbitrary, filesystem-dependent order; sorting
    # makes the sample identical on every run and machine.
    csv_files = sorted(symbol_folder.glob(f"{symbol}_*.csv"))[:num_files]

    dataframes = []
    for file in csv_files:
        try:
            df = pd.read_csv(file)
        except (OSError, ValueError) as e:
            # pandas' parser, empty-data and decoding errors all derive from
            # ValueError; a bad file is reported and skipped (best effort).
            print(f"Error loading {file}: {e}")
            continue
        df['date'] = file.stem.split('_')[1]
        dataframes.append(df)
        print(f"Loaded {file.name}: {len(df)} records")

    if not dataframes:
        return None
    return pd.concat(dataframes, ignore_index=True)

# Pull a small sample for every ticker and report what came back
symbols = ['CRWV', 'FROG', 'SOUN']
sample_data = {}

for symbol in symbols:
    print(f"\nLoading {symbol} data...")
    sample_data[symbol] = load_sample_data(symbol)
    if sample_data[symbol] is None:
        # Nothing could be loaded for this ticker; keep the None placeholder
        continue
    print(f"Total records for {symbol}: {len(sample_data[symbol])}")
    # Only the first 10 column names are printed to keep the output short
    print(f"Columns: {list(sample_data[symbol].columns[:10])}...")

In [None]:
# Inspect one ticker's sample: shape, dtypes, first rows, and level columns
symbol = 'CRWV'
if sample_data.get(symbol) is not None:
    df = sample_data[symbol]
    print(f"Data shape: {df.shape}")
    print(f"\nColumn types:")
    print(df.dtypes)

    print(f"\nFirst few rows:")
    display(df.head())

    print(f"\nSample order book levels:")
    # Keep every column whose name contains 'px_0' or 'sz_0'
    level_cols = [
        col for col in df.columns
        if any(tag in col for tag in ('px_0', 'sz_0'))
    ]
    display(df[level_cols].head())

## 2. Order Book Analysis Functions

In [None]:
class OrderBookAnalyzer:
    """Derive top-of-book and depth features from order book snapshots.

    Every public method adds columns to the DataFrame it receives (in place)
    and returns that same DataFrame, so calls can be chained or reassigned.
    Columns are looked up by the names ``bid_px_NN`` / ``ask_px_NN`` /
    ``bid_sz_NN`` / ``ask_sz_NN`` with a zero-padded level index; level ``00``
    is used as the best bid/ask.
    """

    def __init__(self):
        self.symbols = ['CRWV', 'FROG', 'SOUN']

    def calculate_mid_price(self, df):
        """Add ``mid_price``: the average of the best bid and best ask."""
        df['mid_price'] = (df['bid_px_00'] + df['ask_px_00']) / 2
        return df

    def calculate_spread(self, df):
        """Add ``spread`` (price units) and ``spread_bps`` (basis points of mid).

        Uses the existing ``mid_price`` column when there is one; otherwise
        the mid is derived from the best quotes, so this method no longer
        depends on ``calculate_mid_price`` having been called first.
        """
        df['spread'] = df['ask_px_00'] - df['bid_px_00']
        if 'mid_price' in df.columns:
            mid = df['mid_price']
        else:
            mid = (df['bid_px_00'] + df['ask_px_00']) / 2
        df['spread_bps'] = (df['spread'] / mid) * 10000  # in basis points
        return df

    def get_order_book_depth(self, df, levels=10):
        """Add ``total_bid_depth`` / ``total_ask_depth`` over the first ``levels`` levels.

        Missing sizes count as zero. Each side is summed over the size
        columns that exist for that side, so a level column missing on one
        side does not remove that level from the other side's total.
        """
        df['total_bid_depth'] = self._side_depth(df, 'bid', levels)
        df['total_ask_depth'] = self._side_depth(df, 'ask', levels)
        return df

    @staticmethod
    def _side_depth(df, side, levels):
        """Return the row-wise sum of the ``{side}_sz_NN`` columns present in ``df``."""
        wanted = (f'{side}_sz_{i:02d}' for i in range(levels))
        present = [col for col in wanted if col in df.columns]
        if not present:
            # No size columns at all: a scalar 0 broadcasts to the whole column
            return 0
        return df[present].fillna(0).sum(axis=1)

# Build the analyzer used to enrich every loaded sample
analyzer = OrderBookAnalyzer()

# Enrich each sample with mid price, spread and depth columns
processed_data = {}
for symbol, df in sample_data.items():
    if df is None:
        # This ticker had no loadable files
        continue

    print(f"Processing {symbol}...")
    # Mid price first, then spread, then depth
    df = analyzer.get_order_book_depth(
        analyzer.calculate_spread(
            analyzer.calculate_mid_price(df)
        )
    )
    processed_data[symbol] = df

    # Headline averages for a quick sanity check
    print(f"  Average mid price: ${df['mid_price'].mean():.4f}")
    print(f"  Average spread: {df['spread_bps'].mean():.2f} bps")
    print(f"  Average bid depth: {df['total_bid_depth'].mean():.0f} shares")
    print(f"  Average ask depth: {df['total_ask_depth'].mean():.0f} shares")

## 3. Temporary Price Impact Function g_s(X)

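This is the core of the analysis: estimating the cost, relative to the mid price, of executing X shares immediately by sweeping the visible order book (i.e., as a marketable order that crosses the spread).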

In [None]:
def _sweep_levels(book_levels, order_size):
    """Fill up to ``order_size`` shares from ``(price, size)`` pairs, best level first.

    Returns ``(filled_shares, notional)``. ``filled_shares`` is smaller than
    ``order_size`` when the supplied levels do not hold enough size.
    """
    filled = 0
    notional = 0
    for price, size in book_levels:
        take = min(size, order_size - filled)
        notional += price * take
        filled += take
        if filled >= order_size:
            break
    return filled, notional


def calculate_temporary_impact_function(df, side='buy', max_shares=500,
                                        step=10, levels=10):
    """Estimate the temporary price impact function g_s(X) from book snapshots.

    For every order size X in ``step, 2*step, ..., max_shares`` and every
    snapshot (row) of ``df``, the order is filled against the visible book,
    best level first, and the relative distance between the size-weighted
    average fill price and the mid price is recorded. The value reported for
    X is the mean of that quantity over all usable snapshots.

    Args:
        df: Snapshots with columns ``{ask|bid}_px_NN`` and ``{ask|bid}_sz_NN``
            for ``NN`` in ``00 .. levels-1``. ``mid_price`` is used when
            present; otherwise the mid is derived from ``bid_px_00`` and
            ``ask_px_00``.
        side: ``'buy'`` sweeps the ask side; any other value is treated as a
            sell and sweeps the bid side.
        max_shares: Largest order size evaluated (inclusive).
        step: Spacing between evaluated order sizes; also the smallest size.
        levels: Number of book levels to sweep.

    Returns:
        DataFrame with columns ``order_size``, ``avg_impact`` (fraction of
        mid) and ``impact_bps`` (``avg_impact * 10000``). The columns are
        present even when there are no rows.

    Raises:
        ValueError: If ``step`` is smaller than 1.

    Notes:
        * Levels with a missing price, a missing size, or zero size are
          skipped.
        * If the swept levels hold fewer than X shares, the impact is
          computed over the shares actually filled.
        * Snapshots whose mid price is missing, non-finite or not positive
          are skipped, so one bad row cannot turn the average into NaN/inf.
    """
    if step < 1:
        raise ValueError("step must be a positive integer")

    columns = ['order_size', 'avg_impact', 'impact_bps']
    if len(df) == 0:
        return pd.DataFrame(columns=columns)

    is_buy = side == 'buy'
    book_side = 'ask' if is_buy else 'bid'
    # Buy: (avg - mid) / mid.  Sell: (mid - avg) / mid, i.e. the negation.
    direction = 1.0 if is_buy else -1.0

    # Extract the book once as plain arrays. Walking the DataFrame with
    # iterrows() for every order size repeats the same row conversion.
    level_tags = [f"{level:02d}" for level in range(levels)]
    prices = df[[f'{book_side}_px_{tag}' for tag in level_tags]].to_numpy(dtype=float)
    sizes = df[[f'{book_side}_sz_{tag}' for tag in level_tags]].to_numpy(dtype=float)
    usable = ~(np.isnan(prices) | np.isnan(sizes) | (sizes == 0))

    if 'mid_price' in df.columns:
        mids = df['mid_price'].to_numpy(dtype=float)
    else:
        mids = ((df['bid_px_00'] + df['ask_px_00']) / 2).to_numpy(dtype=float)
    valid_mid = np.isfinite(mids) & (mids > 0)

    # One (mid, [(price, size), ...]) entry per snapshot that can contribute.
    snapshots = []
    for mid, mid_ok, row_px, row_sz, row_ok in zip(
        mids.tolist(), valid_mid.tolist(),
        prices.tolist(), sizes.tolist(), usable.tolist(),
    ):
        if not mid_ok:
            continue
        book_levels = [(p, s) for p, s, ok in zip(row_px, row_sz, row_ok) if ok]
        if book_levels:
            snapshots.append((mid, book_levels))

    impact_data = []
    for order_size in range(step, max_shares + 1, step):
        impacts = []
        for mid, book_levels in snapshots:
            filled, notional = _sweep_levels(book_levels, order_size)
            if filled > 0:
                avg_execution_price = notional / filled
                impacts.append(direction * (avg_execution_price - mid) / mid)

        if impacts:
            avg_impact = np.mean(impacts)
            impact_data.append({
                'order_size': order_size,
                'avg_impact': avg_impact,
                'impact_bps': avg_impact * 10000
            })

    return pd.DataFrame(impact_data, columns=columns)

# Build buy-side and sell-side impact curves for every processed symbol
impact_results = {}

for symbol, df in processed_data.items():
    print(f"\nCalculating impact functions for {symbol}...")

    buy_impact = calculate_temporary_impact_function(df, side='buy')
    sell_impact = calculate_temporary_impact_function(df, side='sell')
    impact_results[symbol] = dict(buy_impact=buy_impact, sell_impact=sell_impact)

    # Report the span of each curve in basis points, buy side first
    for side_name, curve in (('Buy', buy_impact), ('Sell', sell_impact)):
        bps = curve['impact_bps']
        print(f"  {side_name} impact range: {bps.min():.2f} to {bps.max():.2f} bps")

## 4. Visualization of Impact Functions

In [None]:
# Grid of panels: one column per symbol, buy curve on top, sell curve below
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

for i, symbol in enumerate(symbols):
    if symbol not in impact_results:
        continue  # leave this column's panels empty

    buy_impact = impact_results[symbol]['buy_impact']
    sell_impact = impact_results[symbol]['sell_impact']

    # (row, curve, line format, marker, side label) for the two panels
    panels = (
        (0, buy_impact, 'b-', 'o', 'Buy'),
        (1, sell_impact, 'r-', 's', 'Sell'),
    )
    for row, curve, line_fmt, marker, side_name in panels:
        ax = axes[row, i]
        ax.plot(curve['order_size'], curve['impact_bps'],
                line_fmt, linewidth=2, marker=marker, markersize=4,
                label=f'{symbol} {side_name} Impact')
        ax.set_title(f'{symbol} - {side_name} Side Temporary Price Impact')
        ax.set_xlabel('Order Size (Shares)')
        ax.set_ylabel('Impact (bps)')
        ax.grid(True, alpha=0.3)
        ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Overlay all symbols on shared axes: buy side on the left, sell side on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

colors = ['blue', 'green', 'red']

# (axis, key into impact_results[symbol], marker, title) for each panel
comparison_panels = (
    (ax1, 'buy_impact', 'o', 'Buy Side Impact Comparison'),
    (ax2, 'sell_impact', 's', 'Sell Side Impact Comparison'),
)

for ax, curve_key, marker, title in comparison_panels:
    for i, symbol in enumerate(symbols):
        if symbol not in impact_results:
            continue
        curve = impact_results[symbol][curve_key]
        ax.plot(curve['order_size'], curve['impact_bps'],
                color=colors[i], linewidth=2, marker=marker, label=f'{symbol}')

    ax.set_title(title)
    ax.set_xlabel('Order Size (Shares)')
    ax.set_ylabel('Impact (bps)')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Analysis and Insights

In [None]:
# Per-symbol summary: book averages plus impact at two reference order sizes
print("=== ORDER BOOK ANALYSIS SUMMARY ===")
print()

for symbol in symbols:
    if not (symbol in processed_data and symbol in impact_results):
        continue

    df = processed_data[symbol]
    buy_impact = impact_results[symbol]['buy_impact']
    sell_impact = impact_results[symbol]['sell_impact']

    print(f"\n{symbol} Statistics:")
    print(f"  Average mid price: ${df['mid_price'].mean():.4f}")
    print(f"  Average spread: {df['spread_bps'].mean():.2f} bps")
    total_depth = df['total_bid_depth'].mean() + df['total_ask_depth'].mean()
    print(f"  Average total depth: {total_depth:.0f} shares")

    # Impact at 100 and 500 shares, buy side before sell side for each size
    for size in (100, 500):
        for side_name, curve in (('Buy', buy_impact), ('Sell', sell_impact)):
            bps = curve.loc[curve['order_size'] == size, 'impact_bps'].iloc[0]
            print(f"  {side_name} impact ({size} shares): {bps:.2f} bps")

## 6. Answering Task Questions

### Question 1: How do you choose to model the temporary impact g_s(x)?

I model g_s(x) as the weighted average execution price impact when consuming X shares from the order book:

**For buy orders:**
- g_buy(X) = (Weighted_Avg_Ask_Price - Mid_Price) / Mid_Price

**For sell orders:**
- g_sell(X) = (Mid_Price - Weighted_Avg_Bid_Price) / Mid_Price

This approach captures both the cost of crossing the spread and the additional cost of walking deeper into the book, by simulating how a marketable order (one priced to execute immediately) would consume resting liquidity across multiple price levels.

### Question 2: Mathematical Framework

**Algorithm for g_s(X, t_i):**

Let O(t) = order book state at time t  
Let S_side(level) = size at level on side (bid/ask)  
Let P_side(level) = price at level on side (bid/ask)  

For a buy order of size X at time t_i:

1. Initialize: remaining = X, total_cost = 0, total_shares = 0
2. For level = 0 to 9:
   - if P_ask(level) or S_ask(level) is missing, or S_ask(level) = 0: skip this level
   - available = S_ask(level)
   - take = min(remaining, available)
   - total_cost += take × P_ask(level)
   - total_shares += take
   - remaining -= take
   - if remaining = 0: break

3. g_buy(X, t_i) = (total_cost/total_shares - mid_price(t_i)) / mid_price(t_i)

The algorithm ensures mathematical rigor by:
- Using actual order book depth and prices
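- Maintaining conservation of shares (∑shares_taken ≤ X); when the ten visible levels hold fewer than X shares, the impact is computed over the shares actually filled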
- Providing realistic execution simulation

In [None]:
# Pool the impact curves (in bps) of all symbols for aggregate statistics
all_buy_impacts = []
all_sell_impacts = []

for symbol in symbols:
    if symbol not in impact_results:
        continue
    curves = impact_results[symbol]
    all_buy_impacts += curves['buy_impact']['impact_bps'].tolist()
    all_sell_impacts += curves['sell_impact']['impact_bps'].tolist()

if all_buy_impacts and all_sell_impacts:
    mean_buy = np.mean(all_buy_impacts)
    mean_sell = np.mean(all_sell_impacts)

    print("\n=== CROSS-SYMBOL STATISTICS ===")
    print(f"Average buy impact across all symbols: {mean_buy:.2f} bps")
    print(f"Average sell impact across all symbols: {mean_sell:.2f} bps")
    print(f"Impact asymmetry: {abs(mean_buy - mean_sell):.2f} bps")
    print(f"Standard deviation of buy impacts: {np.std(all_buy_impacts):.2f} bps")
    print(f"Standard deviation of sell impacts: {np.std(all_sell_impacts):.2f} bps")

# Write each symbol's curves to CSV files in the working directory
for symbol in symbols:
    if symbol not in impact_results:
        continue
    for side_key in ('buy_impact', 'sell_impact'):
        impact_results[symbol][side_key].to_csv(f'{symbol}_{side_key}.csv', index=False)
    print(f"Saved impact analysis for {symbol}")