## Section 2: Extract Orderbook Features

Now that we've identified the periods with large returns, let's compute orderbook features and examine the state of the book at the observation immediately preceding each of these events.

In [None]:
# Define function to calculate orderbook features
def calculate_orderbook_features(data):
    """
    Calculate various orderbook features from the top three book levels.

    The input is not modified; all features are added to a copy.

    Parameters:
        data (pd.DataFrame): DataFrame containing orderbook data. Must provide
            the columns ``mid_price`` and, for ``side`` in {bid, ask} and
            ``n`` in {1, 2, 3}, ``{side}_price_{n}`` and ``{side}_volume_{n}``.
            Levels 2 and 3 may contain NaN where the level is not quoted.

    Returns:
        pd.DataFrame: Copy of ``data`` with these columns added: ``spread``,
            ``relative_spread``, ``bid_volume_total``, ``ask_volume_total``,
            ``volume_imbalance``, ``weighted_bid_price``,
            ``weighted_ask_price``, ``bid_price_impact``, ``ask_price_impact``,
            ``book_depth``, ``price_range``, ``relative_price_range``.

    Notes:
        Level-1 values are used as-is, so a NaN at level 1 propagates into the
        derived features. No guard is applied against zero denominators.
    """
    # Work on a copy so the caller's frame is left untouched
    df = data.copy()

    # Bid-ask spread, absolute and relative to the mid price
    df['spread'] = df['ask_price_1'] - df['bid_price_1']
    df['relative_spread'] = df['spread'] / df['mid_price']

    # Total volume per side, summed across the three levels
    # (an unquoted level 2/3 contributes zero volume)
    df['bid_volume_total'] = df['bid_volume_1'] + df['bid_volume_2'].fillna(0) + df['bid_volume_3'].fillna(0)
    df['ask_volume_total'] = df['ask_volume_1'] + df['ask_volume_2'].fillna(0) + df['ask_volume_3'].fillna(0)

    # Volume imbalance in [-1, 1]: positive means more resting bid volume
    df['volume_imbalance'] = (df['bid_volume_total'] - df['ask_volume_total']) / (df['bid_volume_total'] + df['ask_volume_total'])

    # Volume-weighted average price per side; an unquoted level has its
    # price and volume zeroed so it drops out of the numerator
    df['weighted_bid_price'] = (
        df['bid_price_1'] * df['bid_volume_1'] +
        df['bid_price_2'].fillna(0) * df['bid_volume_2'].fillna(0) +
        df['bid_price_3'].fillna(0) * df['bid_volume_3'].fillna(0)
    ) / df['bid_volume_total']

    df['weighted_ask_price'] = (
        df['ask_price_1'] * df['ask_volume_1'] +
        df['ask_price_2'].fillna(0) * df['ask_volume_2'].fillna(0) +
        df['ask_price_3'].fillna(0) * df['ask_volume_3'].fillna(0)
    ) / df['ask_volume_total']

    # Deepest quoted price on each side. Fall back level 3 -> 2 -> 1 so that a
    # book with only two quoted levels still uses its level-2 price instead of
    # collapsing to the best price.
    deepest_bid = df['bid_price_3'].fillna(df['bid_price_2']).fillna(df['bid_price_1'])
    deepest_ask = df['ask_price_3'].fillna(df['ask_price_2']).fillna(df['ask_price_1'])

    # Price impact - how far the price would move if a large order swept the
    # visible levels (simplified version - assumes linear price impact)
    df['bid_price_impact'] = (df['bid_price_1'] - deepest_bid) / df['bid_price_1']
    df['ask_price_impact'] = (deepest_ask - df['ask_price_1']) / df['ask_price_1']

    # Order book depth (total volume within first 3 levels, both sides)
    df['book_depth'] = df['bid_volume_total'] + df['ask_volume_total']

    # Price range (difference between highest quoted ask and lowest quoted bid)
    df['price_range'] = deepest_ask - deepest_bid
    df['relative_price_range'] = df['price_range'] / df['mid_price']

    return df

In [None]:
# Run the feature computation over every row of squid_data
squid_data_with_features = calculate_orderbook_features(squid_data)

# Columns to preview, grouped by theme:
# identifiers/returns, spread & imbalance, volumes, weighted prices,
# price impact, and price range
feature_columns = (
    ['timestamp', 'mid_price', 'returns', 'abs_returns']
    + ['spread', 'relative_spread', 'volume_imbalance']
    + ['bid_volume_total', 'ask_volume_total', 'book_depth']
    + ['weighted_bid_price', 'weighted_ask_price']
    + ['bid_price_impact', 'ask_price_impact']
    + ['price_range', 'relative_price_range']
)

# Show the head of the frame restricted to the selected columns
squid_data_with_features.loc[:, feature_columns].head()

In [None]:
# Extract orderbook states before large returns
# We'll look at the orderbook 1 step (one row) before the large return event

# Create a dictionary to store pre-event orderbook states
# NOTE(review): keyed by event timestamp, so two events sharing a timestamp
# would overwrite each other — confirm timestamps are unique in squid_data.
pre_event_states = {}

# Resolve labels to row positions so "one step before" means the previous
# row even when the index is not a contiguous 0-based range (e.g. after
# filtering without reset_index). On a default RangeIndex this is identical
# to the label arithmetic `idx - 1`.
feature_index = squid_data_with_features.index

for idx in large_return_indices:
    event_pos = feature_index.get_loc(idx)
    if event_pos > 0:  # Make sure we're not at the first observation
        # Get the timestamp of the large return event
        event_timestamp = squid_data.loc[idx, 'timestamp']

        # Get the return value
        return_value = squid_data.loc[idx, 'returns']

        # Get the orderbook state 1 step before the event (positional lookup)
        pre_event_state = squid_data_with_features.iloc[event_pos - 1]

        # Store in dictionary with event timestamp as key
        pre_event_states[event_timestamp] = {
            'pre_event_state': pre_event_state,
            'return_value': return_value
        }

print(f"Extracted {len(pre_event_states)} pre-event orderbook states")

In [None]:
# Flatten the pre-event states into one row per event.
# Column order: event timestamp, realised return, then each orderbook feature
# read from the snapshot stored under 'pre_event_state'.
pre_event_df = pd.DataFrame({
    'timestamp': list(pre_event_states),
    'return_value': [entry['return_value'] for entry in pre_event_states.values()],
    **{
        feature: [entry['pre_event_state'][feature] for entry in pre_event_states.values()]
        for feature in (
            'spread',
            'relative_spread',
            'volume_imbalance',
            'bid_volume_total',
            'ask_volume_total',
            'book_depth',
            'bid_price_impact',
            'ask_price_impact',
            'price_range',
            'relative_price_range',
        )
    },
})

# Label each event by the sign of its return (anything not > 0 is 'negative')
pre_event_df['return_direction'] = np.where(
    pre_event_df['return_value'].gt(0), 'positive', 'negative'
)

# Report summary statistics for the numeric columns
print("Summary of pre-event orderbook states:")
pre_event_df.describe()