# Analyzing Orderbooks Before Large Returns - Squid Ink Round 2 (Part 3)

This notebook continues the analysis of orderbooks before large returns for Squid Ink in Round 2.

## Part 3: Analyze Relationship Between Orderbook Features and Returns

In this part, we'll analyze the relationship between orderbook features and subsequent returns.

In [None]:
# Import necessary libraries
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# Plot defaults: wider figures and a larger base font for readability
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 12,
})

# Seaborn is optional: apply its whitegrid theme only when the import succeeds,
# otherwise fall back to plain matplotlib styling.
try:
    import seaborn as sns
except ImportError:
    print("Seaborn not available, using matplotlib default styling")
else:
    sns.set(style="whitegrid")
    print("Using Seaborn for plot styling")

### 3.1 Load Data from Part 2

In [None]:
# Directory that holds the pickled outputs of the earlier parts
# (relative to the notebook's working directory)
data_dir = '../data'

# Load processed data from Part 2.
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load pickles that were produced by your own run of Part 2.
try:
    squid_data_with_features = pd.read_pickle(os.path.join(data_dir, 'squid_data_with_features.pkl'))
    pre_event_df = pd.read_pickle(os.path.join(data_dir, 'pre_event_orderbook_states.pkl'))
except FileNotFoundError:
    print("Error: Could not find data files from Part 2.")
    print("Please run Part 2 first to generate the necessary data files.")
    # Stop here instead of continuing: every later cell needs pre_event_df, so
    # swallowing the error would either fail later with a NameError or, worse,
    # silently reuse a stale pre_event_df left over from a previous kernel run.
    raise
else:
    print(f"Successfully loaded data from Part 2")
    print(f"Number of rows in squid_data_with_features: {len(squid_data_with_features)}")
    print(f"Number of pre-event orderbook states: {len(pre_event_df)}")

### 3.2 Separate Positive and Negative Return Events

In [None]:
# Partition the pre-event orderbook states by the 'return_direction' label
direction_label = pre_event_df['return_direction']
is_positive = direction_label.eq('positive')
is_negative = direction_label.eq('negative')

positive_returns = pre_event_df.loc[is_positive]
negative_returns = pre_event_df.loc[is_negative]

# Report how many events landed in each group
print(f"Number of large positive return events: {len(positive_returns)}")
print(f"Number of large negative return events: {len(negative_returns)}")

### 3.3 Compare Orderbook Features Between Positive and Negative Return Events

In [None]:
# Compare orderbook features between positive and negative return events.
# numeric_only=True is required: the frames carry the string column
# 'return_direction', and on pandas >= 2.0 mean()/median() raise TypeError
# when a non-numeric column is present instead of silently skipping it.
feature_comparison = pd.DataFrame({
    'positive_mean': positive_returns.mean(numeric_only=True),
    'negative_mean': negative_returns.mean(numeric_only=True),
    'positive_median': positive_returns.median(numeric_only=True),
    'negative_median': negative_returns.median(numeric_only=True)
})

# Calculate the difference between positive and negative events
feature_comparison['mean_diff'] = feature_comparison['positive_mean'] - feature_comparison['negative_mean']
feature_comparison['median_diff'] = feature_comparison['positive_median'] - feature_comparison['negative_median']

# Calculate the percentage difference, using the negative-return group as the baseline.
# NOTE(review): a baseline of exactly 0 yields inf/NaN, and a negative baseline
# flips the sign of the percentage; read these columns together with the raw diffs.
feature_comparison['mean_diff_pct'] = feature_comparison['mean_diff'] / feature_comparison['negative_mean'] * 100
feature_comparison['median_diff_pct'] = feature_comparison['median_diff'] / feature_comparison['negative_median'] * 100

# Orderbook features examined throughout the rest of this notebook
relevant_features = [
    'spread', 'relative_spread', 'volume_imbalance',
    'bid_volume_total', 'ask_volume_total', 'book_depth',
    'bid_price_impact', 'ask_price_impact',
    'price_range', 'relative_price_range'
]

# Display the percentage differences for the relevant features, largest mean difference first
feature_comparison.loc[relevant_features, ['mean_diff_pct', 'median_diff_pct']].sort_values('mean_diff_pct', ascending=False)

### 3.4 Visualize Distribution of Key Features for Positive vs Negative Returns

In [None]:
# Overlay histograms of each key feature for the two return-direction groups
key_features = [
    'volume_imbalance', 'relative_spread', 'book_depth', 'bid_price_impact', 'ask_price_impact'
]

for feature_name in key_features:
    fig, ax = plt.subplots(figsize=(12, 6))

    # One semi-transparent histogram per group; missing values are dropped first
    positive_values = positive_returns[feature_name].dropna()
    negative_values = negative_returns[feature_name].dropna()
    ax.hist(positive_values, bins=20, alpha=0.5, label='Positive Returns')
    ax.hist(negative_values, bins=20, alpha=0.5, label='Negative Returns')

    ax.set_title(f'Distribution of {feature_name} Before Large Return Events')
    ax.set_xlabel(feature_name)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()

### 3.5 Calculate Correlation Between Orderbook Features and Return Values

In [None]:
# Correlation of every relevant feature with the return value
# (take the 'return_value' column of the correlation matrix, minus its self-correlation)
correlation_columns = relevant_features + ['return_value']
correlation_matrix = pre_event_df[correlation_columns].corr()
correlation = correlation_matrix['return_value'].drop('return_value')

# Rank features by the magnitude of their correlation, strongest first
correlation_sorted = correlation.abs().sort_values(ascending=False)

# Print the signed correlations in order of decreasing magnitude
print("Correlation between orderbook features and subsequent returns:")
for feature in correlation_sorted.index:
    print(f"{feature}: {correlation[feature]:.4f}")

# Horizontal bar chart of the signed correlations, sorted ascending
fig, ax = plt.subplots(figsize=(12, 8))
correlation.sort_values().plot(kind='barh', ax=ax)
ax.set_title('Correlation Between Orderbook Features and Subsequent Returns')
ax.set_xlabel('Correlation')
ax.grid(True)
fig.tight_layout()
plt.show()

### 3.6 Create Scatter Plots of Most Correlated Features vs Returns

In [None]:
# The three features with the largest absolute correlation to the return value
top_features = correlation_sorted.head(3).index

# One scatter plot per top feature: feature value (x) against return value (y)
for feature_name in top_features:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(pre_event_df[feature_name], pre_event_df['return_value'], alpha=0.6)
    ax.set_title(f'Relationship Between {feature_name} and Subsequent Returns')
    ax.set_xlabel(feature_name)
    ax.set_ylabel('Return Value')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

### 3.7 Perform Statistical Tests

In [None]:
# Two-sample t-test per feature (equal_var=False, i.e. Welch's variant),
# comparing the positive-return group against the negative-return group
t_test_results = {}

for feature in relevant_features:
    # Missing values are dropped independently in each group before testing
    welch = stats.ttest_ind(
        positive_returns[feature].dropna(),
        negative_returns[feature].dropna(),
        equal_var=False,
    )

    # Keep the statistic, the p-value, and a flag for p < 0.05
    t_test_results[feature] = {
        't_statistic': welch.statistic,
        'p_value': welch.pvalue,
        'significant': welch.pvalue < 0.05
    }

# One row per feature, smallest p-value first
t_test_df = pd.DataFrame(t_test_results).T.sort_values('p_value')

# Display results
print("T-test results for orderbook features (positive vs negative returns):")
t_test_df

### 3.8 Determine Feature Importance Using Machine Learning

In [None]:
# Import machine learning libraries. scikit-learn is optional: has_sklearn records
# whether it is available so this cell and the save step can skip the ML parts.
try:
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, classification_report
    print("Successfully imported scikit-learn libraries")
except ImportError:
    print("Warning: scikit-learn not available. Skipping machine learning analysis.")
    has_sklearn = False
else:
    has_sklearn = True

if has_sklearn:
    # Prepare the data: features are the relevant orderbook columns,
    # the target is 1 when return_value > 0 and 0 otherwise
    X = pre_event_df[relevant_features].copy()
    y = (pre_event_df['return_value'] > 0).astype(int)  # 1 for positive returns, 0 for negative

    # Split the data BEFORE imputing so held-out rows cannot influence the fill values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Handle missing values using statistics from the training partition only.
    # Imputing with means of the full dataset would leak test-set information
    # into training and bias the reported accuracy.
    train_means = X_train.mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)

    # Train a random forest classifier (fixed seed for reproducibility)
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    # Make predictions on the held-out partition
    y_pred = rf.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Get feature importances, most important first
    feature_importances = pd.DataFrame({
        'feature': X.columns,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)

    # Plot feature importances
    plt.figure(figsize=(12, 8))
    plt.barh(feature_importances['feature'], feature_importances['importance'])
    plt.title('Feature Importance for Predicting Return Direction')
    plt.xlabel('Importance')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

### 3.9 Save Data for Next Part

In [None]:
# Persist this part's outputs as pickles in data_dir for the next part

# Event subsets split by return direction
positive_path = os.path.join(data_dir, 'positive_return_events.pkl')
positive_returns.to_pickle(positive_path)
print(f"Saved positive return events to {positive_path}")

negative_path = os.path.join(data_dir, 'negative_return_events.pkl')
negative_returns.to_pickle(negative_path)
print(f"Saved negative return events to {negative_path}")

# Feature importances exist only when the scikit-learn analysis ran
if has_sklearn:
    importances_path = os.path.join(data_dir, 'feature_importances.pkl')
    feature_importances.to_pickle(importances_path)
    print(f"Saved feature importances to {importances_path}")

# T-test results are rebuilt from the raw dict, so rows follow insertion order (not p-value order)
t_test_path = os.path.join(data_dir, 't_test_results.pkl')
t_test_table = pd.DataFrame(t_test_results).T
t_test_table.to_pickle(t_test_path)
print(f"Saved t-test results to {t_test_path}")

## Summary of Part 3

In this third part of the analysis, we have:

1. Loaded the data from Part 2
2. Separated positive and negative return events
3. Compared orderbook features between positive and negative returns
4. Visualized the distribution of key features
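5. Calculated correlations between features and returns, and plotted the most correlated features against returns
6. Performed Welch's t-tests to identify features that differ significantly between positive and negative return events
7. Trained a random forest classifier to estimate feature importance for predicting return direction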
8. Saved the processed data for use in the next part

In Part 4, we will visualize orderbook patterns before large return events.