## Section 3: Analyze Relationship Between Orderbook Features and Returns

Now that we have extracted orderbook features before large return events, let's analyze the relationship between these features and the subsequent returns.

In [None]:
# Partition the pre-event snapshots by the direction label of the large
# return that followed each one.
positive_returns = pre_event_df.loc[pre_event_df['return_direction'].eq('positive')]
negative_returns = pre_event_df.loc[pre_event_df['return_direction'].eq('negative')]

# Report how many events fall into each group.
print(f"Number of large positive return events: {positive_returns.shape[0]}")
print(f"Number of large negative return events: {negative_returns.shape[0]}")

In [None]:
# Compare orderbook features between positive and negative return events.
#
# numeric_only=True restricts the aggregations to numeric columns. The event
# frames also carry the string label 'return_direction', and pandas >= 2.0
# raises a TypeError when asked to average a non-numeric column.
feature_comparison = pd.DataFrame({
    'positive_mean': positive_returns.mean(numeric_only=True),
    'negative_mean': negative_returns.mean(numeric_only=True),
    'positive_median': positive_returns.median(numeric_only=True),
    'negative_median': negative_returns.median(numeric_only=True),
})

# Absolute difference between positive and negative events
feature_comparison['mean_diff'] = feature_comparison['positive_mean'] - feature_comparison['negative_mean']
feature_comparison['median_diff'] = feature_comparison['positive_median'] - feature_comparison['negative_median']

# Percentage difference relative to the negative-event baseline.
# The baseline's magnitude is used as the denominator so that the sign of the
# percentage always matches the sign of the difference, even for features whose
# baseline is negative. A zero baseline yields inf/NaN.
feature_comparison['mean_diff_pct'] = (
    feature_comparison['mean_diff'] / feature_comparison['negative_mean'].abs() * 100
)
feature_comparison['median_diff_pct'] = (
    feature_comparison['median_diff'] / feature_comparison['negative_median'].abs() * 100
)

# Features shown in the comparison table (and reused by later cells)
relevant_features = [
    'spread', 'relative_spread', 'volume_imbalance',
    'bid_volume_total', 'ask_volume_total', 'book_depth',
    'bid_price_impact', 'ask_price_impact',
    'price_range', 'relative_price_range'
]

# Display the percentage differences, largest mean difference first
feature_comparison.loc[relevant_features, ['mean_diff_pct', 'median_diff_pct']].sort_values('mean_diff_pct', ascending=False)

In [None]:
# Overlay per-feature histograms for the two event groups so their
# distributions before large returns can be compared visually.
key_features = [
    'volume_imbalance',
    'relative_spread',
    'book_depth',
    'bid_price_impact',
    'ask_price_impact',
]

for feature in key_features:
    fig, ax = plt.subplots(figsize=(12, 6))

    # One semi-transparent histogram per group; missing values are dropped
    ax.hist(positive_returns[feature].dropna(), bins=20, alpha=0.5, label='Positive Returns')
    ax.hist(negative_returns[feature].dropna(), bins=20, alpha=0.5, label='Negative Returns')

    ax.set_title(f'Distribution of {feature} Before Large Return Events')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [None]:
# Correlate each relevant orderbook feature with the return value: take the
# 'return_value' column of the correlation matrix and drop its self-entry.
correlation_matrix = pre_event_df[[*relevant_features, 'return_value']].corr()
correlation = correlation_matrix.loc[:, 'return_value'].drop('return_value')

# Rank features by correlation magnitude, strongest first
correlation_sorted = correlation.abs().sort_values(ascending=False)

# Print the signed correlations in ranked order
print("Correlation between orderbook features and subsequent returns:")
for feature, value in correlation.loc[correlation_sorted.index].items():
    print(f"{feature}: {value:.4f}")

# Horizontal bar chart of the signed correlations, sorted ascending
plt.figure(figsize=(12, 8))
correlation.sort_values().plot.barh()
plt.title('Correlation Between Orderbook Features and Subsequent Returns')
plt.xlabel('Correlation')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Scatter each of the three features with the largest absolute correlation
# against the return value.
top_features = correlation_sorted.head(3).index

for feature in top_features:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(pre_event_df[feature], pre_event_df['return_value'], alpha=0.6)
    ax.set_title(f'Relationship Between {feature} and Subsequent Returns')
    ax.set_xlabel(feature)
    ax.set_ylabel('Return Value')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

### Statistical Tests

Let's run Welch's two-sample t-test (which does not assume equal variances) on each orderbook feature to determine whether its values differ significantly before large positive versus large negative returns.

In [None]:
# Import statistical testing libraries
from scipy import stats

# Run a two-sample t-test per feature, comparing the values observed before
# positive return events with those observed before negative return events.
t_test_results = {}

for feature in relevant_features:
    # Drop missing observations independently in each group
    pos_data = positive_returns[feature].dropna()
    neg_data = negative_returns[feature].dropna()

    # equal_var=False selects Welch's t-test (no equal-variance assumption)
    t_stat, p_value = stats.ttest_ind(pos_data, neg_data, equal_var=False)

    # Store plain Python scalars so each column gets a clean dtype below.
    # A NaN p-value compares False, so such a feature is marked as
    # not significant.
    t_test_results[feature] = {
        't_statistic': float(t_stat),
        'p_value': float(p_value),
        'significant': bool(p_value < 0.05),
    }

# Build the table with one row per feature. from_dict(orient='index') infers a
# dtype per column (float, float, bool). Transposing a frame built from these
# mixed-type records would instead leave every column as dtype=object.
t_test_df = pd.DataFrame.from_dict(
    t_test_results,
    orient='index',
    columns=['t_statistic', 'p_value', 'significant'],
)
t_test_df = t_test_df.sort_values('p_value')

# Display results
print("T-test results for orderbook features (positive vs negative returns):")
t_test_df

### Feature Importance

Let's fit a random forest classifier and use its feature importances to gauge which orderbook features are most useful for predicting the direction of large returns.

In [None]:
# Import machine learning libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Prepare the data
X = pre_event_df[relevant_features].copy()
y = (pre_event_df['return_value'] > 0).astype(int)  # 1 for positive returns, 0 for negative

# Split the data BEFORE imputing, so that statistics of the held-out rows
# cannot leak into the training data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Handle missing values using means estimated on the training split only.
# The same training means fill the test split and the full matrix X, so X
# still has its missing values filled for anything that uses it afterwards.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)
X = X.fillna(train_means)

# Train a random forest classifier (fixed seed for reproducibility)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions on the held-out split
y_pred = rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Get feature importances, most important first
feature_importances = pd.DataFrame({
    'feature': X.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

# Plot feature importances
plt.figure(figsize=(12, 8))
plt.barh(feature_importances['feature'], feature_importances['importance'])
plt.title('Feature Importance for Predicting Return Direction')
plt.xlabel('Importance')
plt.grid(True)
plt.tight_layout()
plt.show()

## Summary of Findings

Let's summarize our findings about the relationship between orderbook features and large returns.

In [None]:
# Features flagged as significant by the t-tests
significant_features = t_test_df.loc[t_test_df['significant']].index.tolist()

# Five highest-ranked features from the random forest importances
top_importance_features = feature_importances['feature'].head(5).tolist()

# Five features with the largest absolute correlation with returns
top_correlated_features = correlation_sorted.index[:5].tolist()

print("Summary of Findings:")

# 1. T-test outcome plus the per-group means for each significant feature
print("\n1. Statistically significant differences between positive and negative returns:")
for feature in significant_features:
    t_stat, p_value = t_test_df.loc[feature, ['t_statistic', 'p_value']]
    pos_mean = positive_returns[feature].mean()
    neg_mean = negative_returns[feature].mean()
    print(f"   - {feature}: t={t_stat:.4f}, p={p_value:.4f}")
    print(f"     Positive returns mean: {pos_mean:.4f}, Negative returns mean: {neg_mean:.4f}")

# 2. Signed correlation for the most correlated features
print("\n2. Top features by correlation with returns:")
for feature in top_correlated_features:
    print(f"   - {feature}: {correlation.loc[feature]:.4f}")

# 3. Importance scores, paired row-by-row with the top feature names
print("\n3. Top features by importance in predicting return direction:")
for feature, importance in zip(top_importance_features, feature_importances['importance'].head(5)):
    print(f"   - {feature}: {importance:.4f}")

# 4. Hold-out accuracy of the classifier
print("\n4. Model performance:")
print(f"   - Accuracy: {accuracy:.4f}")

print("\n5. Key insights:")
# These will be filled in based on the actual results from the analysis

## Save Results

Let's save the pre-event orderbook states and analysis results for future use.

In [None]:
# Make sure the target directory exists before writing anything
output_dir = '../data'
os.makedirs(output_dir, exist_ok=True)

# Pre-event orderbook states (row index not written)
pre_event_path = os.path.join(output_dir, 'squid_pre_event_orderbook_states.csv')
pre_event_df.to_csv(pre_event_path, index=False)
print(f"Pre-event orderbook states saved to {pre_event_path}")

# Feature importance results (row index not written)
importances_path = os.path.join(output_dir, 'squid_feature_importances.csv')
feature_importances.to_csv(importances_path, index=False)
print(f"Feature importances saved to {importances_path}")

# T-test results (index kept: it holds the feature names)
t_test_path = os.path.join(output_dir, 'squid_t_test_results.csv')
t_test_df.to_csv(t_test_path)
print(f"T-test results saved to {t_test_path}")