# S&P 500 LSTM Stock Prediction - Exploration Notebook

This notebook demonstrates the key capabilities of the kevin-lstm project:
- Loading and exploring processed data
- Visualizing technical indicators
- Analyzing model predictions
- Comparing LSTM vs baseline models

## 1. Load and Inspect Processed Data

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_loader import get_default_processed_csv_path

# Read the processed dataset from the project's default CSV location.
# The first CSV column becomes the index and is parsed as dates; the original
# notebook describes this file as holding 12 technical indicators.
processed_path = get_default_processed_csv_path()
df = pd.read_csv(processed_path, parse_dates=True, index_col=0)

# Quick orientation: size, covered time span, and the available columns.
first_date, last_date = df.index.min(), df.index.max()
column_names = list(df.columns)
print(f"Dataset shape: {df.shape}")
print(f"Date range: {first_date} to {last_date}")
print(f"\nFeatures: {column_names}")

# Last expression of the cell: rich preview of the first rows.
df.head()

## 2. Explore Target Distribution

In [None]:
# Inspect both prediction targets side by side: the regression target
# (next_day_return) and the classification target (next_day_direction).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Regression target: histogram of next_day_return with a marker at zero.
axes[0].hist(df['next_day_return'], bins=50, edgecolor='black', alpha=0.7)
axes[0].set_title('Distribution of Next-Day Returns (%)', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Return (%)')
axes[0].set_ylabel('Frequency')
axes[0].axvline(0, color='red', linestyle='--', label='Zero return')
axes[0].legend()

# Classification target: number of rows per direction class.
# value_counts() orders its result by frequency (most common class first),
# not by class label. The counts are therefore reindexed to a fixed [0, 1]
# order; otherwise the 'Down (0)' / 'Up (1)' labels and colors below would be
# attached to the wrong counts whenever class 1 is the more frequent one.
# fill_value=0 keeps both bars present even if one class never occurs.
# Assumes the column is encoded as 0/1, as the bar labels indicate.
direction_counts = (
    df['next_day_direction']
    .value_counts()
    .reindex([0, 1], fill_value=0)
)
axes[1].bar(['Down (0)', 'Up (1)'], direction_counts.values, color=['#d32f2f', '#388e3c'])
axes[1].set_title('Direction Distribution', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Count')
for i, v in enumerate(direction_counts.values):
    # Write each count just above its bar.
    axes[1].text(i, v + 10, str(v), ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

# Numeric summary of the same two targets.
print("\nTarget statistics:")
print(f"Mean return: {df['next_day_return'].mean():.4f}%")
print(f"Std return: {df['next_day_return'].std():.4f}%")

# Explicit label-based lookups (.loc) so 0/1 are never treated as positions.
down_days = direction_counts.loc[0]
up_days = direction_counts.loc[1]
print("\nDirection balance:")
print(f"Down days: {down_days} ({down_days/len(df)*100:.1f}%)")
print(f"Up days: {up_days} ({up_days/len(df)*100:.1f}%)")

## 3. Visualize Key Technical Indicators

In [None]:
# Three stacked time-series panels, one per indicator: RSI, MACD, Bollinger %B.
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(14, 10))
rsi_ax, macd_ax, bbp_ax = axes
dates = df.index

# Panel 1 - RSI with the 70 / 30 reference levels.
rsi_ax.plot(dates, df['rsi_14'], linewidth=1, color='blue')
for level, level_color, level_label in ((70, 'red', 'Overbought'), (30, 'green', 'Oversold')):
    rsi_ax.axhline(level, color=level_color, linestyle='--', alpha=0.5, label=level_label)
rsi_ax.set_title('RSI (14-day Relative Strength Index)', fontweight='bold')
rsi_ax.set_ylabel('RSI')

# Panel 2 - MACD line over its histogram, with a zero baseline.
macd_ax.plot(dates, df['macd'], linewidth=1, color='blue', label='MACD')
macd_ax.bar(dates, df['macd_h'], alpha=0.3, color='gray', label='MACD Histogram')
macd_ax.axhline(0, color='black', linestyle='-', linewidth=0.5)
macd_ax.set_title('MACD (Moving Average Convergence Divergence)', fontweight='bold')
macd_ax.set_ylabel('MACD')

# Panel 3 - Bollinger %B with reference lines at 1 and 0.
bbp_ax.plot(dates, df['bbp'], linewidth=1, color='purple')
for level, level_color, level_label in ((1, 'red', 'Upper band'), (0, 'green', 'Lower band')):
    bbp_ax.axhline(level, color=level_color, linestyle='--', alpha=0.5, label=level_label)
bbp_ax.set_title('Bollinger %B', fontweight='bold')
bbp_ax.set_ylabel('%B')
bbp_ax.set_xlabel('Date')

# Shared finishing touches: legend and a light grid on every panel.
for panel in axes:
    panel.legend()
    panel.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Feature Correlation Analysis

In [None]:
# Columns included in the pairwise correlation analysis.
feature_cols = [
    'rsi_14', 'macd', 'macd_h', 'bbl', 'bbp',
    'sma_50', 'ema_20', 'obv', 'close_norm',
    'volume_norm', 'lagged_log_return', 'atr_14'
]

corr_matrix = df[feature_cols].corr()

# Annotated heatmap, color scale centered on zero correlation.
corr_fig, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={'label': 'Correlation'},
    ax=corr_ax,
)
corr_ax.set_title('Feature Correlation Matrix', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Collect every unordered feature pair whose absolute correlation exceeds 0.8.
# Only the upper triangle (col > row) is scanned, so each pair appears once
# and the diagonal is skipped.
n_features = len(corr_matrix.columns)
high_corr = [
    (corr_matrix.columns[row], corr_matrix.columns[col], corr_matrix.iloc[row, col])
    for row in range(n_features)
    for col in range(row + 1, n_features)
    if abs(corr_matrix.iloc[row, col]) > 0.8
]

# Report the pairs; nothing is printed when no pair crosses the threshold.
if high_corr:
    print("\nHighly correlated feature pairs (|r| > 0.8):")
    for left_name, right_name, r_value in high_corr:
        print(f"  {left_name} <-> {right_name}: {r_value:.3f}")

## 5. Load Model Comparison Results

In [None]:
# Read the per-model metrics table written to the results directory.
results_path = '../results/model_comparison.csv'
results_df = pd.read_csv(results_path)

print("Model Performance Comparison:")
print("=" * 70)
print(results_df.to_string(index=False))

# One bar chart per metric, laid out on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

metrics = ['rmse', 'mae', 'accuracy', 'f1']
titles = ['RMSE (Lower is Better)', 'MAE (Lower is Better)',
          'Accuracy (Higher is Better)', 'F1 Score (Higher is Better)']
colors = ['#e74c3c', '#e67e22', '#27ae60', '#3498db']

# axes.flat walks the grid row by row, so metrics fill left-to-right, top-to-bottom.
for ax, metric, title, color in zip(axes.flat, metrics, titles, colors):
    bars = ax.bar(results_df['model'], results_df[metric], color=color, alpha=0.7)
    ax.set_title(title, fontweight='bold', fontsize=12)
    ax.set_ylabel(metric.upper())
    ax.grid(axis='y', alpha=0.3)

    # Annotate each bar with its value, centered on the bar's top edge.
    for bar in bars:
        bar_top = bar.get_height()
        bar_center = bar.get_x() + bar.get_width()/2.
        ax.text(bar_center, bar_top, f'{bar_top:.3f}',
                ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

## 6. Analyze LSTM Learning Curves

In [None]:
# Render the saved learning-curves figure inline in the notebook.
from IPython.display import Image, display

learning_curves_path = '../results/learning_curves.png'
learning_curves_image = Image(filename=learning_curves_path)

print("LSTM Training Progress:")
display(learning_curves_image)

## 7. Confusion Matrix Comparison

In [None]:
# Show every saved confusion-matrix image found in the results directory.
import glob
from IPython.display import Image, display

# Sorted so the display order is stable between runs.
confusion_matrices = sorted(glob.glob('../results/confusion_matrix_*.png'))
n_matrices = len(confusion_matrices)

print(f"Direction Classification Performance ({n_matrices} models):")
print("=" * 70)

for cm_path in confusion_matrices:
    # Derive a display name from the file name: take the text after the
    # 'confusion_matrix_' prefix, drop '.png', and title-case the words.
    name_part = cm_path.split('confusion_matrix_')[1]
    model_name = name_part.replace('.png', '').replace('_', ' ').title()
    print(f"\n{model_name}:")
    display(Image(filename=cm_path, width=400))

## 8. Summary and Insights

In [None]:
# Print a plain-text wrap-up of the dataset, the evaluated models, and the
# best score per metric. Relies on `df` (processed dataset) and `results_df`
# (model comparison table) loaded by earlier cells.
# The section-header emoji were previously mojibake (UTF-8 bytes decoded with
# the wrong codec); they are restored to the intended characters here.
print("PROJECT SUMMARY")
print("=" * 70)

print("\n📊 Dataset:")
print(f"  - Total samples: {len(df):,}")
print(f"  - Date range: {df.index.min().strftime('%Y-%m-%d')} to {df.index.max().strftime('%Y-%m-%d')}")
print("  - Features: 12 technical indicators")
print("  - Targets: Regression (return %) + Classification (direction)")

print("\n🤖 Models Evaluated:")
for model_name in results_df['model']:
    print(f"  - {model_name}")

# Pick the winning row per metric: lowest RMSE, highest accuracy, highest F1.
best_rmse = results_df.loc[results_df['rmse'].idxmin()]
best_accuracy = results_df.loc[results_df['accuracy'].idxmax()]
best_f1 = results_df.loc[results_df['f1'].idxmax()]

print("\n🏆 Best Performance:")
print(f"  - Best RMSE: {best_rmse['model']} ({best_rmse['rmse']:.4f})")
print(f"  - Best Accuracy: {best_accuracy['model']} ({best_accuracy['accuracy']:.4f})")
print(f"  - Best F1 Score: {best_f1['model']} ({best_f1['f1']:.4f})")

# Static description of the modelling setup.
# NOTE(review): these values are hard-coded text, not read from the model
# configuration - confirm they still match the training code.
print("\n✅ Key Features:")
print("  - 2-layer LSTM with dropout regularization")
print("  - MinMaxScaler for feature normalization")
print("  - 64-day lookback window for temporal patterns")
print("  - Early stopping and learning rate reduction")
print("  - Reproducible results (random_state=42)")

print("\n📈 Visualizations Generated:")
print("  - Learning curves (MSE and MAE over epochs)")
print("  - Confusion matrices for all models")
print("  - Model comparison charts")

print("\n" + "=" * 70)