# Data Exploration for LLM Finance Predictor

This notebook explores the financial data used for training the LLM Finance Predictor model.

## Overview
- Load and examine raw financial data
- Analyze price patterns and technical indicators
- Explore news data and sentiment analysis (planned; the sections below do not cover news or sentiment yet)
- Visualize data distributions and correlations


In [None]:
# Import necessary libraries
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import yfinance as yf

# Import our custom modules
from data.loader import FinancialDataLoader
from data.preprocess import FinancialDataPreprocessor
from utils.metrics import FinancialMetrics

# Set up plotting.
# Matplotlib 3.6 renamed its bundled seaborn style sheets with a
# 'seaborn-v0_8' prefix; older releases only ship the un-prefixed 'seaborn'
# name. Use whichever one this installation provides so the cell does not
# fail with an OSError on older Matplotlib versions.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
# The palette is set after the style sheet, in the same order as before.
sns.set_palette("husl")

print("Libraries imported successfully!")


## 1. Load Financial Data

Let's start by loading some sample financial data for exploration.


In [None]:
# Build the loader used to fetch price history.
loader = FinancialDataLoader()

# Tickers to explore and the calendar window to cover.
symbols = ['AAPL', 'GOOGL', 'MSFT', 'TSLA', 'AMZN']
start_date, end_date = '2023-01-01', '2023-12-31'

print(f"Loading data for symbols: {symbols}")
print(f"Date range: {start_date} to {end_date}")

# Fetch the price data; the result is iterated below as a
# symbol -> frame mapping.
price_data = loader.load_price_data(symbols, start_date, end_date)

# Report how many symbols came back and how many rows each one has.
print(f"\nLoaded data for {len(price_data)} symbols")
for ticker, frame in price_data.items():
    record_count = len(frame)
    print(f"{ticker}: {record_count} records")


## 2. Explore Price Data

Let's examine the price data structure and basic statistics.


In [None]:
# Pick one symbol and inspect the layout of its price frame.
symbol = 'AAPL'
df = price_data[symbol]

# Gather the structural facts first, then report them.
column_names = list(df.columns)
first_index, last_index = df.index.min(), df.index.max()

print(f"Price data structure for {symbol}:")
print(f"Shape: {df.shape}")
print(f"Columns: {column_names}")
print(f"Date range: {first_index} to {last_index}")

# Leading rows give a feel for the raw values.
print("\nFirst few rows:")
print(df.head())

# Per-column summary statistics.
print("\nBasic statistics:")
print(df.describe())


## 3. Visualize Price Trends

Let's create visualizations to understand price patterns.


In [None]:
# Plot the closing-price trend of every loaded symbol, one panel per symbol.
# The grid grows with the number of symbols (three panels per row) instead of
# being fixed at 2x3, so symbols beyond the sixth are no longer silently
# skipped. With five symbols this yields the same 2x3 grid and 18x12 figure.
n_cols = 3
n_panels = len(price_data)
n_rows = max(1, -(-n_panels // n_cols))  # ceiling division, at least one row
# squeeze=False keeps `axes` two-dimensional even when there is a single row,
# so flatten() always works.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 6 * n_rows), squeeze=False)
axes = axes.flatten()

# Distinct loop names keep this cell from overwriting the notebook-level
# `symbol` and `df` variables.
for ax, (ticker, prices) in zip(axes, price_data.items()):
    ax.plot(prices.index, prices['Close'], label='Close Price', linewidth=2)
    ax.set_title(f'{ticker} Price Trend')
    ax.set_xlabel('Date')
    ax.set_ylabel('Price ($)')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Remove every panel that did not receive a symbol, not just the last one,
# so no blank axes remain when fewer symbols were loaded than expected.
for unused_ax in axes[n_panels:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


## 4. Calculate Technical Indicators

Let's add technical indicators to our data and explore them.


In [None]:
# Build the preprocessor that cleans prices and derives features/targets.
preprocessor = FinancialDataPreprocessor()

# Work on a copy of a single symbol's frame so the loaded data stays untouched.
symbol = 'AAPL'
df = price_data[symbol].copy()

# Pipeline: clean -> technical indicators -> target variables.
# Each stage keeps its own name so intermediate results can be inspected.
df_clean = preprocessor.clean_price_data(df)
df_with_indicators = preprocessor.create_technical_indicators(df_clean)
df_with_targets = preprocessor.create_target_variables(df_with_indicators)

# Columns present after processing that the input frame did not have.
added_columns = [col for col in df_with_targets.columns if col not in df.columns]

print(f"Data shape after processing: {df_with_targets.shape}")
print(f"New columns added: {added_columns}")

# Preview a handful of columns from the processed frame.
preview_columns = ['Close', 'SMA_20', 'RSI', 'MACD', 'price_direction_1d']
print("\nSample of processed data:")
print(df_with_targets[preview_columns].head(10))


## 5. Visualize Technical Indicators

Let's create visualizations for technical indicators.


In [None]:
# Three stacked panels: price with moving averages, RSI, and MACD.
fig, axes = plt.subplots(3, 1, figsize=(15, 12))
price_ax, rsi_ax, macd_ax = axes
dates = df_with_targets.index

# Panel 1: close price overlaid with the two simple moving averages.
price_ax.plot(dates, df_with_targets['Close'], label='Close Price', linewidth=2)
for sma_column, sma_label in (('SMA_20', 'SMA 20'), ('SMA_50', 'SMA 50')):
    price_ax.plot(dates, df_with_targets[sma_column], label=sma_label, alpha=0.7)
price_ax.set_title(f'{symbol} - Price and Moving Averages')
price_ax.set_ylabel('Price ($)')
price_ax.legend()
price_ax.grid(True, alpha=0.3)

# Panel 2: RSI with dashed reference lines at 70 and 30.
rsi_ax.plot(dates, df_with_targets['RSI'], label='RSI', color='orange', linewidth=2)
rsi_thresholds = (
    (70, 'r', 'Overbought (70)'),
    (30, 'g', 'Oversold (30)'),
)
for level, line_color, line_label in rsi_thresholds:
    rsi_ax.axhline(y=level, color=line_color, linestyle='--', alpha=0.7, label=line_label)
rsi_ax.set_title('RSI (Relative Strength Index)')
rsi_ax.set_ylabel('RSI')
rsi_ax.set_ylim(0, 100)
rsi_ax.legend()
rsi_ax.grid(True, alpha=0.3)

# Panel 3: MACD line, signal line, and histogram bars.
for macd_column, macd_label in (('MACD', 'MACD'), ('MACD_signal', 'Signal')):
    macd_ax.plot(dates, df_with_targets[macd_column], label=macd_label, linewidth=2)
macd_ax.bar(dates, df_with_targets['MACD_histogram'], label='Histogram', alpha=0.6)
macd_ax.set_title('MACD (Moving Average Convergence Divergence)')
macd_ax.set_xlabel('Date')
macd_ax.set_ylabel('MACD')
macd_ax.legend()
macd_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## 6. Analyze Target Variables

Let's examine the target variables we created for prediction.


In [None]:
# Analyze target variables: every column whose name contains
# 'price_direction' or 'price_change'.
target_columns = [
    col for col in df_with_targets.columns
    if 'price_direction' in col or 'price_change' in col
]

print("Target variable analysis:")
for col in target_columns:
    # target_columns was built from df_with_targets.columns, so no further
    # membership check is needed here.
    print(f"\n{col}:")
    print(f"  Non-null values: {df_with_targets[col].notna().sum()}")
    print(f"  Unique values: {df_with_targets[col].nunique()}")
    if 'direction' in col:
        # Direction columns are reported as class counts.
        print(f"  Value counts:")
        print(df_with_targets[col].value_counts())
    else:
        # Change columns are reported with summary statistics.
        print(f"  Statistics:")
        print(df_with_targets[col].describe())

# Plot distribution of direction predictions
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Direction distribution
ax1 = axes[0]
# value_counts() orders rows by frequency, so a fixed ['Down', 'Up'] label
# list would be attached to the wrong slices whenever the second class is the
# more common one. Sorting by the encoded value makes the order deterministic.
direction_counts = df_with_targets['price_direction_1d'].value_counts().sort_index()
if len(direction_counts) == 2:
    # NOTE(review): presumes the larger encoded value means "up" - confirm
    # against FinancialDataPreprocessor.create_target_variables.
    direction_labels = ['Down', 'Up']
else:
    # Any other number of classes: label slices with the raw values so the
    # label count always matches the slice count.
    direction_labels = [str(value) for value in direction_counts.index]
ax1.pie(direction_counts.values, labels=direction_labels, autopct='%1.1f%%', startangle=90)
ax1.set_title('Price Direction Distribution (1 day)')

# Price change distribution
ax2 = axes[1]
ax2.hist(df_with_targets['price_change_1d'].dropna(), bins=50, alpha=0.7, edgecolor='black')
ax2.set_title('Price Change Distribution (1 day)')
ax2.set_xlabel('Price Change')
ax2.set_ylabel('Frequency')
# Vertical reference line at zero separates losses from gains.
ax2.axvline(x=0, color='r', linestyle='--', alpha=0.7, label='No Change')
ax2.legend()

plt.tight_layout()
plt.show()


## 7. Correlation Analysis

Let's examine correlations between features and targets.


In [None]:
# Restrict to numeric columns and compute their pairwise correlations.
numeric_frame = df_with_targets.select_dtypes(include=[np.number])
numeric_columns = numeric_frame.columns
correlation_data = numeric_frame.corr()

# Heatmap of the lower triangle; the boolean mask hides the upper triangle
# (including the diagonal).
mask = np.triu(np.ones_like(correlation_data, dtype=bool))
heatmap_fig, heatmap_ax = plt.subplots(figsize=(15, 12))
sns.heatmap(
    correlation_data,
    mask=mask,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": .8},
    ax=heatmap_ax,
)
heatmap_ax.set_title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Keep only the two target columns, drop the targets' own rows, and rank the
# remaining rows by absolute correlation with the 1-day direction.
target_names = ['price_direction_1d', 'price_change_1d']
target_correlations = (
    correlation_data[target_names]
    .drop(target_names)
    .sort_values('price_direction_1d', key=abs, ascending=False)
)

print("Top correlations with price direction (1 day):")
print(target_correlations['price_direction_1d'].head(10))


## 8. Volatility Analysis

Let's analyze volatility patterns in the data.


In [None]:
# Derive daily percentage returns from the close price and their 20-row
# rolling standard deviation; both are stored as new columns on the frame.
daily_return = df_with_targets['Close'].pct_change()
df_with_targets['daily_return'] = daily_return
df_with_targets['volatility_20d'] = daily_return.rolling(window=20).std()

# Two stacked panels: raw daily returns on top, rolling volatility below.
fig, axes = plt.subplots(2, 1, figsize=(15, 10))
returns_ax, volatility_ax = axes
dates = df_with_targets.index

# Top panel: daily returns with a dashed zero line.
returns_ax.plot(dates, df_with_targets['daily_return'], alpha=0.7, linewidth=1)
returns_ax.set_title(f'{symbol} - Daily Returns')
returns_ax.set_ylabel('Daily Return')
returns_ax.axhline(y=0, color='r', linestyle='--', alpha=0.5)
returns_ax.grid(True, alpha=0.3)

# Bottom panel: 20-day rolling volatility.
volatility_ax.plot(dates, df_with_targets['volatility_20d'], color='red', linewidth=2)
volatility_ax.set_title('20-Day Rolling Volatility')
volatility_ax.set_xlabel('Date')
volatility_ax.set_ylabel('Volatility')
volatility_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Headline numbers for returns and volatility.
returns_column = df_with_targets['daily_return']
volatility_column = df_with_targets['volatility_20d']

print("Volatility Statistics:")
print(f"Average daily return: {returns_column.mean():.4f}")
print(f"Daily return std: {returns_column.std():.4f}")
print(f"Average 20-day volatility: {volatility_column.mean():.4f}")
print(f"Max 20-day volatility: {volatility_column.max():.4f}")


## 9. Summary and Insights

Let's summarize our findings and prepare for model training.


In [None]:
# Summary statistics for the symbol processed above.
# "Features" here means every column other than the five raw OHLCV columns,
# so the count includes the target columns as well.
raw_price_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
feature_count = len([col for col in df_with_targets.columns if col not in raw_price_columns])

print("=== DATA EXPLORATION SUMMARY ===")
print(f"Symbol analyzed: {symbol}")
# .date() is called on the index bounds, so the index must hold
# timestamp-like values.
print(f"Data period: {df_with_targets.index.min().date()} to {df_with_targets.index.max().date()}")
print(f"Total records: {len(df_with_targets)}")
print(f"Features created: {feature_count}")

print(f"\nTarget variable distribution:")
print(f"Price direction (1 day): {df_with_targets['price_direction_1d'].value_counts().to_dict()}")

# These insights read the 'daily_return' and 'volatility_20d' columns, which
# the volatility-analysis cell adds to the frame.
print(f"\nKey insights:")
print(f"1. Average daily return: {df_with_targets['daily_return'].mean():.4f}")
print(f"2. Average volatility: {df_with_targets['volatility_20d'].mean():.4f}")
print(f"3. RSI range: {df_with_targets['RSI'].min():.1f} - {df_with_targets['RSI'].max():.1f}")

# Save processed data for training.
# Create the target directory first: to_csv does not create missing parent
# directories and would otherwise fail on a fresh checkout.
output_path = '../data/processed/explored_data.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_with_targets.to_csv(output_path)
print(f"\nProcessed data saved to: {output_path}")

print("\n=== READY FOR MODEL TRAINING ===")
print("The data has been cleaned, features engineered, and target variables created.")
print("Next steps: Use this data for training the LLM Finance Predictor model.")
