# Exploratory Data Analysis and Feature Engineering

This notebook explores the `processed_data.csv` file, visualizes key relationships, and engineers new features for our machine learning models.

In [None]:
import pandas as pd
import numpy as np

# Read the pipeline output into `df`, parsing the `Date` column as datetimes.
# Prerequisite: `run_pipeline.py --skip-db` must have been run to generate the CSV.
try:
    df = pd.read_csv('../data/processed_data.csv', parse_dates=['Date'])
except FileNotFoundError:
    # Leave `df` as None so the `if df is not None` cells below skip themselves.
    print("Error: `data/processed_data.csv` not found.")
    print("Please run `run_pipeline.py --skip-db` from the root directory first.")
    df = None
else:
    # Only reached when the read succeeded, i.e. `df` holds the loaded frame.
    print("Data loaded successfully!")
    print("\n--- DataFrame Info ---")
    df.info()

In [None]:
# Summary statistics for the loaded data; the cell is a no-op when loading
# failed and `df` was left as None.
if df is not None:
    print("--- Descriptive Statistics ---")
    # describe() is called with its default arguments. `display` is not imported
    # in this notebook; presumably it is the one the Jupyter/IPython kernel provides.
    display(df.describe())

In [None]:
# Correlation of every numeric column with `Close`, largest value first.
if df is not None:
    print("--- Correlation Matrix ---")
    # Restrict to numeric dtypes before computing pairwise correlations.
    numeric_cols = df.select_dtypes(include=[np.number])
    correlation_matrix = numeric_cols.corr()
    # Show only the `Close` column of the matrix, sorted in descending order.
    display(
        correlation_matrix.loc[:, ['Close']]
        .sort_values('Close', ascending=False)
    )

## Feature Engineering

In [None]:
if df is not None:
    print("--- Creating New Features ---")
    # Every per-ticker feature below relies on rows being in date order within a ticker.
    df.sort_values(['Ticker', 'Date'], inplace=True)

    # Percentage change in Close over 1 and 5 rows, computed within each ticker
    df['price_change_1d'] = df.groupby('Ticker')['Close'].pct_change(periods=1)
    df['price_change_5d'] = df.groupby('Ticker')['Close'].pct_change(periods=5)

    # Sentiment scores taken from the previous row of the same ticker
    df['vader_1d_lag'] = df.groupby('Ticker')['vader_avg_score'].shift(periods=1)
    df['finbert_1d_lag'] = df.groupby('Ticker')['finbert_avg_score'].shift(periods=1)

    # Simple moving averages of Close over 7 and 30 rows (NaN until the window is full)
    df['sma_7d'] = df.groupby('Ticker')['Close'].transform(lambda closes: closes.rolling(7).mean())
    df['sma_30d'] = df.groupby('Ticker')['Close'].transform(lambda closes: closes.rolling(30).mean())
    
    # RSI (Relative Strength Index)
    def calculate_rsi(series, window=14):
        """Relative Strength Index using simple moving averages of gains and losses.

        Args:
            series: pandas Series of prices in chronological order.
            window: number of consecutive price changes averaged per RSI value.

        Returns:
            A Series aligned with `series`. Values are NaN until `window` real
            price changes are available (i.e. the first `window` rows), 100 when
            the window contains gains but no losses, and NaN when the window
            contains no movement at all (0 / 0).
        """
        delta = series.diff(1)
        # clip() keeps NaN deltas as NaN (the first row, or gaps in the prices),
        # so a missing change is not counted as a zero-change observation and the
        # first RSI value is based on a full window of real changes.
        gain = delta.clip(lower=0).rolling(window=window).mean()
        loss = (-delta).clip(lower=0).rolling(window=window).mean()
        # loss == 0 with gain > 0 yields inf here, which maps to an RSI of 100.
        rs = gain / loss
        return 100 - (100 / (1 + rs))
    # Run the RSI helper (default window) on each ticker's Close series separately.
    df['rsi_14d'] = df.groupby('Ticker')['Close'].transform(calculate_rsi)

    print("New features created.")

## Target Variable Creation

In [None]:
if df is not None:
    print("--- Creating Target Variable ---")
    # Binary label: 1 when the same ticker's next row closes above this row, else 0.
    df['next_day_close'] = df.groupby('Ticker')['Close'].shift(periods=-1)
    # NOTE: where `next_day_close` is NaN (e.g. each ticker's last row) the
    # comparison is False, so those rows receive 0 here rather than a missing label.
    df['target'] = df['next_day_close'].gt(df['Close']).astype(int)

    print("Target variable 'target' created.")

## Final Dataset Preparation

In [None]:
if df is not None:
    print("--- Preparing Final ML Dataset ---")
    # Model inputs written to the ML-ready file
    feature_cols = [
        'price_change_1d', 'price_change_5d',
        'vader_1d_lag', 'finbert_1d_lag',
        'sma_7d', 'sma_30d', 'rsi_14d',
        'article_count'
    ]
    export_cols = feature_cols + ['target', 'Date', 'Ticker']

    # Drop only rows that are unusable for modelling: a missing value in an
    # exported column (lag / rolling-window warm-up) or a missing next-day close,
    # for which the up/down label is undefined. NaNs in columns that are not
    # exported do not discard otherwise valid rows.
    ml_df = df.dropna(subset=export_cols + ['next_day_close'])

    # Keep the features, the label, and the Date/Ticker identifiers
    final_df = ml_df[export_cols]

    # Save the final dataset
    output_path = '../data/ml_ready_data.csv'
    final_df.to_csv(output_path, index=False)

    print(f"Final ML-ready dataset created with {len(final_df)} rows.")
    print(f"Saved to {output_path}")
    display(final_df.head())