# Feature Engineering for Stock Prediction

This notebook handles feature engineering and analysis for stock price prediction, including technical indicators and correlation analysis across target and peer assets.

In [1]:
# Standard library imports
import sys
import os
import logging
from datetime import datetime, timedelta

# Logging setup: INFO threshold, "<timestamp> - <level> - <message>" layout
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Put the parent directory (the project root) on the import search path so the
# `src` package imported further down can be resolved
sys.path.append('../')

# Third-party imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml

# Local imports from src
from src.data_loader import StockDataLoader
from src.features import FeatureEngineer

# Plot defaults: fivethirtyeight theme, 15x8 inch figures rendered at 100 dpi
plt.style.use('fivethirtyeight')
plt.rcParams.update({
    'figure.figsize': [15, 8],
    'figure.dpi': 100,
})

logger.info("Imports and configurations loaded successfully")

2025-06-01 09:29:30,632 - INFO - Imports and configurations loaded successfully


## 1. Data Loading

## Import and create stock features

In [2]:
# Build the technical-feature matrix for the target asset and its peers, then
# persist it. `logger` and every imported name used here come from the
# imports/configuration cell at the top of the notebook, so logging is not
# configured a second time in this cell.

# Load configuration
with open('../config.yaml', 'r') as file:
    config = yaml.safe_load(file)

# Initialize components: the loader receives the config path, the feature
# engineer receives the parsed config
data_loader = StockDataLoader('../config.yaml')
feature_engineer = FeatureEngineer(config)

# Date range: from the start of 2024 up to now; formatted once for all fetches
end_date = datetime.now()
start_date = datetime(2024, 1, 1)
start_str = start_date.strftime('%Y-%m-%d')
end_str = end_date.strftime('%Y-%m-%d')

# Symbols from config; the target is filtered out of the peer list so it is
# never loaded twice
target_symbol = config['target_symbol']
peer_symbols = [symbol for symbol in config['peer_symbols'] if symbol != target_symbol]

logger.info(f"Target Symbol: {target_symbol}")
logger.info(f"Filtered Peer Symbols: {', '.join(peer_symbols)}")

# Load price data for the target and for every peer
target_asset = data_loader.fetch_stock_data(target_symbol, start_str, end_str)
peer_assets = {
    symbol: data_loader.fetch_stock_data(symbol, start_str, end_str)
    for symbol in peer_symbols
}

# Features for the target asset, with columns namespaced by ticker
target_features = feature_engineer.engineer_features(target_asset)
target_features.columns = [f'{target_symbol}_{col}' for col in target_features.columns]

# Features for each peer, namespaced the same way
peer_features_list = []
for symbol, data in peer_assets.items():
    features = feature_engineer.engineer_features(data)
    features.columns = [f'{symbol}_{col}' for col in features.columns]
    peer_features_list.append(features)

# Combine all peer features into a single DataFrame. pd.concat raises on an
# empty list, so fall back to a column-less frame on the target's index when
# the config lists no peers other than the target itself.
if peer_features_list:
    peer_features = pd.concat(peer_features_list, axis=1)
else:
    peer_features = pd.DataFrame(index=target_features.index)

# Combine target and peer features
all_features = pd.concat([target_features, peer_features], axis=1)

# A bare `.head()` in the middle of a cell displays nothing, so report the
# size instead; preview rows with `all_features.head()` as the last line of
# its own cell.
logger.info(f"Combined feature matrix shape: {all_features.shape}")

# Create data/processed directory if it doesn't exist
os.makedirs('../data/processed', exist_ok=True)

# Save features to pickle file
all_features.to_pickle('../data/processed/all_features.pkl')
print("\nFeatures saved to '../data/processed/all_features.pkl'")

2025-06-01 09:29:30,693 - INFO - Target Symbol: MSFT
2025-06-01 09:29:30,695 - INFO - Filtered Peer Symbols: AAPL, AMZN, GOOGL, META, ORCL, IBM, CRM, ADBE, NVDA, INTC, SPY, QQQ, XLK, VTI



Features saved to '../data/processed/all_features.pkl'


## Import and create sentiment features

## Note on Sentiment Analysis
Sentiment analysis has been temporarily removed from the feature pipeline due to API limitations (the sentiment cells below are kept only as exploratory scaffolding):
- Free tier only provides 30 days of historical data
- Paid API access is cost-prohibitive for this project
- Focus will be on technical indicators for initial model development

Future enhancements may include:
- Alternative news data sources
- Social media sentiment analysis
- Web scraping implementation

In [3]:
# Create the SentimentAnalyzer once config.yaml has passed the two checks
# below; any failure is logged and then re-raised so the cell stops visibly.
try:
    with open('../config.yaml', 'r') as config_file:
        config = yaml.safe_load(config_file)

    # Check 1: a 'sentiment' section has to be present
    has_sentiment_section = 'sentiment' in config
    if not has_sentiment_section:
        raise ValueError("Missing 'sentiment' section in config.yaml")

    # Check 2: that section has to carry a non-empty API key
    if not config['sentiment'].get('api_key'):
        raise ValueError("Missing or empty API key in sentiment configuration")

    # Imported here, after validation, exactly where it is first needed
    from src.features.sentiment.analyzer import SentimentAnalyzer
    sentiment_analyzer = SentimentAnalyzer(config)

    logger.info("Sentiment analyzer initialized successfully")

except Exception as init_error:
    logger.error(f"Failed to initialize sentiment analyzer: {init_error}")
    raise


2025-06-01 09:29:40,142 - INFO - Initialized SentimentAnalyzer with endpoint: https://newsapi.org/v2
2025-06-01 09:29:40,143 - INFO - Sentiment analyzer initialized successfully


In [11]:
# Fetch sentiment features for the configured target symbol over the most
# recent 30 calendar days (NewsAPI's 30-day history limit).
# Note: this rebinds `start_date` / `end_date`, which later cells read.
end_date = datetime.now()
start_date = end_date - timedelta(days=29)     # 29 days back + the end date itself = 30 days

print(f"\nFetching sentiment data for {target_symbol} from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")

try:
    # NOTE(review): arguments are passed as (symbol, end, start), matching the
    # weekly loop later in the notebook — confirm against the
    # SentimentAnalyzer.get_sentiment_features signature.
    features = sentiment_analyzer.get_sentiment_features(target_symbol, end_date, start_date)

    # Debug output: scalar features verbatim, the article list summarised
    print("\nFeatures retrieved:")
    for key, value in features.items():
        if key != 'articles':  # Skip printing full article list
            print(f"{key}: {value}")
        else:
            print(f"Total articles: {len(value)}")
            if len(value) > 0:
                print("\nSample articles:")
                for article in value[:3]:
                    # .get keeps one incomplete article from aborting the printout
                    print(f"- {article.get('publishedAt')}: {article.get('title')}")

except Exception as e:
    logger.error(f"Error fetching sentiment data: {str(e)}")


Fetching sentiment data for MSFT from 2025-05-03 to 2025-06-01


2025-06-01 09:34:13,758 - ERROR - Error fetching news data: 426 Client Error: Upgrade Required for url: https://newsapi.org/v2/everything?q=%22MSFT%22+AND+%28stock+OR+market+OR+trading%29&from=2025-05-03&to=2025-06-01&language=en&sortBy=publishedAt&page=2&pageSize=100&apiKey=<REDACTED>



Features retrieved:
sentiment_score: 0.6068159999999999
sentiment_magnitude: 0.17207999999999998
article_count: 100
Total articles: 100

Sample articles:
- 2025-05-31T11:01:03Z: Top Technology Stocks To Follow Today – May 29th
- 2025-05-31T11:00:00Z: Nvidia can't be stopped, Apple falls behind, and the AI data center race: Tech news roundup
- 2025-05-31T08:46:01Z: BI Investor of the Month: Meet the fund manager up 28% over the last year


In [None]:
def test_sentiment_dates(symbol, start_date, end_date):
    """Test sentiment retrieval for a date range"""
    print(f"\nTesting sentiment for {symbol} from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    
    try:
        # Get raw sentiment data
        raw_data = sentiment_analyzer.get_raw_sentiment(symbol)
        
        if raw_data and 'articles' in raw_data:
            # Filter articles by date range after retrieval
            articles = raw_data['articles']
            filtered_articles = [
                article for article in articles
                if start_date.strftime('%Y-%m-%d') <= article.get('publishedAt', '').split('T')[0] <= end_date.strftime('%Y-%m-%d')
            ]
            
            # Process filtered articles
            if filtered_articles:
                dates = [article.get('publishedAt', '').split('T')[0] for article in filtered_articles]
                
                print(f"\nArticle date distribution:")
                date_counts = pd.Series(dates).value_counts().sort_index()
                print(date_counts)
                
                print(f"\nDate range summary:")
                print(f"Total articles retrieved: {len(articles)}")
                print(f"Articles in date range: {len(filtered_articles)}")
                print(f"Earliest article: {min(dates) if dates else 'No articles'}")
                print(f"Latest article: {max(dates) if dates else 'No articles'}")
                
                # Sample some articles
                if filtered_articles:
                    print("\nSample articles:")
                    for article in filtered_articles[:3]:
                        print(f"- {article['publishedAt']}: {article['title']}")
                
                return {'articles': filtered_articles, 'totalResults': len(filtered_articles)}
            
        print("\nNo articles found in date range")
        return None
        
    except Exception as e:
        logger.error(f"Failed to retrieve sentiment data: {str(e)}")
        return None

# Exercise test_sentiment_dates over two look-back windows for the configured
# target symbol. One timestamp anchors both windows, so the bounds are
# mutually consistent instead of coming from four separate now() calls.
window_end = datetime.now()

# Last 7 days
test_week_start = window_end - timedelta(days=7)
test_week_end = window_end
print("\nTesting last week:")
weekly_data = test_sentiment_dates(target_symbol, test_week_start, test_week_end)

# Last 30 days (NewsAPI limits history to the last 30 days)
test_month_start = window_end - timedelta(days=30)
test_month_end = window_end
print("\nTesting last month:")
monthly_data = test_sentiment_dates(target_symbol, test_month_start, test_month_end)

In [None]:
# Weekly sentiment features for the configured target symbol, one record per
# week-end date between `start_date` and `end_date`.
import traceback  # standard library; needed by the except branch below

try:
    from tqdm import tqdm  # progress bar; not imported by any earlier cell
except ImportError:
    def tqdm(iterable, **kwargs):
        """Fallback used when tqdm is not installed: iterate without a progress bar."""
        return iterable

week_ends = pd.date_range(start_date, end_date, freq='W')
weekly_sentiments = []
for date in tqdm(week_ends, desc=f"Processing {target_symbol}"):
    try:
        # Get sentiment for the 7 days ending on `date`
        week_start = date - timedelta(days=7)
        features = sentiment_analyzer.get_sentiment_features(target_symbol, date, week_start)

        # Add date range to debug output
        print(f"\nWeek {week_start.strftime('%Y-%m-%d')} to {date.strftime('%Y-%m-%d')}:")
        if 'articles' in features:
            print(f"Articles found: {len(features['articles'])}")
            for article in features['articles'][:3]:
                print(f"- {article['publishedAt']}: {article['title']}")

        weekly_sentiments.append({
            'sentiment_score': features['sentiment_score'],
            'sentiment_magnitude': features['sentiment_magnitude'],
            'article_count': features['article_count']
        })

    except Exception as e:
        logger.error(f"Error for week of {date}: {str(e)}")
        logger.error(f"Full traceback: {traceback.format_exc()}")
        # Keep one placeholder row per week so records stay aligned with week_ends
        weekly_sentiments.append({
            'sentiment_score': None,
            'sentiment_magnitude': None,
            'article_count': None
        })

# Every iteration appends exactly one record, so the records line up with
# `week_ends`; index them by date so the weekly values can be used as a series.
# NOTE(review): presumably this is the frame a later cell previews as
# `sentiment_df` — confirm the intended name and shape.
sentiment_df = pd.DataFrame(
    weekly_sentiments,
    index=pd.Index(week_ends, name='week_end'),
    columns=['sentiment_score', 'sentiment_magnitude', 'article_count'],
)

## Visualize sentiment data

In [None]:

# Three stacked panels, one per sentiment metric, each overlaying every
# tracked symbol. Reads `sentiment_data`, which must provide one
# '<symbol>_<metric>' column per symbol and metric.
fig, axes = plt.subplots(3, 1, figsize=(15, 15))

# (column suffix, panel title, y-axis label) in top-to-bottom panel order
panel_specs = [
    ('sentiment_score', 'Sentiment Scores Over Time', 'Sentiment Score'),
    ('sentiment_magnitude', 'Sentiment Magnitude Over Time', 'Magnitude'),
    ('article_count', 'Article Count Over Time', 'Number of Articles'),
]

for panel, (metric, panel_title, y_label) in zip(axes, panel_specs):
    for symbol in [target_symbol] + peer_symbols:
        panel.plot(sentiment_data.index,
                   sentiment_data[f'{symbol}_{metric}'],
                   label=symbol, alpha=0.7)
    panel.set_title(panel_title)
    panel.set_ylabel(y_label)
    panel.grid(True, alpha=0.3)
    panel.legend()

plt.tight_layout()
plt.show()

# Persist the sentiment frame, summarise it, and correlate each symbol's
# sentiment score with that symbol's RSI / MACD / Signal features.

# Save sentiment time series data (create the directory in case this cell is
# the first one to write there)
os.makedirs('../data/processed', exist_ok=True)
sentiment_data.to_pickle('../data/processed/sentiment_timeseries.pkl')
print("\nSentiment time series data saved to '../data/processed/sentiment_timeseries.pkl'")

# Display summary statistics
print("\nSentiment Statistics:")
print(sentiment_data.describe().round(3))

# Calculate rolling correlations with technical indicators
window = 7  # rolling window of 7 observations (7 days on daily data)
indicator_tags = ('RSI', 'MACD', 'Signal')

# Collect the series first and build the frame once, rather than growing a
# DataFrame column by column.
correlation_columns = {}
for symbol in [target_symbol] + peer_symbols:
    sentiment_series = sentiment_data[f'{symbol}_sentiment_score']

    # Feature columns are named '<symbol>_<feature>'. Matching on that prefix
    # (not on a substring anywhere in the name) keeps one ticker from picking
    # up another ticker's columns, and the indicator tag is looked for in the
    # feature part only.
    prefix = f'{symbol}_'
    indicator_cols = [
        col for col in all_features.columns
        if col.startswith(prefix)
        and any(tag in col[len(prefix):] for tag in indicator_tags)
    ]

    for col in indicator_cols:
        # `col` already starts with the ticker, so it is not prepended again
        correlation_columns[f'{col}_correlation'] = (
            sentiment_series.rolling(window).corr(all_features[col])
        )

rolling_correlations = pd.DataFrame(correlation_columns)

print("\nRolling Correlations Summary:")
if len(rolling_correlations.columns) == 0:
    # describe() raises on a frame without columns
    print("No matching technical indicator columns found")
else:
    print(rolling_correlations.describe().round(3))

In [None]:
# Preview the first 20 rows of the sentiment frame. `sentiment_df` must
# already exist in the kernel namespace when this cell runs.
sentiment_df.head(20)

## 2. Technical Indicators (Optional)

In [None]:
## 2. Technical Indicators

# List the RSI columns actually present so the lookup below can be checked by eye
print("Available columns:")
print([col for col in all_features.columns if 'RSI' in col])

# RSI for the target asset with the 70 / 30 reference levels
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(all_features.index, all_features[f'{target_symbol}_RSI'], label='RSI')
ax.axhline(y=70, color='r', linestyle='--', label='Overbought (70)')
ax.axhline(y=30, color='g', linestyle='--', label='Oversold (30)')
ax.set_title(f'RSI Over Time - {target_symbol}')
ax.set_ylabel('RSI')
ax.legend()
plt.show()

# MACD line, signal line and histogram for the target asset
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(all_features.index, all_features[f'{target_symbol}_MACD'], label='MACD')
ax.plot(all_features.index, all_features[f'{target_symbol}_Signal'], label='Signal')
ax.bar(all_features.index, all_features[f'{target_symbol}_Histogram'], label='Histogram')
ax.set_title(f'MACD Analysis - {target_symbol}')
ax.set_ylabel('Value')
ax.legend()
plt.show()

In [None]:
# Overlay every symbol's MACD line, signal line and histogram on one axes
fig, ax = plt.subplots(figsize=(15, 6))
x_values = all_features.index

for ticker in [target_symbol] + peer_symbols:
    macd_col = f'{ticker}_MACD'
    signal_col = f'{ticker}_Signal'
    hist_col = f'{ticker}_Histogram'
    # Each legend entry is the column name itself
    ax.plot(x_values, all_features[macd_col], label=macd_col)
    ax.plot(x_values, all_features[signal_col], label=signal_col)
    ax.bar(x_values, all_features[hist_col], label=hist_col, alpha=0.3)

ax.set_title('MACD Analysis Comparison')
# Legend sits outside the plotting area, anchored at the upper-left corner
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

## 3. Feature Correlation Analysis

In [None]:
# Pairwise correlations between all feature columns
correlation_matrix = all_features.corr()

# Oversized canvas so the annotated cells stay legible
fig, ax = plt.subplots(figsize=(20, 16))

# Diverging palette centred on zero; every cell annotated to two decimals
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    fmt='.2f',
    square=True,
    linewidths=0.5,
)
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)

ax.set_title('Feature Correlation Matrix Across All Assets')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Display summary of highly correlated features (|correlation| > 0.7).
# The matrix is symmetric, so only the strict upper triangle is scanned: each
# pair is reported once (not as both a<->b and b<->a) and the diagonal is
# left out. NaN correlations compare False and are skipped.
strong_mask = np.triu(np.abs(correlation_matrix.to_numpy()) > 0.7, k=1)
high_corr = [
    (correlation_matrix.index[x], correlation_matrix.columns[y], correlation_matrix.iloc[x, y])
    for x, y in zip(*np.where(strong_mask))
]

print("\nHighly correlated features (|correlation| > 0.7):")
# Strongest relationships first, regardless of sign
for feat1, feat2, corr in sorted(high_corr, key=lambda pair: abs(pair[2]), reverse=True):
    print(f"{feat1} <-> {feat2}: {corr:.3f}")