Importing Libraries and Defining Tickers

In [5]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import talib  
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# FAANG portfolio, plus SPY as the market proxy.
tickers = [
    'META',
    'AAPL',
    'AMZN',
    'NFLX',
    'GOOGL',
    'SPY',
]

In [6]:
class DataPreprocessor:
    def __init__(self, tickers):
        self.tickers = tickers
        self.data = None
        
    def run_pipeline(self, start_date='2015-01-01', end_date='2024-12-31'):
        self._download_data(start_date, end_date)
        self._clean_data()
        self._validate_data()
        return self.data
    
    def _download_data(self, start_date, end_date):
        all_data = []
        
        for ticker in self.tickers:
            try:
                # Download with adjusted close prices and make column names lower
                df = yf.download(ticker, start=start_date, end=end_date, progress=False, auto_adjust=True)
                
                # Fix: Handle MultiIndex columns and lowercase everything
                if isinstance(df.columns, pd.MultiIndex):
                    df.columns = [col[0].lower() for col in df.columns]
                else:
                    df.columns = [col.lower() for col in df.columns]
                
                # Fix: Ensure 'Date' index is named 'date' for consistency
                df.index.names = [name.lower() if name else 'date' for name in df.index.names]
                
                df = df[['open', 'high', 'low', 'close', 'volume']]# Keep only essential columns
                
                # Add ticker column and set multi-index, organize data
                df['ticker'] = ticker
                df = df.reset_index()
                df.set_index(['ticker', 'date'], inplace=True)
                
                all_data.append(df)
                
            except Exception as e:
                print("failure")
        
        # Combine all tickers
        if all_data:
            self.data = pd.concat(all_data, axis=0).sort_index()
        else:
            raise ValueError("No data downloaded!")
    
    def _clean_data(self):
        if self.data is None:
            raise ValueError("No data to clean!")
        
        # 1. Fill small gaps (forward fill then backward fill)
        self.data = self.data.groupby(level=0, group_keys=False).apply(lambda x: x.ffill().bfill())
        
        # 2. Drop any remaining NaN (usually at the beginning)
        self.data = self.data.dropna()
        
        # 3. Basic validation checks
        issues = []
        # No negative prices
        price_cols = ['open', 'high', 'low', 'close']
        if (self.data[price_cols] < 0).any().any():
            issues.append("Negative prices")
        # High >= Low
        if (self.data['high'] < self.data['low']).any().any():
            issues.append("High < Low")
        # Volume positive
        if (self.data['volume'] <= 0).any().any():
            issues.append("Zero/Negative volume")

