Importing Libraries and defining Tickers

In [9]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import talib  
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# FAANG portfolio, with SPY included as the market proxy
tickers = ['META', 'AAPL', 'AMZN', 'NFLX', 'GOOGL', 'SPY']

In [10]:
class DataPreprocessor:
    """Download, clean and sanity-check daily OHLCV data for a list of tickers.

    After ``run_pipeline`` the combined frame is available on ``self.data``,
    indexed by ``(ticker, date)`` with the columns ``open``, ``high``,
    ``low``, ``close`` and ``volume``.
    """

    # Columns kept from every download, in output order.
    _OHLCV_COLUMNS = ['open', 'high', 'low', 'close', 'volume']
    # Subset of the above that must never be negative.
    _PRICE_COLUMNS = ['open', 'high', 'low', 'close']

    def __init__(self, tickers):
        """Store the ticker symbols to process; no data is fetched here."""
        self.tickers = tickers
        self.data = None  # filled in by _download_data()

    def run_pipeline(self, start_date='2015-01-01', end_date='2024-12-31'):
        """Run download -> clean -> validate and return the resulting frame.

        Args:
            start_date: first date passed to the downloader.
            end_date: end date passed to the downloader.

        Returns:
            The cleaned DataFrame (also stored on ``self.data``).

        Raises:
            ValueError: if no ticker yields any usable data.
        """
        self._download_data(start_date, end_date)
        self._clean_data()
        self._validate_data()
        return self.data

    def _download_data(self, start_date, end_date):
        """Fetch every ticker with yfinance and stack the results in ``self.data``.

        Tickers for which nothing is returned are reported and skipped.

        Raises:
            ValueError: if no ticker returned any rows.
        """
        frames = []

        for ticker in self.tickers:
            # Download with adjusted prices (auto_adjust=True)
            df = yf.download(ticker, start=start_date, end=end_date, progress=False, auto_adjust=True)

            # A failed or empty download would otherwise surface later as an
            # obscure KeyError / IndexError; report it here instead.
            if df is None or df.empty:
                print(f"no data returned for {ticker}, skipping")
                continue

            # Normalise column names to lower case; when the columns are a
            # MultiIndex only the first level is kept.
            if isinstance(df.columns, pd.MultiIndex):
                df.columns = [col[0].lower() for col in df.columns]
            else:
                df.columns = [col.lower() for col in df.columns]

            # Lower-case the index name, falling back to 'date' when unnamed.
            df.index.names = [name.lower() if name else 'date' for name in df.index.names]

            # Keep only the essential columns, tag the rows with their ticker
            # and index by (ticker, date). `assign` returns a new frame, so no
            # column is written onto a slice of the downloaded data.
            frames.append(
                df[self._OHLCV_COLUMNS]
                .assign(ticker=ticker)
                .reset_index()
                .set_index(['ticker', 'date'])
            )

        if not frames:
            raise ValueError("No data downloaded for any ticker!")

        self.data = pd.concat(frames, axis=0).sort_index()

    def _clean_data(self):
        """Fill gaps per ticker, drop unusable rows and report quality issues.

        Prints either ``quality passed`` or ``issues found: ...`` followed by
        the names of the failed checks.

        Raises:
            ValueError: if there is no data to clean.
        """
        if self.data is None:
            raise ValueError("No data to clean!")

        # 1. Fill gaps within each ticker (forward fill, then backward fill).
        # NOTE(review): the backward fill copies later observations into
        # earlier rows; if this frame feeds a predictive model that is a
        # look-ahead risk - confirm it is intended.
        self.data = self.data.groupby(level=0, group_keys=False).apply(lambda x: x.ffill().bfill())

        # 2. Drop rows that still contain NaN after both fills.
        self.data = self.data.dropna()

        # 3. Basic validation checks; each failed check is recorded by name.
        issues = []
        if (self.data[self._PRICE_COLUMNS] < 0).any().any():
            issues.append("Negative prices")
        if (self.data['high'] < self.data['low']).any():
            issues.append("High < Low")
        if (self.data['volume'] <= 0).any():
            issues.append("Zero/Negative volume")

        if issues:
            # Name the failed checks instead of discarding them.
            print(f"issues found: {', '.join(issues)}")
        else:
            print("quality passed")

    def _validate_data(self):
        """Check that all tickers share the same dates and print sample stats.

        Raises:
            ValueError: if there is no data to validate.
        """
        if self.data is None or self.data.empty:
            raise ValueError("No data to validate!")

        tickers_present = self.data.index.get_level_values(0).unique()

        # Check date alignment on the dates themselves: equal row counts alone
        # do not prove that two tickers cover the same trading days.
        dates_by_ticker = {
            ticker: self.data.xs(ticker, level=0).index.sort_values()
            for ticker in tickers_present
        }
        reference_dates = dates_by_ticker[tickers_present[0]]
        aligned = all(dates.equals(reference_dates) for dates in dates_by_ticker.values())

        if aligned:
            print(f"All tickers have {len(reference_dates)} trading days")
        else:
            print("tickers have different dates")

        # Quick statistical check on the first ticker in the index
        sample_ticker = tickers_present[0]
        sample_close = self.data.xs(sample_ticker, level=0)['close']
        print(f"\n📊 Sample statistics ({sample_ticker} close prices):")
        print(f"    • Mean: ${sample_close.mean():.2f}")
        print(f"    • Max:  ${sample_close.max():.2f}")
# Build the preprocessor for the configured tickers and execute
# download -> clean -> validate in one go
preprocessor = DataPreprocessor(tickers)
data = preprocessor.run_pipeline(start_date='2015-01-01', end_date='2024-12-31')

# Summarise the result: overall shape, tickers present and covered date span
ticker_level = data.index.get_level_values('ticker')
date_level = data.index.get_level_values('date')
print(data.shape)
print(ticker_level.unique().tolist())
print(date_level.min(), "to", date_level.max())
data.head(10)

quality passed
All tickers have 2515 trading days

📊 Sample statistics (AAPL close prices):
    • Mean: $93.80
    • Max:  $257.85
(15090, 5)
['AAPL', 'AMZN', 'GOOGL', 'META', 'NFLX', 'SPY']
2015-01-02 00:00:00 to 2024-12-30 00:00:00


Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,volume
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,2015-01-02,24.694239,24.705324,23.798604,24.237555,212818400
AAPL,2015-01-05,24.006994,24.086803,23.368523,23.554743,257142000
AAPL,2015-01-06,23.619038,23.816344,23.195606,23.556965,263188400
AAPL,2015-01-07,23.765352,23.987044,23.654506,23.887283,160423600
AAPL,2015-01-08,24.215391,24.86273,24.097893,24.80509,237458000
AAPL,2015-01-09,24.978,25.106581,24.432638,24.831684,214798000
AAPL,2015-01-12,24.962475,24.969126,24.120048,24.219809,198603200
AAPL,2015-01-13,24.703107,25.006825,24.144445,24.43486,268367600
AAPL,2015-01-14,24.173261,24.494713,24.053547,24.341747,195826400
AAPL,2015-01-15,24.386084,24.399385,23.645635,23.681105,240056000
