In [14]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import talib  
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
from scipy import stats
import warnings
# Mute every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and chained-assignment
# warnings raised by later cells are hidden as well.
warnings.filterwarnings('ignore')

# FAANG portfolio, plus SPY as the market proxy.
tickers = [
    'META',
    'AAPL',
    'AMZN',
    'NFLX',
    'GOOGL',
    'SPY',
]

In [15]:
class DataPreprocessor:
    """Download, clean and sanity-check daily OHLCV bars for a list of tickers.

    After ``run_pipeline`` the result is a single DataFrame indexed by
    ``(ticker, date)`` with the columns ``open, high, low, close, volume``.
    """

    # Columns kept from the raw download, in the order they are stored.
    _OHLCV_COLUMNS = ['open', 'high', 'low', 'close', 'volume']
    _PRICE_COLUMNS = ['open', 'high', 'low', 'close']

    def __init__(self, tickers):
        """Store the ticker symbols to download.

        Args:
            tickers: iterable of ticker symbols; a single symbol given as a
                plain string is treated as a one-element list (iterating the
                string would otherwise yield one "ticker" per character).
        """
        if isinstance(tickers, str):
            tickers = [tickers]
        self.tickers = list(tickers)
        self.data = None

    def run_pipeline(self, start_date='2015-01-01', end_date='2024-12-31'):
        """Run download -> clean -> validate and return the cleaned frame.

        Args:
            start_date: first date passed to ``yf.download`` as ``start``.
            end_date: date passed to ``yf.download`` as ``end``.
                NOTE(review): yfinance documents ``end`` as exclusive, so the
                last bar returned is the trading day before ``end_date`` -
                confirm this is the intended window.

        Returns:
            DataFrame indexed by ``(ticker, date)`` with OHLCV columns.

        Raises:
            ValueError: if a ticker returns no rows or there is nothing to
                clean/validate.
        """
        self._download_data(start_date, end_date)
        self._clean_data()
        self._validate_data()
        return self.data

    @staticmethod
    def _normalize_frame(raw, ticker):
        """Return one ticker's raw download as a ``(ticker, date)``-indexed OHLCV frame.

        The raw frame is copied first, so the caller's object is never modified
        and the later column assignment does not write into a slice.
        """
        df = raw.copy()

        # Flatten (field, ticker) column pairs to the field name; lower-case everything.
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = [col[0].lower() for col in df.columns]
        else:
            df.columns = [col.lower() for col in df.columns]

        return (
            df[DataPreprocessor._OHLCV_COLUMNS]  # keep only essential columns
            .assign(ticker=ticker)
            .rename_axis(index='date')           # index is always called 'date'
            .reset_index()
            .set_index(['ticker', 'date'])
        )

    def _download_data(self, start_date, end_date):
        """Download every ticker separately and stack the results into ``self.data``.

        Raises:
            ValueError: if a ticker comes back empty (otherwise it would
                silently vanish from the panel) or no tickers were given.
        """
        frames = []

        for ticker in self.tickers:
            # auto_adjust=True: prices come back already adjusted.
            raw = yf.download(ticker, start=start_date, end=end_date,
                              progress=False, auto_adjust=True)
            if raw is None or raw.empty:
                raise ValueError(
                    f"No price data returned for {ticker!r} "
                    f"between {start_date} and {end_date}"
                )
            frames.append(self._normalize_frame(raw, ticker))

        if not frames:
            raise ValueError("No tickers to download!")

        self.data = pd.concat(frames, axis=0).sort_index()

    def _clean_data(self):
        """Fill interior gaps, drop unfillable rows and print a quality verdict.

        Raises:
            ValueError: if no data has been downloaded yet.
        """
        if self.data is None:
            raise ValueError("No data to clean!")

        # 1. Forward-fill gaps within each ticker. Deliberately NOT followed by a
        #    back-fill: copying later prices into earlier rows would leak future
        #    information into the start of every series.
        filled = self.data.groupby(level=0).ffill()

        # 2. Drop whatever is still NaN (leading rows with no earlier observation).
        self.data = filled.dropna()

        # 3. Basic validation checks
        issues = []
        if (self.data[self._PRICE_COLUMNS] < 0).any().any():
            issues.append("Negative prices")
        if (self.data['high'] < self.data['low']).any():
            issues.append("High < Low")
        if (self.data['volume'] <= 0).any():
            issues.append("Zero/Negative volume")

        if issues:
            # Say which checks failed instead of a bare "issues found".
            print(f"issues found: {', '.join(issues)}")
        else:
            print("quality passed")

    def _validate_data(self):
        """Print whether all tickers share the same dates, plus sample statistics.

        Raises:
            ValueError: if there is no data (or no rows) to validate.
        """
        if self.data is None or self.data.empty:
            raise ValueError("No data to validate!")

        # Check date alignment on the actual dates, not just on how many there are:
        # two tickers can have equally many rows on different days.
        present = self.data.index.get_level_values(0).unique()
        dates_by_ticker = {
            ticker: self.data.xs(ticker, level=0).index.sort_values()
            for ticker in present
        }
        reference = dates_by_ticker[present[0]]

        if all(dates.equals(reference) for dates in dates_by_ticker.values()):
            print(f"All tickers have {len(reference)} trading days")
        else:
            print("tickers have different dates")

        # Quick statistical check on the first ticker in the index.
        sample_ticker = present[0]
        print(f"\n📊 Sample statistics ({sample_ticker} close prices):")
        sample_close = self.data.xs(sample_ticker, level=0)['close']
        print(f"    • Mean: ${sample_close.mean():.2f}")
        print(f"    • Max:  ${sample_close.max():.2f}")

# Build the preprocessor, then download, clean and validate the whole price panel.
# NOTE(review): the captured output ends on 2024-12-30 although end_date is
# 2024-12-31 - presumably yfinance treats `end` as exclusive; confirm.
preprocessor = DataPreprocessor(tickers)
data = preprocessor.run_pipeline(start_date='2015-01-01', end_date='2024-12-31')

# Quick look at the result: size, ticker coverage and date span.
print(data.shape)
print(data.index.unique(level='ticker').tolist())
print(data.index.unique(level='date').min(), "to", data.index.unique(level='date').max())
data.head(10)

quality passed
All tickers have 2515 trading days

📊 Sample statistics (AAPL close prices):
    • Mean: $93.80
    • Max:  $257.85
(15090, 5)
['AAPL', 'AMZN', 'GOOGL', 'META', 'NFLX', 'SPY']
2015-01-02 00:00:00 to 2024-12-30 00:00:00


Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,volume
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,2015-01-02,24.694237,24.705322,23.798602,24.237553,212818400
AAPL,2015-01-05,24.006988,24.086797,23.368517,23.554737,257142000
AAPL,2015-01-06,23.619031,23.816336,23.195599,23.556957,263188400
AAPL,2015-01-07,23.765343,23.987034,23.654497,23.887274,160423600
AAPL,2015-01-08,24.215387,24.862726,24.097889,24.805086,237458000
AAPL,2015-01-09,24.977994,25.106576,24.432633,24.831678,214798000
AAPL,2015-01-12,24.962483,24.969133,24.120056,24.219816,198603200
AAPL,2015-01-13,24.703103,25.006821,24.144441,24.434856,268367600
AAPL,2015-01-14,24.173257,24.49471,24.053544,24.341743,195826400
AAPL,2015-01-15,24.386088,24.399389,23.645639,23.681108,240056000


In [17]:
class FeatureEngineer:
    """Build model features and the prediction target from an OHLCV panel.

    The input frame must be indexed by ``(ticker, date)`` and contain the columns
    ``open, high, low, close, volume``. Every indicator is computed separately per
    ticker. ``finished_features`` lists the feature column names produced so far
    (the target column is not included).

    Steps depend on columns created by earlier steps, so call them in the order
    used by ``run_pipeline``.
    """

    def __init__(self, data):
        # Work on a private copy so the caller's frame is never modified.
        self.data = data.copy()
        self.finished_features = []

    def run_pipeline(self):
        """Run every feature step in dependency order and return the enriched frame."""
        self.core_price_features()
        self.math_rule_features()
        self.momentum_features()
        self.volatility_features()
        self.volume_features()
        self.derived_features()
        self.time_features()
        self.lagged_features()
        self.target_variable()
        return self.data

    # ------------------------------------------------------------------ helpers

    def _apply_by_ticker(self, func):
        """Apply ``func`` to each ticker's rows and stitch the results back together.

        Each group is copied before ``func`` sees it, so the callbacks can add
        columns in place without writing into a view of ``self.data``.
        """
        return self.data.groupby(level='ticker', group_keys=False).apply(
            lambda group: func(group.copy())
        )

    def _register_features(self, names):
        """Record feature names once, so re-running a step adds no duplicates."""
        for name in names:
            if name not in self.finished_features:
                self.finished_features.append(name)

    def _require_columns(self, columns, step):
        """Raise a descriptive KeyError when a prerequisite column is missing."""
        missing = [col for col in columns if col not in self.data.columns]
        if missing:
            raise KeyError(
                f"{step} needs columns {missing}; run the earlier feature steps first"
            )

    @staticmethod
    def _parkinson_vol(high, low, window=20):
        """Parkinson high-low volatility over a rolling window.

        Computes ``sqrt(mean(ln(high/low)^2) / (4 ln 2))``: the squared log range
        is averaged first and the square root taken afterwards. Averaging
        per-day square roots instead biases the estimate downwards.
        """
        squared_log_range = np.log(high / low) ** 2
        return np.sqrt(squared_log_range.rolling(window).mean() / (4 * np.log(2)))

    # --------------------------------------------- core price features (both models)

    def core_price_features(self):
        """8 return, volatility and moving-average-ratio features."""
        def calc(df):
            close, high, low, open_ = df['close'], df['high'], df['low'], df['open']
            prev_close = close.shift(1)

            # Returns
            df['log_return'] = np.log(close / prev_close)
            df['overnight_return'] = np.log(open_ / prev_close)
            df['intraday_return'] = np.log(close / open_)

            # Volatility (20-day std of log returns, scaled by sqrt(252))
            df['volatility_20d'] = df['log_return'].rolling(20).std() * np.sqrt(252)
            df['atr_14'] = talib.ATR(high, low, close, timeperiod=14)

            # SMAs and ratios
            sma10 = talib.SMA(close, timeperiod=10)
            sma20 = talib.SMA(close, timeperiod=20)
            sma50 = talib.SMA(close, timeperiod=50)

            df['price_sma20_ratio'] = close / sma20
            df['price_sma50_ratio'] = close / sma50
            df['sma10_sma20_ratio'] = sma10 / sma20
            return df

        self.data = self._apply_by_ticker(calc)
        self._register_features([
            'log_return', 'overnight_return', 'intraday_return',
            'volatility_20d', 'atr_14', 'price_sma20_ratio',
            'price_sma50_ratio', 'sma10_sma20_ratio',
        ])

    # ------------------------------------------------- rule-based binary features

    def math_rule_features(self):
        """15 binary features for rule-based model.

        NOTE: comparisons against NaN evaluate to False, so every flag is 0 (not
        missing) while its indicator is still warming up - e.g. ``golden_cross``
        is 0 until the 200-day SMA exists.
        """
        self._require_columns(['log_return'], 'math_rule_features')

        def calc(df):
            close, high, low = df['close'], df['high'], df['low']
            # TA-Lib works on doubles; cast the integer share counts explicitly.
            volume = df['volume'].astype(float)

            # SMAs
            sma10 = talib.SMA(close, timeperiod=10)
            sma20 = talib.SMA(close, timeperiod=20)
            sma50 = talib.SMA(close, timeperiod=50)
            sma200 = talib.SMA(close, timeperiod=200)

            # Trends
            df['golden_cross'] = (sma50 > sma200).astype(int)
            df['short_uptrend'] = (sma10 > sma20).astype(int)
            df['price_above_sma20'] = (close > sma20).astype(int)
            df['price_above_sma50'] = (close > sma50).astype(int)

            # Momentum
            rsi = talib.RSI(close, timeperiod=14)
            macd, signal, _ = talib.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9)
            stoch_k, _ = talib.STOCH(high, low, close, fastk_period=14, slowk_period=3, slowd_period=3)
            roc = talib.ROC(close, timeperiod=10)

            df['rsi_oversold'] = (rsi < 30).astype(int)
            df['rsi_overbought'] = (rsi > 70).astype(int)
            df['macd_bullish'] = (macd > signal).astype(int)
            df['roc_positive'] = (roc > 0).astype(int)
            df['stoch_oversold'] = (stoch_k < 20).astype(int)

            # Volatility/Reversion
            upper, _, lower = talib.BBANDS(close, timeperiod=20, nbdevup=2, nbdevdn=2)
            bb_pos = (close - lower) / (upper - lower)
            vol_20d = df['log_return'].rolling(20).std() * np.sqrt(252)
            # Expanding quantile: the threshold only uses data up to each row.
            vol_75pct = vol_20d.expanding().quantile(0.75)

            df['bb_oversold'] = (bb_pos < 0.2).astype(int)
            df['bb_overbought'] = (bb_pos > 0.8).astype(int)
            df['high_volatility'] = (vol_20d > vol_75pct).astype(int)

            # Volume
            vol_sma20 = talib.SMA(volume, timeperiod=20)
            vol_ratio = volume / vol_sma20
            price_up = close > close.shift(1)
            price_down = close < close.shift(1)

            df['volume_spike'] = (vol_ratio > 1.5).astype(int)
            df['volume_confirmation'] = (price_up & (vol_ratio > 1)).astype(int)
            df['volume_divergence'] = (price_down & (vol_ratio > 1)).astype(int)
            return df

        self.data = self._apply_by_ticker(calc)
        self._register_features([
            'golden_cross', 'short_uptrend', 'price_above_sma20', 'price_above_sma50',
            'rsi_oversold', 'rsi_overbought', 'macd_bullish', 'roc_positive', 'stoch_oversold',
            'bb_oversold', 'bb_overbought', 'high_volatility',
            'volume_spike', 'volume_confirmation', 'volume_divergence',
        ])

    # ----------------------------------------------------------- XGBoost features

    def momentum_features(self):
        """5 momentum oscillators"""
        def calc(df):
            close, high, low = df['close'], df['high'], df['low']

            df['rsi_14'] = talib.RSI(close, timeperiod=14)
            _, _, hist = talib.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9)
            df['macd_histogram'] = hist
            stoch_k, _ = talib.STOCH(high, low, close, fastk_period=14, slowk_period=3, slowd_period=3)
            df['stoch_k'] = stoch_k
            df['williams_r'] = talib.WILLR(high, low, close, timeperiod=14)
            df['roc_10'] = talib.ROC(close, timeperiod=10)
            return df

        self.data = self._apply_by_ticker(calc)
        self._register_features(['rsi_14', 'macd_histogram', 'stoch_k', 'williams_r', 'roc_10'])

    def volatility_features(self):
        """4 volatility & range features"""
        self._require_columns(['log_return'], 'volatility_features')

        def calc(df):
            close, high, low = df['close'], df['high'], df['low']

            # Bollinger Bands
            upper, middle, lower = talib.BBANDS(close, timeperiod=20, nbdevup=2, nbdevdn=2)
            df['bb_position'] = (close - lower) / (upper - lower)
            df['bb_width'] = (upper - lower) / middle

            # Parkinson volatility (high-low based), 20-day window
            df['parkinson_vol'] = self._parkinson_vol(high, low, window=20)

            # Short-term volatility
            df['returns_std_5d'] = df['log_return'].rolling(5).std()
            return df

        self.data = self._apply_by_ticker(calc)
        self._register_features(['bb_position', 'bb_width', 'parkinson_vol', 'returns_std_5d'])

    def volume_features(self):
        """3 volume indicators"""
        def calc(df):
            close = df['close']
            # TA-Lib works on doubles; cast the integer share counts explicitly.
            volume = df['volume'].astype(float)

            vol_sma20 = talib.SMA(volume, timeperiod=20)
            df['volume_ratio'] = volume / vol_sma20
            df['obv'] = talib.OBV(close, volume)
            df['volume_zscore'] = (volume - volume.rolling(20).mean()) / volume.rolling(20).std()
            return df

        self.data = self._apply_by_ticker(calc)
        self._register_features(['volume_ratio', 'obv', 'volume_zscore'])

    # -------------------------------------------------- derived and time features

    def derived_features(self):
        """6 derived interaction features"""
        self._require_columns(
            ['rsi_14', 'macd_histogram', 'volume_ratio', 'atr_14', 'log_return'],
            'derived_features',
        )

        def calc(df):
            df['rsi_slope_3d'] = df['rsi_14'].diff(3)
            df['macd_slope_3d'] = df['macd_histogram'].diff(3)
            df['rsi_volume_interaction'] = df['rsi_14'] * df['volume_ratio']
            df['momentum_vol_interaction'] = df['macd_histogram'] * df['atr_14']
            df['returns_skew_20d'] = df['log_return'].rolling(20).skew()
            df['returns_kurt_20d'] = df['log_return'].rolling(20).kurt()
            return df

        self.data = self._apply_by_ticker(calc)
        self._register_features([
            'rsi_slope_3d', 'macd_slope_3d', 'rsi_volume_interaction',
            'momentum_vol_interaction', 'returns_skew_20d', 'returns_kurt_20d',
        ])

    def time_features(self):
        """3 time-based features

        NOTE(review): ``is_month_end`` marks the last *calendar* day of a month,
        so a month whose final day is not in the data never gets flagged on its
        last trading day - confirm this is the intended definition.
        """
        dates = self.data.index.get_level_values('date')
        self.data['day_of_week'] = dates.dayofweek
        self.data['month'] = dates.month
        self.data['is_month_end'] = dates.is_month_end.astype(int)
        self._register_features(['day_of_week', 'month', 'is_month_end'])

    def lagged_features(self):
        """6 lagged features (1-day lag to avoid look-ahead bias)"""
        lag_cols = ['log_return', 'rsi_14', 'volume_ratio', 'macd_histogram', 'bb_position', 'atr_14']
        self._require_columns(lag_cols, 'lagged_features')

        def calc(df):
            for col in lag_cols:
                df[f'{col}_lag1'] = df[col].shift(1)
            return df

        self.data = self._apply_by_ticker(calc)
        self._register_features([f'{col}_lag1' for col in lag_cols])

    def target_variable(self):
        """Target: next day's log return of the same ticker.

        The last row of every ticker has no next day, so its target is NaN. The
        target column is intentionally not added to ``finished_features``.
        """
        self._require_columns(['log_return'], 'target_variable')

        def calc(df):
            df['target'] = df['log_return'].shift(-1)  # Predict tomorrow's return
            return df

        self.data = self._apply_by_ticker(calc)

# Derive every feature column and the target from the cleaned price panel.
fe = FeatureEngineer(data)
features_data = fe.run_pipeline()

# Sanity check: overall size, then the first three rows.
print(f"Final shape: {features_data.shape}")
features_data.head(n=3)

Final shape: (15090, 56)


Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,volume,log_return,overnight_return,intraday_return,volatility_20d,atr_14,...,day_of_week,month,is_month_end,log_return_lag1,rsi_14_lag1,volume_ratio_lag1,macd_histogram_lag1,bb_position_lag1,atr_14_lag1,target
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AAPL,2015-01-02,24.694237,24.705322,23.798602,24.237553,212818400,,,-0.018667,,,...,4,1,0,,,,,,,-0.028576
AAPL,2015-01-05,24.006988,24.086797,23.368517,23.554737,257142000,-0.028576,-0.009558,-0.019018,,,...,0,1,0,,,,,,,9.4e-05
AAPL,2015-01-06,23.619031,23.816336,23.195599,23.556957,263188400,9.4e-05,0.002726,-0.002632,,,...,1,1,0,-0.028576,,,,,,0.013925
