## Imports

In [99]:
import numpy as np
import pandas as pd
import yfinance as yf
import os
import matplotlib.pyplot as plt
import seaborn as sns
import time

from scipy.stats import spearmanr
from talib import RSI, BBANDS, MACD, ATR

In [100]:
# Symbols to download. Dots are replaced with hyphens for yfinance compatibility.
tickers = [
    symbol.replace('.', '-')
    for symbol in ['AAPL', 'MSFT', 'NVDA', 'AMZN', 'GOOGL', 'META']
]
print(tickers)

['AAPL', 'MSFT', 'NVDA', 'AMZN', 'GOOGL', 'META']


In [101]:
# Download window handed to get_data / yf.download, as 'YYYY-MM-DD' strings.
# NOTE(review): yfinance presumably treats the end date as exclusive — confirm.
start_date = '2015-01-01'
end_date = '2025-01-01'

The following function downloads the price history for a single ticker.

In [102]:
def get_data(ticker, start_date, end_date):
    """
    Download the price history of one ticker with yfinance.

    Parameters
    ----------
    ticker : str
        Symbol to download.
    start_date, end_date : str
        Bounds of the download window, forwarded to ``yf.download``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``yf.download`` with an extra ``Ticker`` column
        holding ``ticker`` on every row.
    """
    # Pass the window by keyword so the call does not depend on parameter order,
    # and pin auto_adjust explicitly: its default changed between yfinance
    # releases (which presumably triggers the warning shown in the cell output).
    # True matches the adjusted columns (no 'Adj Close') seen in the saved CSVs.
    data = yf.download(ticker, start=start_date, end=end_date, auto_adjust=True)
    data["Ticker"] = ticker
    return data

In [103]:
os.makedirs("data", exist_ok=True)

print(f"Processing data for {len(tickers)} stocks...")
for ticker in tickers:
    try:
        # Download the price history for this stock
        stock_data = get_data(ticker, start_date, end_date)

        # A failed download can come back as an empty frame rather than an
        # exception; skip it so no header-only CSV ends up in data/.
        if stock_data.empty:
            print(f"No data returned for {ticker}; skipping.")
            continue

        # Reset index to make Date a column
        stock_data = stock_data.reset_index()

        print(f"Processed {ticker} data with {len(stock_data)} rows.")

        # Save individual stock data
        stock_data.to_csv(f"data/{ticker}.csv", index=False)

        # Add a small delay to avoid hitting API limits
        time.sleep(2)

    except Exception as e:
        # Best effort: report the failure and carry on with the next ticker.
        print(f"Error processing {ticker}: {str(e)}")

  data = yf.download(ticker, start_date, end_date)
[*********************100%***********************]  1 of 1 completed

Processing data for 6 stocks...
Processed AAPL data with 2516 rows.



  data = yf.download(ticker, start_date, end_date)
[*********************100%***********************]  1 of 1 completed


Processed MSFT data with 2516 rows.


  data = yf.download(ticker, start_date, end_date)
[*********************100%***********************]  1 of 1 completed


Processed NVDA data with 2516 rows.


  data = yf.download(ticker, start_date, end_date)
[*********************100%***********************]  1 of 1 completed


Processed AMZN data with 2516 rows.


  data = yf.download(ticker, start_date, end_date)
[*********************100%***********************]  1 of 1 completed


Processed GOOGL data with 2516 rows.


  data = yf.download(ticker, start_date, end_date)
[*********************100%***********************]  1 of 1 completed


Processed META data with 2516 rows.


Let us look at what the dataframe for a single stock looks like.

In [104]:
# Keep only the per-ticker CSV files: anything else that ends up in data/
# (e.g. a .ipynb_checkpoints folder) would break the read_csv loop below.
# os.listdir returns entries in arbitrary order.
files = [name for name in os.listdir('data') if name.endswith('.csv')]
files

['MSFT.csv', 'AMZN.csv', 'GOOGL.csv', 'NVDA.csv', 'META.csv', 'AAPL.csv']

In [105]:
# Pick one saved CSV to inspect before processing all of them.
example_file = 'MSFT.csv'
example_file_path = os.path.join('data', example_file)

In [106]:
# Read the example CSV back with default parsing and preview its first rows.
msft_data = pd.read_csv(example_file_path)
msft_data.head()

Unnamed: 0,Date,Close,High,Low,Open,Volume,Ticker
0,,MSFT,MSFT,MSFT,MSFT,MSFT,
1,2015-01-02,39.99870681762695,40.56327357006327,39.81052007555399,39.913167686058216,27913900,MSFT
2,2015-01-05,39.630882263183594,39.973042020810816,39.56244835379311,39.665095954770436,39673900,MSFT
3,2015-01-06,39.0492057800293,39.9901491609254,38.95511078931803,39.67365049834883,36447900,MSFT
4,2015-01-07,39.54533767700195,39.74208026089452,38.91234036588819,39.33148661671331,29114100,MSFT


As we can see, there is a redundant row right after the header (the row with index 0), which only repeats the ticker symbol, and each file carries its own row index. We want to drop that row and concatenate the dataframes under a single, fresh index.

In [107]:
# Load every per-ticker CSV, discard the row labelled 0, and stack the
# results into one frame with a fresh 0..n-1 index.
dataframes = [
    pd.read_csv(os.path.join('data', file)).drop(index=0)
    for file in files
]
if dataframes:
    data = pd.concat(dataframes, ignore_index=True)

In [108]:
# Display the concatenated frame (pandas truncates the middle rows).
data

Unnamed: 0,Date,Close,High,Low,Open,Volume,Ticker
0,2015-01-02,39.99870681762695,40.56327357006327,39.81052007555399,39.913167686058216,27913900,MSFT
1,2015-01-05,39.630882263183594,39.973042020810816,39.56244835379311,39.665095954770436,39673900,MSFT
2,2015-01-06,39.0492057800293,39.9901491609254,38.95511078931803,39.67365049834883,36447900,MSFT
3,2015-01-07,39.54533767700195,39.74208026089452,38.91234036588819,39.33148661671331,29114100,MSFT
4,2015-01-08,40.70869445800781,40.845559027891056,39.964493563211974,39.9901546503436,29645200,MSFT
...,...,...,...,...,...,...,...
15091,2024-12-24,257.57867431640625,257.58862955018265,254.67565812568793,254.87518901917565,23234700,AAPL
15092,2024-12-26,258.39666748046875,259.47408555117545,257.0100282923795,257.56867823862046,27237100,AAPL
15093,2024-12-27,254.9749298095703,258.0774615569102,252.4510193654514,257.20952960207876,42355300,AAPL
15094,2024-12-30,251.5930938720703,252.8899685500744,250.14658624824915,251.62302046123375,35557500,AAPL


In [109]:
# Check the column dtypes as loaded from the CSV files.
data.dtypes

Date      object
Close     object
High      object
Low       object
Open      object
Volume    object
Ticker    object
dtype: object

The columns should be converted to appropriate data types.

In [110]:
# Parse the dates, then coerce the price/volume columns to numbers
# (values that cannot be parsed become NaN).
data['Date'] = pd.to_datetime(data['Date'])
numeric_columns = ['Close', 'High', 'Low', 'Open', 'Volume']
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors='coerce')

In [111]:
# Confirm the dtypes after the datetime / numeric conversion.
data.dtypes

Date      datetime64[ns]
Close            float64
High             float64
Low              float64
Open             float64
Volume             int64
Ticker            object
dtype: object

In [112]:
# Lower-case every column name so later cells can use 'close', 'volume', etc.
data.columns = [name.lower() for name in data.columns]

#### Compute Rolling Average Dollar Volume

In [113]:
# Dollar volume: row-wise product of closing price and share volume.
data['dollar_vol'] = data.loc[:, ['close', 'volume']].prod(axis='columns')

In [114]:
# Display the frame again, now with the dollar_vol column.
data

Unnamed: 0,date,close,high,low,open,volume,ticker,dollar_vol
0,2015-01-02,39.998707,40.563274,39.810520,39.913168,27913900,MSFT,1.116520e+09
1,2015-01-05,39.630882,39.973042,39.562448,39.665096,39673900,MSFT,1.572312e+09
2,2015-01-06,39.049206,39.990149,38.955111,39.673650,36447900,MSFT,1.423262e+09
3,2015-01-07,39.545338,39.742080,38.912340,39.331487,29114100,MSFT,1.151327e+09
4,2015-01-08,40.708694,40.845559,39.964494,39.990155,29645200,MSFT,1.206817e+09
...,...,...,...,...,...,...,...,...
15091,2024-12-24,257.578674,257.588630,254.675658,254.875189,23234700,AAPL,5.984763e+09
15092,2024-12-26,258.396667,259.474086,257.010028,257.568678,27237100,AAPL,7.037976e+09
15093,2024-12-27,254.974930,258.077462,252.451019,257.209530,42355300,AAPL,1.079954e+10
15094,2024-12-30,251.593094,252.889969,250.146586,251.623020,35557500,AAPL,8.946021e+09


In [115]:
# 21-row rolling mean of dollar volume, computed separately for each ticker.
# The grouped result is ordered by ticker and indexed by (ticker, row label),
# while `data` keeps the order in which the files were read. Copying `.values`
# by position would therefore hand one ticker's averages to another ticker.
# Dropping the ticker level lets the assignment align on the original row
# labels instead.
data['dollar_vol_1m'] = (data.groupby('ticker')['dollar_vol']
                           .rolling(window=21)
                           .mean()
                           .reset_index(level=0, drop=True))

In [116]:
# Non-null counts: a 21-row rolling mean has no value for the first 20 rows of each ticker.
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15096 entries, 0 to 15095
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           15096 non-null  datetime64[ns]
 1   close          15096 non-null  float64       
 2   high           15096 non-null  float64       
 3   low            15096 non-null  float64       
 4   open           15096 non-null  float64       
 5   volume         15096 non-null  int64         
 6   ticker         15096 non-null  object        
 7   dollar_vol     15096 non-null  float64       
 8   dollar_vol_1m  14976 non-null  float64       
dtypes: datetime64[ns](1), float64(6), int64(1), object(1)
memory usage: 1.0+ MB


In [117]:
# Rank the tickers within each date by rolling dollar volume (rank 1 = largest).
data['dollar_vol_rank'] = data.groupby('date')['dollar_vol_1m'].rank(ascending=False)

In [118]:
# Non-null counts again, now including dollar_vol_rank.
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15096 entries, 0 to 15095
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   date             15096 non-null  datetime64[ns]
 1   close            15096 non-null  float64       
 2   high             15096 non-null  float64       
 3   low              15096 non-null  float64       
 4   open             15096 non-null  float64       
 5   volume           15096 non-null  int64         
 6   ticker           15096 non-null  object        
 7   dollar_vol       15096 non-null  float64       
 8   dollar_vol_1m    14976 non-null  float64       
 9   dollar_vol_rank  14976 non-null  float64       
dtypes: datetime64[ns](1), float64(7), int64(1), object(1)
memory usage: 1.2+ MB


### Add some Basic Factors

### Compute the Relative Strength Index

In [119]:
# RSI of the closing price, per ticker, with TA-Lib's default 14-period lookback.
data['rsi'] = (data.groupby('ticker')['close']
                   .transform(lambda close: RSI(close, timeperiod=14)))

### Compute Bollinger Bands

In [120]:
def compute_bb(close_prices):
    """
    Bollinger Bands (20-period) for one ticker's close prices.

    Returns a DataFrame indexed like `close_prices` with the upper band in
    'bb_high' and the lower band in 'bb_low'; the middle band is discarded.
    """
    upper, _, lower = BBANDS(close_prices, timeperiod=20)
    bands = {'bb_high': upper, 'bb_low': lower}
    return pd.DataFrame(bands, index=close_prices.index)

In [121]:
# Compute the bands per ticker, drop the ticker level that groupby adds to the
# index so the rows line up with `data` again, then attach the two columns.
bb_results = (data.groupby('ticker')['close']
                  .apply(compute_bb)
                  .reset_index(level=0, drop=True))
data = data.join(bb_results)

In [122]:
# Replace the raw bands with log1p of the relative gap between close and band:
# (upper - close) / upper for bb_high, (close - lower) / close for bb_low.
data['bb_high'] = np.log1p((data['bb_high'] - data['close']) / data['bb_high'])
data['bb_low'] = np.log1p((data['close'] - data['bb_low']) / data['close'])

### Compute Average True Range

In [123]:
def compute_atr(stock_data):
    """
    Compute normalized ATR for a single ticker.

    The 14-period Average True Range is computed from the 'high', 'low' and
    'close' columns and then standardized: its mean is subtracted and the
    result is divided by its standard deviation.

    Parameters
    ----------
    stock_data : pandas.DataFrame
        One ticker's rows with 'high', 'low' and 'close' columns.

    Returns
    -------
    pandas.Series
        The standardized ATR, or an all-NaN Series on `stock_data`'s index if
        the computation fails (a RuntimeWarning is emitted in that case).
    """
    import warnings

    try:
        atr = ATR(stock_data['high'], stock_data['low'],
                  stock_data['close'], timeperiod=14)
        return atr.sub(atr.mean()).div(atr.std())
    except Exception as exc:
        # Stay best-effort so one bad ticker does not abort the whole groupby,
        # but do not swallow KeyboardInterrupt/SystemExit and do not fail
        # silently: an all-NaN feature column is easy to miss downstream.
        warnings.warn(f"ATR computation failed, returning NaN: {exc!r}",
                      RuntimeWarning, stacklevel=2)
        return pd.Series(np.nan, index=stock_data.index)


In [124]:
# Normalized ATR per ticker. group_keys=False keeps the original row labels so
# the result aligns with `data`; only the columns compute_atr reads are passed.
data['atr'] = (data.groupby('ticker', group_keys=False)[['high', 'low', 'close']]
                   .apply(compute_atr))

### Compute Moving Average Convergence/Divergence

In [125]:
def compute_macd(close_prices):
    """
    Compute normalized MACD for a single ticker.

    Only the first output of ``MACD`` (the MACD line) is used; it is
    standardized by subtracting ``np.mean`` and dividing by ``np.std``.

    Parameters
    ----------
    close_prices : pandas.Series
        One ticker's close prices.

    Returns
    -------
    pandas.Series
        The standardized MACD line, or an all-NaN Series on `close_prices`'
        index if the computation fails (a RuntimeWarning is emitted then).
    """
    import warnings

    try:
        macd = MACD(close_prices)[0]  # Get only the MACD line
        # NOTE(review): np.std uses ddof=0 here whereas compute_atr relies on
        # Series.std (ddof=1) — confirm the difference is intended.
        return (macd - np.mean(macd)) / np.std(macd)
    except Exception as exc:
        # Stay best-effort so one bad ticker does not abort the whole groupby,
        # but do not swallow KeyboardInterrupt/SystemExit and do not fail
        # silently: an all-NaN feature column is easy to miss downstream.
        warnings.warn(f"MACD computation failed, returning NaN: {exc!r}",
                      RuntimeWarning, stacklevel=2)
        return pd.Series(np.nan, index=close_prices.index)


In [126]:
# Normalized MACD per ticker; drop the ticker level that groupby adds to the
# index so the values align with the rows of `data`.
macd_by_ticker = data.groupby('ticker')['close'].apply(compute_macd)
data['macd'] = macd_by_ticker.reset_index(level=0, drop=True)

### Compute Lagged Returns

In [127]:
# Return horizons, counted in rows of each ticker's price series.
lags = [1, 5, 10, 21, 42, 63]

In [128]:
# One-row returns per ticker, summarized at extreme percentiles to see how
# heavy the tails are before choosing a winsorization cutoff.
returns = data.groupby('ticker')['close'].pct_change()
tails = [.0001, .001, .01]
percentiles = tails + [1 - p for p in tails]
(returns.describe(percentiles=percentiles)
        .iloc[2:]  # skip the count and mean rows
        .to_frame('percentiles')
        .style.format('{:,.2%}'))

Unnamed: 0,percentiles
std,2.18%
min,-26.39%
0.01%,-21.71%
0.1%,-9.95%
1%,-5.74%
50%,0.12%
99%,6.41%
99.9%,13.92%
99.99%,23.82%
max,29.81%


In [129]:
# Tail probability for winsorizing: returns are clipped at the q and 1 - q quantiles.
q = 0.0001

### Winsorize outliers

In [130]:
# For each horizon: per-ticker return over `lag` rows, clipped at the q and
# 1 - q quantiles taken over all tickers together, then converted to a
# geometric per-row average.
for lag in lags:
    raw = data.groupby('ticker')['close'].pct_change(lag)
    clipped = raw.clip(lower=raw.quantile(q), upper=raw.quantile(1 - q))
    data[f'return_{lag}d'] = (1 + clipped) ** (1 / lag) - 1

### Shift lagged returns

In [131]:
# Lagged copies of the 1/5/10/21-row returns: lag t moves a return t whole
# horizons (t * lag rows) down within each ticker.
for t in range(1, 6):
    for lag in (1, 5, 10, 21):
        source = f'return_{lag}d'
        data[f'{source}_lag{t}'] = data.groupby('ticker')[source].shift(t * lag)

### Compute forward returns

In [132]:
# Forward targets: the t-row return moved t rows up within each ticker, so each
# row carries the return realized over the following t rows.
for t in (1, 5, 10, 21):
    by_ticker = data.groupby('ticker')[f'return_{t}d']
    data[f'target_{t}d'] = by_ticker.shift(-t)

### Create year and month columns

In [133]:
# Calendar year and month of each row, taken from the date column.
data['year'], data['month'] = data['date'].dt.year, data['date'].dt.month

In [134]:
# One-hot encode year and month, dropping the first level of each; the dummy
# columns are named '<prefix>_<value>', e.g. 'month_8'.
data = pd.get_dummies(
    data,
    columns=['year', 'month'],
    prefix={'year': 'year', 'month': 'month'},
    prefix_sep='_',
    drop_first=True,
)

In [137]:
# Missing values per column after feature construction.
print(data.isna().sum())

date        0
close       0
high        0
low         0
open        0
           ..
month_8     0
month_9     0
month_10    0
month_11    0
month_12    0
Length: 65, dtype: int64
