
# Algorithmic Trading Machine Learning Project

This is a machine learning project that will take in S&P 500 stock price data and use an unsupervised learning trading strategy. This is strictly for educational purposes and is to help me learn machine learning.

1. Install packages.

In [33]:
# Packages installed:
# pandas
# pandas_ta
# numpy
# matplotlib
# statsmodels
# pandas_datareader
# datetime
# yfinance
# sklearn
# PyPortfolioOpt

!pip install pandas pandas_ta numpy matplotlib statsmodels pandas_datareader datetime yfinance sklearn PyPortfolioOpt -q


You should consider upgrading via the 'c:\users\kyle\onedrive\desktop\project folders\algoirthmic trading\my_virtual_env\scripts\python.exe -m pip install --upgrade pip' command.


2. Download S&P 500 data

In [3]:
from statsmodels.regression.rolling import RollingOLS
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
import pandas_ta
import warnings
warnings.filterwarnings('ignore')

# Read the S&P 500 list of companies
sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

# Remove the dot from the symbol names
sp500['Symbol'] = sp500['Symbol'].str.replace('.', '-')

# Get the list of symbols
symbols_list = sp500['Symbol'].unique().tolist()
symbols_list

# Define the start and end dates
end_date = '2023-09-27'
start_date = pd.to_datetime(end_date) - pd.DateOffset(365 * 8)

# Download the data
df = yf.download(tickers = symbols_list, 
                 start=start_date, 
                 end=end_date).stack()

df.index.names = ['date', 'ticker']
df.columns = df.columns.str.lower()


[*********************100%%**********************]  503 of 503 completed

3 Failed downloads:
['GEV', 'SOLV', 'VLTO']: YFChartError("%ticker%: Data doesn't exist for startDate = 1443499200, endDate = 1695787200")


2. Technical indicators for all the stocks

In [38]:
# Garman-Klass volatility
df['garman_klass_vol'] = ((np.log(df['high']) - np.log(df['low'])) **2/2 - ((2 * np.log(2) - 1) * (np.log(df['adj close']) - np.log(df['open'])) ** 2))

# RSI
df['rsi'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.rsi(close = x, length = 20))

# Bolinger Bands
df['bb_low'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.bbands(close = np.log1p(x), length = 20).iloc[:, 0])
df['bb_med'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.bbands(close = np.log1p(x), length = 20).iloc[:, 1])
df['bb_high'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.bbands(close = np.log1p(x), length = 20).iloc[:, 2])

# ATR
def compute_atr(stock_data):
    atr = pandas_ta.atr(high = stock_data['high'],
                        low = stock_data['low'],
                        close = stock_data['adj close'],
                        length = 14)
    return atr.sub(atr.mean()).div(atr.std())
df['atr'] = df.groupby(level = 1, group_keys = False).apply(compute_atr)

# MACD
def compute_macd(close):
    macd = pandas_ta.macd(close = close, length = 20).iloc[:,0]
    return macd.sub(macd.mean()).div(macd.std())

df['macd'] = df.groupby(level = 1, group_keys = False)['adj close'].apply(compute_macd)

# Volume
df['dollar_vol'] = (df['adj close'] * df['volume']) / 1e6
df

Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume,garman_klass_vol,rsi,bb_low,bb_med,bb_high,atr,macd,dollar_vol
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2015-09-29,A,31.425234,33.740002,34.060001,33.240002,33.360001,2252400.0,-0.001082,,,,,,,70.782197
2015-09-29,AAL,37.361626,39.180000,39.770000,38.790001,39.049999,7478800.0,-0.000443,,,,,,,279.420126
2015-09-29,AAPL,24.651134,27.264999,28.377501,26.965000,28.207500,293461600.0,-0.005712,,,,,,,7234.161370
2015-09-29,ABBV,36.334900,52.790001,54.189999,51.880001,53.099998,12842800.0,-0.054655,,,,,,,466.641852
2015-09-29,ABT,33.478706,39.500000,40.150002,39.029999,39.259998,12287500.0,-0.009402,,,,,,,411.369604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-26,XYL,88.736298,89.519997,90.849998,89.500000,90.379997,1322400.0,-0.000018,26.146726,4.485761,4.567684,4.649607,-2.967043,-2.159189,117.344880
2023-09-26,YUM,122.211014,124.010002,124.739998,123.449997,124.239998,1500600.0,-0.000051,36.057182,4.811707,4.841672,4.871637,-2.813230,-1.363695,183.389847
2023-09-26,ZBH,111.534813,112.459999,117.110001,112.419998,116.769997,3610500.0,0.000022,31.893257,4.745884,4.785551,4.825217,-2.109951,-0.881067,402.696442
2023-09-26,ZBRA,223.960007,223.960007,226.649994,222.580002,225.970001,355400.0,0.000133,29.494977,5.400991,5.539167,5.677342,-0.057389,-1.600791,79.595386


3. Filter top 150 most liquid

In [39]:
# Define the list of columns to keep
last_cols = [c for c in df.columns.unique(0) if c not in ['dollar_vol', 'volume', 'open', 'high', 'low', 'close']]

data = pd.concat([df.unstack('ticker')['dollar_vol'].resample('M').mean().stack('ticker').to_frame('dollar_vol'),
df.unstack()[last_cols].resample('M').last().stack('ticker')], axis = 1).dropna()

data

Unnamed: 0_level_0,Unnamed: 1_level_0,dollar_vol,adj close,atr,bb_high,bb_low,bb_med,garman_klass_vol,macd,rsi
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015-11-30,A,135.740924,38.950893,-0.948803,3.689080,3.544191,3.616636,-0.002098,0.567157,73.421442
2015-11-30,AAL,287.915799,39.429932,1.473834,3.827636,3.672028,3.749832,-0.000966,-0.418772,40.718982
2015-11-30,AAPL,4023.984065,26.854136,-0.326058,3.368302,3.281679,3.324991,-0.003307,-0.142790,55.537338
2015-11-30,ABBV,337.563924,40.393532,0.572009,3.823191,3.726695,3.774943,-0.059549,0.145677,49.376875
2015-11-30,ABT,211.659056,38.293579,0.553864,3.699763,3.656056,3.677909,-0.011216,0.335558,56.962682
...,...,...,...,...,...,...,...,...,...,...
2023-09-30,OTIS,154.361757,78.356499,-2.269285,4.460712,4.370137,4.415425,-0.000097,-1.534536,33.116206
2023-09-30,ABNB,1633.500725,132.279999,-1.006939,5.024801,4.857047,4.940924,0.000213,-0.037854,44.494127
2023-09-30,CEG,196.670369,107.862030,-0.705546,4.732493,4.652147,4.692320,0.000131,0.366876,55.245482
2023-09-30,GEHC,212.275849,66.130219,-0.905988,4.270508,4.155436,4.212972,0.000185,-1.116463,40.922342


In [40]:
# Rolling average 5 year
data['dollar_vol'] = (data.loc[:,'dollar_vol'].unstack('ticker').rolling(5*12, min_periods = 12).mean().stack())

# Rank the stocks by dollar volume
data['dollar_vol_rank'] = (data.groupby('date')['dollar_vol'].rank(ascending=False))

# Keep only the top 150 stocks by dollar volume
data = data[data['dollar_vol_rank']<150].drop(['dollar_vol', 'dollar_vol_rank'], axis=1)

data

Unnamed: 0_level_0,Unnamed: 1_level_0,adj close,atr,bb_high,bb_low,bb_med,garman_klass_vol,macd,rsi
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-10-31,AAL,39.134327,1.161163,3.706314,3.604673,3.655493,-0.000176,1.131596,62.203520
2016-10-31,AAPL,26.212479,-0.963372,3.351997,3.294237,3.323117,-0.002468,-0.195978,49.891144
2016-10-31,ABBV,40.245129,0.257528,3.861774,3.753446,3.807610,-0.046691,-0.760593,27.477778
2016-10-31,ABT,34.293446,-0.483970,3.655580,3.554632,3.605106,-0.007492,-0.650889,38.008738
2016-10-31,ACN,103.569633,0.222961,4.660617,4.637342,4.648980,-0.004643,-0.135456,53.823696
...,...,...,...,...,...,...,...,...,...
2023-09-30,XOM,113.372101,-1.345888,4.767283,4.687091,4.727187,-0.000065,1.400623,59.440183
2023-09-30,MRNA,98.120003,-0.529511,4.788149,4.582514,4.685332,0.000146,-0.376899,38.747314
2023-09-30,UBER,44.270000,-0.746098,3.917801,3.806654,3.862227,0.000441,-0.133973,45.005268
2023-09-30,CRWD,160.479996,-0.744862,5.181204,5.026187,5.103696,0.000144,0.245950,51.534803


4. Calculate monthly returns for different times


In [41]:
# Calculate the returns function
def calculate_returns(df):
    outlier_cutoff = 0.005
    lags = [1,2,3,6,9,12]

    for lag in lags:
        df[f'return_{lag}m'] = df['adj close'].pct_change(lag).pipe(lambda x: x.clip(lower = x.quantile(outlier_cutoff), upper = x.quantile(1-outlier_cutoff))).add(1).pow(1/lag).sub(1)
    return df
data = data.groupby(level = 1, group_keys = False).apply(calculate_returns).dropna()
data



Unnamed: 0_level_0,Unnamed: 1_level_0,adj close,atr,bb_high,bb_low,bb_med,garman_klass_vol,macd,rsi,return_1m,return_2m,return_3m,return_6m,return_9m,return_12m
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2017-10-31,AAL,45.534172,1.125078,3.994389,3.849110,3.921750,-0.000363,-0.018697,41.051800,-0.014108,0.022981,-0.023860,0.016495,0.007008,0.012702
2017-10-31,AAPL,39.713886,-0.532947,3.688475,3.594730,3.641602,-0.001055,-0.039276,69.196647,0.096807,0.015249,0.044955,0.028875,0.038941,0.035228
2017-10-31,ABBV,67.491165,1.577038,4.289424,4.196702,4.243063,-0.034008,0.473815,55.247892,0.022728,0.098590,0.091379,0.056495,0.047273,0.044026
2017-10-31,ABT,48.493378,-0.019443,3.939707,3.892568,3.916137,-0.005190,0.276132,53.844920,0.021276,0.034308,0.034801,0.038672,0.031320,0.029294
2017-10-31,ACN,129.399750,0.250124,4.882034,4.802675,4.842354,-0.003925,0.352344,69.365340,0.064180,0.048455,0.037203,0.028692,0.027398,0.018728
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-30,XOM,113.372101,-1.345888,4.767283,4.687091,4.727187,-0.000065,1.400623,59.440183,0.046947,0.046139,0.030496,0.012838,0.008747,0.027037
2023-09-30,MRNA,98.120003,-0.529511,4.788149,4.582514,4.685332,0.000146,-0.376899,38.747314,-0.132219,-0.086803,-0.068763,-0.071952,-0.064976,-0.015431
2023-09-30,UBER,44.270000,-0.746098,3.917801,3.806654,3.862227,0.000441,-0.133973,45.005268,-0.062672,-0.053920,0.008422,0.057244,0.066838,0.043691
2023-09-30,CRWD,160.479996,-0.744862,5.181204,5.026187,5.103696,0.000144,0.245950,51.534803,-0.015641,-0.003656,0.029981,0.026391,0.047942,-0.002216


5. Download Fama-French factors and calculate rolling factor betas.


In [54]:
factor_data = web.DataReader('F-F_Research_Data_5_Factors_2x3',
               'famafrench',
               start = '2010',)[0].drop('RF', axis = 1)

factor_data.index = factor_data.index.to_timestamp()
factor_data.index.name = 'date'
factor_data = factor_data.resample('M').last().div(100)

factor_data = factor_data.join(data['return_1m']).sort_index()
factor_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Mkt-RF,SMB,HML,RMW,CMA,return_1m
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-10-31,AAL,0.0225,-0.0194,0.0020,0.0093,-0.0325,-0.014108
2017-10-31,AAPL,0.0225,-0.0194,0.0020,0.0093,-0.0325,0.096807
2017-10-31,ABBV,0.0225,-0.0194,0.0020,0.0093,-0.0325,0.022728
2017-10-31,ABT,0.0225,-0.0194,0.0020,0.0093,-0.0325,0.021276
2017-10-31,ACN,0.0225,-0.0194,0.0020,0.0093,-0.0325,0.064180
...,...,...,...,...,...,...,...
2023-09-30,VRTX,-0.0524,-0.0181,0.0151,0.0187,-0.0082,0.009617
2023-09-30,VZ,-0.0524,-0.0181,0.0151,0.0187,-0.0082,-0.056890
2023-09-30,WFC,-0.0524,-0.0181,0.0151,0.0187,-0.0082,-0.015500
2023-09-30,WMT,-0.0524,-0.0181,0.0151,0.0187,-0.0082,-0.000676


In [55]:
# Filter stocks without 10 months of data

observations = factor_data.groupby(level = 1).size()
valid_stocks = observations[observations >= 10]
factor_data = factor_data[factor_data.index.get_level_values('ticker').isin(valid_stocks.index)]
factor_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Mkt-RF,SMB,HML,RMW,CMA,return_1m
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-10-31,AAL,0.0225,-0.0194,0.0020,0.0093,-0.0325,-0.014108
2017-10-31,AAPL,0.0225,-0.0194,0.0020,0.0093,-0.0325,0.096807
2017-10-31,ABBV,0.0225,-0.0194,0.0020,0.0093,-0.0325,0.022728
2017-10-31,ABT,0.0225,-0.0194,0.0020,0.0093,-0.0325,0.021276
2017-10-31,ACN,0.0225,-0.0194,0.0020,0.0093,-0.0325,0.064180
...,...,...,...,...,...,...,...
2023-09-30,VRTX,-0.0524,-0.0181,0.0151,0.0187,-0.0082,0.009617
2023-09-30,VZ,-0.0524,-0.0181,0.0151,0.0187,-0.0082,-0.056890
2023-09-30,WFC,-0.0524,-0.0181,0.0151,0.0187,-0.0082,-0.015500
2023-09-30,WMT,-0.0524,-0.0181,0.0151,0.0187,-0.0082,-0.000676


In [59]:
betas = factor_data.groupby(level = 1,
                    group_keys = False).apply(lambda x: RollingOLS(endog = x['return_1m'], exog = sm.add_constant( x.drop('return_1m', axis = 1)),
                                              window = min(24, x.shape[0]),
                                              min_nobs = len(x.columns) + 1)
.fit()
.params
.drop('const', axis = 1))

betas

Unnamed: 0_level_0,Unnamed: 1_level_0,Mkt-RF,SMB,HML,RMW,CMA
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-10-31,AAL,,,,,
2017-10-31,AAPL,,,,,
2017-10-31,ABBV,,,,,
2017-10-31,ABT,,,,,
2017-10-31,ACN,,,,,
...,...,...,...,...,...,...
2023-09-30,VRTX,0.456835,-0.444629,-0.314191,-0.077989,0.802008
2023-09-30,VZ,0.332723,-0.166037,0.265927,0.311103,0.108625
2023-09-30,WFC,1.120621,0.297484,2.062607,-0.441340,-1.519517
2023-09-30,WMT,0.700774,-0.313570,-0.413679,-0.141573,0.508836
