
# Algorithmic Trading Machine Learning Project

This is a machine learning project that will take in S&P 500 stock price data and use an unsupervised learning trading strategy. This is strictly for educational purposes and is to help me learn machine learning.

1. Install packages.

In [18]:
# Packages installed:
# pandas
# pandas_ta
# numpy
# matplotlib
# statsmodels
# pandas_datareader
# datetime
# yfinance
# sklearn
# PyPortfolioOpt

!pip install pandas pandas_ta numpy matplotlib statsmodels pandas_datareader datetime yfinance sklearn PyPortfolioOpt -q


You should consider upgrading via the 'c:\users\kyle\onedrive\desktop\project folders\algoirthmic trading\my_virtual_env\scripts\python.exe -m pip install --upgrade pip' command.


2. Download S&P 500 data

In [22]:
from statsmodels.regression.rolling import RollingOLS
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
import pandas_ta
import warnings
warnings.filterwarnings('ignore')

# Read the S&P 500 list of companies
sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

# Remove the dot from the symbol names
sp500['Symbol'] = sp500['Symbol'].str.replace('.', '-')

# Get the list of symbols
symbols_list = sp500['Symbol'].unique().tolist()
symbols_list

# Define the start and end dates
end_date = '2023-09-27'
start_date = pd.to_datetime(end_date) - pd.DateOffset(365 * 8)

# Download the data
df = yf.download(tickers = symbols_list, 
                 start=start_date, 
                 end=end_date).stack()

df.index.names = ['Date', 'Ticker']
df.columns = df.columns.str.lower()


[*********************100%%**********************]  503 of 503 completed

3 Failed downloads:
['SOLV', 'GEV', 'VLTO']: YFChartError("%ticker%: Data doesn't exist for startDate = 1443499200, endDate = 1695787200")


Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-09-29,A,31.483547,33.740002,34.060001,33.240002,33.360001,2252400.0
2015-09-29,AAL,37.361622,39.180000,39.770000,38.790001,39.049999,7478800.0
2015-09-29,AAPL,24.651134,27.264999,28.377501,26.965000,28.207500,293461600.0
2015-09-29,ABBV,36.334904,52.790001,54.189999,51.880001,53.099998,12842800.0
2015-09-29,ABT,33.478703,39.500000,40.150002,39.029999,39.259998,12287500.0
...,...,...,...,...,...,...,...
2023-09-26,XYL,88.736298,89.519997,90.849998,89.500000,90.379997,1322400.0
2023-09-26,YUM,122.211006,124.010002,124.739998,123.449997,124.239998,1500600.0
2023-09-26,ZBH,111.534821,112.459999,117.110001,112.419998,116.769997,3610500.0
2023-09-26,ZBRA,223.960007,223.960007,226.649994,222.580002,225.970001,355400.0


2. Technical indicators for all the stocks

In [32]:
# Garman-Klass volatility
df['garman_klass_vol'] = ((np.log(df['high']) - np.log(df['low'])) **2/2 - ((2 * np.log(2) - 1) * (np.log(df['adj close']) - np.log(df['open'])) ** 2))

# RSI
df['rsi'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.rsi(close = x, length = 20))

# Bolinger Bands
df['bb_low'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.bbands(close = np.log1p(x), length = 20).iloc[:, 0])
df['bb_med'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.bbands(close = np.log1p(x), length = 20).iloc[:, 1])
df['bb_high'] = df.groupby(level = 1)['adj close'].transform(lambda x: pandas_ta.bbands(close = np.log1p(x), length = 20).iloc[:, 2])

# ATR
def compute_atr(stock_data):
    atr = pandas_ta.atr(high = stock_data['high'],
                        low = stock_data['low'],
                        close = stock_data['adj close'],
                        length = 14)
    return atr.sub(atr.mean()).div(atr.std())
df['atr'] = df.groupby(level = 1, group_keys = False).apply(compute_atr)

# MACD
def compute_macd(close):
    macd = pandas_ta.macd(close = close, length = 20).iloc[:,0]
    return macd.sub(macd.mean()).div(macd.std())

df['macd'] = df.groupby(level = 1, group_keys = False)['adj close'].apply(compute_macd)

# Volume
df['dollar_vol'] = (df['adj close'] * df['volume']) / 1e6
df

Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume,garman_klass_vol,rsi,bb_low,bb_med,bb_high,atr,macd,dollar_vol
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2015-09-29,A,31.483547,33.740002,34.060001,33.240002,33.360001,2252400.0,-0.000998,,,,,,,70.913542
2015-09-29,AAL,37.361622,39.180000,39.770000,38.790001,39.049999,7478800.0,-0.000443,,,,,,,279.420098
2015-09-29,AAPL,24.651134,27.264999,28.377501,26.965000,28.207500,293461600.0,-0.005712,,,,,,,7234.161370
2015-09-29,ABBV,36.334904,52.790001,54.189999,51.880001,53.099998,12842800.0,-0.054655,,,,,,,466.641901
2015-09-29,ABT,33.478703,39.500000,40.150002,39.029999,39.259998,12287500.0,-0.009402,,,,,,,411.369558
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-26,XYL,88.736298,89.519997,90.849998,89.500000,90.379997,1322400.0,-0.000018,26.146750,4.485761,4.567684,4.649607,-2.967042,-2.159189,117.344880
2023-09-26,YUM,122.211006,124.010002,124.739998,123.449997,124.239998,1500600.0,-0.000051,36.057170,4.811707,4.841672,4.871637,-2.813230,-1.363696,183.389836
2023-09-26,ZBH,111.534821,112.459999,117.110001,112.419998,116.769997,3610500.0,0.000022,31.893251,4.745884,4.785551,4.825217,-2.109951,-0.881067,402.696470
2023-09-26,ZBRA,223.960007,223.960007,226.649994,222.580002,225.970001,355400.0,0.000133,29.494977,5.400991,5.539167,5.677342,-0.057389,-1.600791,79.595386
