In [1]:
import os
import re 
from typing import Optional
import numpy as np
import pandas as pd
from pathlib import Path
import requests
from bs4 import BeautifulSoup
import holidays
import yfinance as yf
from tqdm.notebook import tqdm
from itertools import tee
from datetime import datetime, timedelta
from gp import GPTiny

In [2]:

def GetDatePairs(start,end):
    us_holidays = holidays.financial_holidays('NYSE') 
    daterange = pd.bdate_range(start=start,end=end)
    daterange = [c for c in daterange if c not in us_holidays]
    a1, a2 = tee(daterange)
    next(a2)
    pairs = list((z[0].to_pydatetime(),z[1].to_pydatetime()) for z in zip(a1, a2))
    return pairs

In [3]:
os.makedirs("./data_test", exist_ok=True)
os.makedirs("./dailies_test", exist_ok=True)

In [4]:
B = None
existingfiles = sorted(Path('./dailies_test').iterdir(), key=lambda f: f.stat().st_mtime)
latestfile = [str(f) for f in existingfiles if "top" in (str(f)) ]
x = None#re.search(r'(\d+[-]\d+[-]\d+)', latestfile[-1])

if(x is None):
    B = GetDatePairs(start='2024-11-01',end=(datetime.today()).strftime("%Y-%m-%d"))
    
else:
    B = GetDatePairs(start=x.group(0),end=(datetime.today()).strftime("%Y-%m-%d"))
    
print(f"Start: {B[0][0]}, End:{B[-1][1]}")



Start: 2024-11-01 00:00:00, End:2026-01-23 00:00:00


In [5]:
snp500 = None
filepath = Path("./data/sp500_companies.csv")
if(False==filepath.is_file()):
  headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36'}
  html_data=requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies',headers=headers).text
  beautiful_soup=BeautifulSoup(html_data, "html.parser")
  tables = beautiful_soup.find_all('table')
  
  S_P_500_companies = list([])

  for row in tables[1].tbody.find_all("tr"):
      col = row.find_all("td")
      if (col != []):
          Symbol = col[0].text.strip().replace('\n','')
          Security = col[1].text.strip().replace('\n','')
          Sector = col[2].text.strip().replace('\n','')
          Sub_Industry = col[3].text.strip().replace('\n','')
          Headquarters_Location = col[4].text.strip().replace('\n','')
          Date_first = col[5].text.strip().replace('\n','')
          CIK = col[6].text.strip().replace('\n','')
          Founded = col[7].text.strip().replace('\n','')
          S_P_500_companies.append({"Symbol":Symbol, "Security":Security, "Sector":Sector, "Sub-Industry":Sub_Industry,
            "Headquarters":Headquarters_Location,"Date-Added":Date_first,"CIK":CIK,"Founded":Founded})
  snp500 = pd.DataFrame(data=S_P_500_companies)
  snp500.Symbol = snp500.Symbol.str.replace('.','-') 
  snp500.to_csv('./data/sp500_companies.csv',index=False)
else:
   snp500 = pd.read_csv('./data/sp500_companies.csv')

In [6]:
snp500.head()

Unnamed: 0,Symbol,Security,Sector,Sub-Industry,Headquarters,Date-Added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989


In [7]:
def MungeData(sym, spy, start, end):
    try:
        if(spy is None):
            stock = yf.Ticker('^GSPC')
            stockhist = stock.history(start=start,end=end)
            spy = stockhist[['Open', 'High', 'Low', 'Close', 'Volume']].copy().pct_change(fill_method=None)+1

        stock = yf.Ticker(sym)
        stockhist = stock.history(start=start,end=end)
        x = stockhist[['Open', 'High', 'Low', 'Close', 'Volume']].copy().pct_change(fill_method=None)+1
        x /= spy
        x = np.log(x)
        x.insert(0,'Days',(x.index-x.index[0]).days)
        for c in ['Open','Close','High','Low','Volume']:
            x[c+'_Div'] = x[c].diff()/x['Days'].diff()
            x[c+'_Div2'] = x[c+'_Div'].diff()/x['Days'].diff()
            for i in [7,14,21,28]:
                x[c+'_'+str(i)+'_MN'] = x[c].rolling(pd.Timedelta(days=i)).mean()
                x[c+'_'+str(i)+'_SD'] = (x[c].rolling(pd.Timedelta(days=i)).std())

        _ = x.pop('Days')
        x.insert(0,"Sym",sym)
        x['Target'] = x.groupby('Sym').Close.shift(-1).fillna(-999)
        x.replace([np.inf, -np.inf], np.nan, inplace=True)
        x.dropna(inplace=True)
        
        return x, spy
    except:
        return None, spy

In [8]:
start = B[0][0].strftime("%Y-%m-%d")
end = B[-1][1].strftime("%Y-%m-%d")
print(start,end)
spy = None
g = []
for c in tqdm(list(snp500.Symbol.values)):

    l,spy = MungeData(c, spy, (datetime.strptime(start,"%Y-%m-%d")-timedelta(weeks=5)).strftime("%Y-%m-%d"), end)#Requred to ensure all  rolling values are populated
    
    if l is not None:
        g.append(l)
  
o= pd.concat(g, axis=0)


2024-11-01 2026-01-23


  0%|          | 0/503 [00:00<?, ?it/s]

In [9]:
gpModel = GPTiny()
for start, end in tqdm(B):
    filepath = Path('./dailies_test/top_close_250_'+start.strftime("%Y-%m-%d")+'_'+end.strftime("%Y-%m-%d")+'.csv')
    if(False==filepath.is_file()):
        #print(start.strftime("%Y-%m-%d"),end.strftime("%Y-%m-%d"))
        mask = (o.index >= start.strftime("%Y-%m-%d")) & (o.index < end.strftime("%Y-%m-%d"))
        x = o[mask].copy()
        #print(x.shape)
        wset = []
        lset = []
        if(x.shape[0]>0):
            #print(x.shape)
            for gp in range(10):
                t = x[['Sym']].copy()
                if(gp==0):
                    t['Target'] = gpModel.GPI(x)
                elif(gp==1):
                    t['Target'] = gpModel.GPII(x)
                elif(gp==2):
                    t['Target'] = gpModel.GPIII(x)
                elif(gp==3):
                    t['Target'] = gpModel.GPIV(x)
                elif(gp==4):
                    t['Target'] = gpModel.GPV(x)
                if(gp==5):
                    t['Target'] = gpModel.GPVI(x)
                elif(gp==6):
                    t['Target'] = gpModel.GPVII(x)
                elif(gp==7):
                    t['Target'] = gpModel.GPVIII(x)
                elif(gp==8):
                    t['Target'] = gpModel.GPIX(x)
                else:
                    t['Target'] = gpModel.GPX(x)

                w = t.reset_index().pivot(index='Date',columns='Sym',values='Target')

                w1 = pd.DataFrame({n: w.T[col].nlargest(250).index.tolist()
                                for n, col in enumerate(w.T)}).T
                l1 = pd.DataFrame({n: w.T[col].nsmallest(250).index.tolist()
                                for n, col in enumerate(w.T)}).T
                wset.append(w1.values[0])
                lset.append(l1.values[0])    

                
            wsubset = pd.DataFrame(data={'Sym':list(set.intersection(*map(set,wset)))})
            lsubset = pd.DataFrame(data={'Sym':list(set.intersection(*map(set,lset)))})

            wstd = {}
            lstd = {}
            for sym in wsubset.Sym.values:
                mask = (o.index >= start.strftime("%Y-%m-%d")) & (o.index < end.strftime("%Y-%m-%d"))
                x = o[mask].copy()
                x = x[x['Sym'] == sym]
                v = [gpModel.GPI(x),gpModel.GPII(x),gpModel.GPIII(x),gpModel.GPIV(x),gpModel.GPV(x),
                     gpModel.GPVI(x),gpModel.GPVII(x),gpModel.GPVIII(x),gpModel.GPIX(x),gpModel.GPX(x)]
                wstd[sym] = [np.std(v),np.min(v),np.mean(v),np.max(v)]
            for sym in lsubset.Sym.values:
                mask = (o.index >= start.strftime("%Y-%m-%d")) & (o.index < end.strftime("%Y-%m-%d"))
                x = o[mask].copy()
                x = x[x['Sym'] == sym]
                v = [gpModel.GPI(x),gpModel.GPII(x),gpModel.GPIII(x),gpModel.GPIV(x),gpModel.GPV(x),
                     gpModel.GPVI(x),gpModel.GPVII(x),gpModel.GPVIII(x),gpModel.GPIX(x),gpModel.GPX(x)]
                lstd[sym] = [np.std(v),np.min(v),np.mean(v),np.max(v)]
        
            w = pd.DataFrame.from_dict(wstd,orient='index',columns=['Std','Mi','Mn','Ma'])
            w.index.name = 'Sym'
            w.to_csv('./dailies_test/top_close_250_'+start.strftime("%Y-%m-%d")+'_'+end.strftime("%Y-%m-%d")+'.csv')
            l = pd.DataFrame.from_dict(lstd,orient='index',columns=['Std','Mi','Mn','Ma'])
            l.index.name = 'Sym'
            l.to_csv('./dailies_test/bottom_close_250_'+start.strftime("%Y-%m-%d")+'_'+end.strftime("%Y-%m-%d")+'.csv')
            

  0%|          | 0/305 [00:00<?, ?it/s]

2026-01-22 00:00:00 2026-01-23 00:00:00
Top:
https://uk.finance.yahoo.com/quote/IP,YUM,RTX,HAL,ECL,DLTR,KKR,GE,AVB,ITW,SYY,BA,WAB,O,SO,MAR,HII,VRSN,SNPS,IDXX,DXCM,PM,TDG,PEP,MPC,CTAS,APA,HBAN,TECH,NFLX,ROST,TER,GD/
Bottom:
https://uk.finance.yahoo.com/quote/STT,XYZ,LKQ,JCI,KEYS,SCHW,BAC,RJF,EQT,TSLA,KR,BSX,VRTX,ADI,GPC,REGN,INVH,EME,TT,ON,C,MPWR,DDOG,ANET,PPG,WMT,TXT,AMGN,DVA,BIIB,FSLR,ABNB,EXPD,NVR,PNR,GOOGL,NRG,ORLY,TRV,TSCO,MTB,ABBV,STLD,NVDA,EXE,TTD/


In [11]:
start, end = B[-1]
print(start,end)
bottom = pd.read_csv('./dailies_test/bottom_close_250_'+start.strftime("%Y-%m-%d")+'_'+end.strftime("%Y-%m-%d")+'.csv')
top = pd.read_csv('./dailies_test/top_close_250_'+start.strftime("%Y-%m-%d")+'_'+end.strftime("%Y-%m-%d")+'.csv')
print("Top:")
if(len(top.Sym)):
    print('https://uk.finance.yahoo.com/quote/{0}/'.format(','.join(top.Sym.values)))
else:
    print("No Recommendations")
print("Bottom:")
if(len(bottom.Sym)):
    print('https://uk.finance.yahoo.com/quote/{0}/'.format(','.join(bottom.Sym.values)))
else:
    print("No Recommendations")

2026-01-22 00:00:00 2026-01-23 00:00:00
Top:
https://uk.finance.yahoo.com/quote/IP,YUM,RTX,HAL,ECL,DLTR,KKR,GE,AVB,ITW,SYY,BA,WAB,O,SO,MAR,HII,VRSN,SNPS,IDXX,DXCM,PM,TDG,PEP,MPC,CTAS,APA,HBAN,TECH,NFLX,ROST,TER,GD/
Bottom:
https://uk.finance.yahoo.com/quote/STT,XYZ,LKQ,JCI,KEYS,SCHW,BAC,RJF,EQT,TSLA,KR,BSX,VRTX,ADI,GPC,REGN,INVH,EME,TT,ON,C,MPWR,DDOG,ANET,PPG,WMT,TXT,AMGN,DVA,BIIB,FSLR,ABNB,EXPD,NVR,PNR,GOOGL,NRG,ORLY,TRV,TSCO,MTB,ABBV,STLD,NVDA,EXE,TTD/
