In [52]:
import pandas as pd
from pathlib import Path

In [53]:
# Locate the input files relative to the current working directory
notebook_dir = Path.cwd()
data_dir = notebook_dir / 'data'
mover_file = data_dir / 'SPX_index leavers & joiners_11-Nov-2025.xlsx'
returns_file = data_dir / 'beginnings.csv'

In [54]:
# Load the daily per-ticker series (one column per ticker, 'Date' parsed as datetime).
# NOTE(review): despite the name, these look like price levels - a later cell
# derives returns from them with pct_change. TODO confirm.
returns = pd.read_csv(returns_file, parse_dates=['Date'])

In [55]:
# Ensure 'Date' is a datetime column, keep 1995 onwards and index the frame by date
returns['Date'] = pd.to_datetime(returns['Date'], format='%Y-%m-%d')
from_1995 = returns['Date'] >= '1995-01-01'
returns = returns.loc[from_1995].set_index('Date')
returns

Unnamed: 0_level_0,A.N,AA.N,AABA.OQ^J19,AAL.N^B97,AAL.OQ,AAP.N,AAPL.OQ,ABBV.N,ABI.N^K08,ABK.N^K10,...,JWN.N^E25,K.N,KATE.N^G17,KBH.N,KD.N,KDP.OQ,KEY.N,KEYS.N,KG.N^C11,KHC.OQ
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1995-01-03,,,,19.125,,,0.342634,,6.56250,12.166667,...,,27.270216,8.5625,6.5625,,,12.6250,,,
1995-01-04,,,,19.000,,,0.351562,,6.46875,12.333333,...,,27.328861,8.8750,6.5000,,,12.6250,,,
1995-01-05,,,,18.625,,,0.347098,,6.53125,12.333333,...,,27.504798,9.0000,6.6875,,,12.7500,,,
1995-01-06,,,,18.750,,,0.375000,,6.46875,12.250000,...,,27.035633,9.0000,6.6875,,,12.8125,,,
1995-01-09,,,,18.750,,,0.367885,,6.53125,12.333333,...,,26.859696,8.9375,6.8750,,,12.8750,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-24,136.10,38.61,,,17.35,43.73,258.200000,180.00,,,...,24.21,80.820000,,66.4000,34.99,32.45,17.3700,163.86,,30.64
2024-12-26,135.83,38.35,,,17.35,44.24,259.020000,179.20,,,...,24.22,81.110000,,66.2700,35.43,32.31,17.3900,164.63,,30.55
2024-12-27,135.54,37.68,,,17.35,44.06,255.590000,178.01,,,...,24.21,81.170000,,65.6900,34.83,32.27,17.2000,162.91,,30.68
2024-12-30,134.42,37.15,,,17.62,46.05,252.200000,176.20,,,...,24.11,80.820000,,65.5800,34.53,31.96,17.1100,161.17,,30.36


In [56]:
# The workbook is read without a header row, so the column names are assigned here
movers = pd.read_excel(mover_file, header=None).set_axis(
    ['Status', 'Issuer', 'Code', 'Date'], axis=1
)
display(movers.head())

Unnamed: 0,Status,Issuer,Code,Date
0,Leaver,Eastman Chemical,EMN.N,2025-11-04
1,Joiner,Qnity Electronic,Q.N,2025-11-03
2,Leaver,Carmax,KMX.N,2025-10-31
3,Joiner,Solstice Advance,SOLS.OQ,2025-10-30
4,Joiner,Applovin,APP.OQ,2025-09-22


In [57]:
# Parse the event dates and put the join/leave events in chronological order
movers = movers.assign(Date=pd.to_datetime(movers['Date'])).sort_values(by='Date')
display(movers.head(5))

Unnamed: 0,Status,Issuer,Code,Date
2120,Joiner,eSystems,ESY.N^E95 (expired),1994-12-30
1789,Joiner,Ecolab,ECL.N,1994-12-30
1790,Joiner,Edison Intl,EIX.N,1994-12-30
1791,Joiner,El Paso CGP,CGP.N^A01 (expired),1994-12-30
1792,Joiner,Emerson Electric,EMR.N,1994-12-30


In [58]:
# Normalise every code by removing the "(expired)" marker, THEN de-duplicate.
# Taking unique() before the clean-up does not guarantee uniqueness afterwards:
# two raw spellings can collapse to the same ticker once the marker is stripped.
# dict.fromkeys de-duplicates while keeping first-seen order.
unique_codes = list(dict.fromkeys(
    code.replace("(expired)", "").strip() for code in movers['Code']
))


In [59]:
# All events recorded in December 1994 define the starting composition
december_1994_data = movers[movers['Date'].dt.to_period('M') == '1994-12']

# Store the codes in the same normalised form (no "(expired)" marker) that the
# event loop uses when it adds and removes members. Seeding with the raw
# spelling would leave "... (expired)" entries in the set that no leaver event
# could ever remove.
current_sp500_composition = {
    code.replace("(expired)", "").strip() for code in december_1994_data['Code']
}
print(f"Size of the initial S&P 500 composition set: {len(current_sp500_composition)}")

Size of the initial S&P 500 composition set: 483


In [60]:
# Replay the join/leave events in order and record the index composition
# after each event date.
#
# The starting set is rebuilt here instead of reusing the set left behind by an
# earlier execution, so running this cell again gives the same result. Codes
# are normalised exactly as inside the loop; otherwise raw "... (expired)"
# spellings from December 1994 would stay in the set forever.
current_sp500_composition = {
    code.replace("(expired)", "").strip() for code in december_1994_data['Code']
}
sp500_composition_over_time = {}

for date, status, raw_code in zip(movers['Date'], movers['Status'], movers['Code']):
    code = raw_code.replace("(expired)", "").strip()

    if status == 'Joiner':
        current_sp500_composition.add(code)
    elif status == 'Leaver':
        # discard() silently ignores a leaver that is not in the current composition
        current_sp500_composition.discard(code)

    # Store a copy of the composition; when several events share a date the
    # entry is overwritten, so it ends up reflecting all events of that date
    sp500_composition_over_time[date] = set(current_sp500_composition)

print(f"Number of dates with composition changes recorded: {len(sp500_composition_over_time)}")

Number of dates with composition changes recorded: 742


In [61]:
dates = sorted(sp500_composition_over_time.keys())
unique_months = sorted({date.to_period('M') for date in dates})

# One pass over the event dates instead of rescanning every date for every month.
# `dates` is ascending, so a later date in the same month overwrites the earlier
# one and each month ends up with the composition of its latest event date.
monthly_sp500_composition = {}
for date in dates:
    monthly_sp500_composition[date.to_period('M')] = sp500_composition_over_time[date]

print(f"Number of months with recorded composition: {len(monthly_sp500_composition)}")

Number of months with recorded composition: 309


In [62]:
# Build a month x ticker membership matrix (1 = index member in that month, 0 = not).

# Column universe: every normalised code, de-duplicated and sorted
all_codes = sorted(set(unique_codes))

# Every calendar month from the first to the last recorded event
start_month = movers['Date'].min().to_period('M')
end_month = movers['Date'].max().to_period('M')
all_months = pd.period_range(start=start_month, end=end_month, freq='M').tolist()

# Starting composition: the December 1994 codes, normalised the same way as in
# the event loop below so that later leaver events can actually remove them
initial_composition = {
    code.replace("(expired)", "").strip() for code in december_1994_data['Code']
}
current_composition = set(initial_composition)

# Composition after the events of each event date (one entry per event date,
# not per calendar day)
sp500_composition_over_time = {}
for date, status, raw_code in zip(movers['Date'], movers['Status'], movers['Code']):
    code = raw_code.replace("(expired)", "").strip()

    if status == 'Joiner':
        current_composition.add(code)
    elif status == 'Leaver':
        # discard() ignores a leaver that is not in the current composition
        current_composition.discard(code)

    sp500_composition_over_time[date] = set(current_composition)

# Month-end composition = composition at the latest event date on or before the
# end of that month. Months and event dates are both ascending, so a single
# forward sweep carries the last known composition into months without events.
monthly_sp500_composition = {}
dates_with_composition = sorted(sp500_composition_over_time.keys())
latest_composition = initial_composition  # fallback if no event precedes a month
next_event = 0
for month in all_months:
    month_end = month.end_time
    while (next_event < len(dates_with_composition)
           and dates_with_composition[next_event] <= month_end):
        latest_composition = sp500_composition_over_time[dates_with_composition[next_event]]
        next_event += 1
    monthly_sp500_composition[month] = latest_composition

# One record per month: the month start timestamp plus a 0/1 flag per ticker
data = [
    {
        'Month': month.to_timestamp(),
        **{code: int(code in monthly_sp500_composition[month]) for code in all_codes},
    }
    for month in all_months
]

sp500_composition_df = pd.DataFrame(data).set_index('Month')
display(sp500_composition_df.head())

Unnamed: 0_level_0,A.N,AA.N,AABA.OQ^J19,AAL.N^B97,AAL.OQ,AAP.N,AAPL.OQ,ABBV.N,ABI.N^K08,ABK.N^K10,...,YNR.N^J00,YUM.N,YUMC.N,ZBH.N,ZBRA.OQ,ZE.N^E98,ZIMV.OQ^J25,ZION.OQ,ZRN.N^F98,ZTS.N
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1994-12-01,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1995-01-01,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1995-02-01,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1995-03-01,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1995-04-01,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0


Identify the stocks that are part of the index at the end of the training period; only these stocks are considered for training and testing. For each of them we take all available price data over the 4 years of the study period (if a stock joined the S&P 500 only during the training period, we still use its price data for the whole training period). Likewise, if a company leaves the S&P 500 afterwards, we keep it in the test set for as long as price data is available.

In [63]:
study_periods_folder = notebook_dir / 'data' / 'study_periods'
# to_csv does not create missing directories, so make sure the target exists
study_periods_folder.mkdir(parents=True, exist_ok=True)

study_period_count = 0
for year in range(1997, 2024):
    # `year` is the last year before the test year; the file is named after year + 1.
    # Constituents are the index members in December of `year`.
    # get_indexer returns a row position (not a date) for use with iloc.
    december_pos = sp500_composition_df.index.get_indexer(
        [pd.Timestamp(f'{year}-12-01')], method='nearest'
    )[0]
    membership = sp500_composition_df.iloc[december_pos]
    study_period_stocks = membership[membership == 1].index.tolist()

    # Constituents that have no column in the loaded data are skipped
    available_stocks = [stock for stock in study_period_stocks if stock in returns.columns]
    if not available_stocks:
        print(f"No data available for study period test_{year+1}; skipped.")
        continue

    # Four calendar years in one selection: year-2 .. year, plus the test year year+1
    study_period = returns.loc[f'{year-2}-01-01':f'{year+1}-12-31', available_stocks]
    study_period.to_csv(study_periods_folder / f'test_{year+1}.csv')
    study_period_count += 1

print(f"Generated {study_period_count} study periods.")

Generated 27 study periods.


In [92]:
# Reload one exported study period and index it by its parsed dates
data = pd.read_csv(study_periods_folder / 'test_2005.csv')
data = data.assign(Date=pd.to_datetime(data['Date'])).set_index('Date')

In [93]:
# Report the NaN count of every affected stock; stocks with no data at all are dropped
na_counts = data.isna().sum(axis=0)
missing_value_stocks = data.columns[na_counts > 0]
n_rows = len(data)
empty_stocks = []
for stock in missing_value_stocks:
    print(f"{stock}: {na_counts[stock]} missing values")
    if na_counts[stock] == n_rows:
        # column holds nothing but NaN -> remove it after the loop
        empty_stocks.append(stock)
        print('Deleted column:', stock)
data = data.drop(columns=empty_stocks)

AMCR.N: 1008 missing values
Deleted column: AMCR.N
CIT.N^K09: 125 missing values
CPN.N^L05: 18 missing values
DAL.N^J05: 55 missing values
DPH.N^J05: 57 missing values
FSLb.N^L06: 729 missing values
G.N^J05: 63 missing values
GENZ.OQ^D11: 1 missing values
GILD.OQ: 1 missing values
GLK.N^G05: 126 missing values
GP.N^L05: 5 missing values
HSP.N^I15: 578 missing values


NOTE: If a stock has more than 250 missing values, they cannot all lie at the very end of the study period. For stocks with missing values, either only (1000 − #NA) sequences are created (if the values are missing at the beginning), or only (#days_of_year − #NA) predictions are generated (if they are missing at the end).

In [94]:
# create returns out of the prices
returns_data = data.pct_change()
returns_data = returns_data[1:]
returns_data

  returns_data = data.pct_change()


Unnamed: 0_level_0,A.N,AABA.OQ^J19,AAPL.OQ,ABI.N^K08,ABK.N^K10,ABS.N^F06,ABT.N,ACS.N^B10,ACV.N^K06,ADBE.OQ,...,JNS.N^E17,JNY.N^D14,JP.N^D06,JPM.N,JWN.N^E25,K.N,KATE.N^G17,KBH.N,KEY.N,KG.N^C11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2002-01-03,0.063248,0.026838,0.012017,-0.051053,-0.007031,-0.002534,0.000895,-0.021382,0.003868,0.038945,...,0.021466,0.003322,0.003272,0.026685,0.035158,0.000667,0.023023,-0.011367,0.004082,-0.025788
2002-01-04,0.054019,-0.012023,0.004664,0.046866,0.007427,0.011115,-0.001789,0.017188,-0.005213,0.085248,...,0.047464,0.035220,0.002392,0.045016,0.021349,-0.021340,0.027789,-0.003321,0.007724,-0.019608
2002-01-07,-0.003966,0.043915,-0.033347,-0.009272,-0.007543,-0.016960,-0.006990,-0.007160,-0.010025,0.007242,...,-0.007610,-0.016865,-0.010195,-0.002564,-0.011401,0.003407,0.009139,-0.001282,-0.001614,0.004500
2002-01-08,0.003063,-0.010137,-0.012662,-0.050802,-0.012956,-0.021406,-0.006498,0.017308,-0.007135,0.013274,...,-0.015336,0.013014,0.006355,-0.007712,0.074964,0.012224,-0.000189,-0.013090,-0.006869,0.009209
2002-01-09,-0.023817,0.036866,-0.042459,0.021127,0.010326,-0.007835,-0.005451,0.020605,0.010431,0.027020,...,0.014159,-0.018394,0.008711,0.002850,-0.033974,-0.002348,-0.004718,0.014564,-0.002441,-0.015783
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005-12-23,-0.004847,-0.005143,-0.009052,-0.011940,0.007436,-0.117698,0.002946,0.050275,0.004354,-0.009079,...,0.007483,0.004557,0.004545,0.006492,0.003209,-0.005185,0.015710,-0.013898,0.000592,-0.001763
2005-12-27,-0.026361,-0.016741,0.011997,-0.003021,-0.005981,0.013632,-0.011258,-0.021639,-0.011706,-0.004581,...,-0.008488,-0.004213,-0.003481,-0.005210,-0.018662,-0.006572,-0.001969,-0.003356,-0.010947,0.019423
2005-12-28,-0.002354,0.007261,-0.008892,0.013258,-0.006657,0.013449,-0.001238,-0.003854,0.012064,-0.001895,...,0.003210,0.005857,0.003493,-0.004738,0.022277,-0.001597,0.014088,-0.017374,-0.004786,-0.002309
2005-12-29,-0.012684,-0.016654,-0.028816,0.001495,-0.000773,0.013744,-0.013631,-0.002523,0.000650,0.002983,...,0.001067,-0.006147,-0.005743,-0.001253,-0.004252,-0.006626,-0.003890,0.002193,-0.003306,-0.017361


In [95]:
# Only use data from first 3 years for normalization
start_date = returns_data.index.min().year
end_date = start_date + 2
returns_3years = returns_data[f'{start_date}-01-01':f'{end_date}-12-31']
mean_return = returns_3years.mean().mean()
mean_volatility = returns_3years.std().mean()
returns_data = (returns_data - mean_return) / mean_volatility

In [103]:
# stack returns in one large vector
stacked_returns = returns_data.T.stack().reset_index()
stacked_returns.columns = ['stock', 'date', 'return']
stacked_returns

Unnamed: 0,stock,date,return
0,A.N,2002-01-03,2.739748
1,A.N,2002-01-04,2.336460
2,A.N,2002-01-07,-0.197491
3,A.N,2002-01-08,0.109663
4,A.N,2002-01-09,-1.064977
...,...,...,...
258369,KG.N^C11,2005-12-23,-0.101209
258370,KG.N^C11,2005-12-27,0.824611
258371,KG.N^C11,2005-12-28,-0.125106
258372,KG.N^C11,2005-12-29,-0.782862


In [108]:
window = 240  # number of past days in each sequence

# For every stock and every row t that has at least `window` predecessors, emit one
# record holding the return at t and the `window` returns immediately before it
sequences = []
for stock, group in stacked_returns.groupby('stock'):
    vals = group['return'].values
    day_labels = group['date'].tolist()
    sequences.extend(
        {
            'stock': stock,
            'date': day_labels[t],            # the "current" date (t)
            'return': vals[t],                # the return at time t
            'sequence': vals[t - window:t],   # the preceding `window` returns
        }
        for t in range(window, len(vals))
    )

# Convert to DataFrame
sequences_df = pd.DataFrame(sequences)

In [109]:
# Show the generated sequences (the rendered frame is truncated by pandas)
sequences_df

Unnamed: 0,stock,date,return,sequence
0,A.N,2002-12-16,1.042911,"[2.7397480610005727, 2.3364597617248255, -0.19..."
1,A.N,2002-12-17,-0.569814,"[2.3364597617248255, -0.1974907018107239, 0.10..."
2,A.N,2002-12-18,-1.405504,"[-0.1974907018107239, 0.1096632485976826, -1.0..."
3,A.N,2002-12-19,-0.257593,"[0.1096632485976826, -1.0649772347946689, -0.3..."
4,A.N,2002-12-20,3.208991,"[-1.0649772347946689, -0.36590736053518036, -0..."
...,...,...,...,...
196449,KG.N^C11,2005-12-23,-0.101209,"[-3.0128376956664535, -0.7675122093278653, 0.3..."
196450,KG.N^C11,2005-12-27,0.824611,"[-0.7675122093278653, 0.3340141846843717, -0.3..."
196451,KG.N^C11,2005-12-28,-0.125106,"[0.3340141846843717, -0.3794666513231489, -0.2..."
196452,KG.N^C11,2005-12-29,-0.782862,"[-0.3794666513231489, -0.22318037295959992, -0..."


For each date, collect the returns of all stocks and create two target classes: return below the cross-sectional median and return above it.
