In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from ripser import Rips
import persim
from keras.models import load_model

model = load_model("lstm.keras", compile=False)

In [2]:
tickers_df = pd.read_csv("../feature_extraction/valid_tickers.csv")
tickers = tickers_df["ticker"].tolist()

In [25]:
end_date = datetime.now().date()
start_date = end_date - timedelta(days=61)

tickers_df = pd.read_csv("../feature_extraction/valid_tickers.csv")
tickers = tickers_df["ticker"].tolist()

sp500_data = yf.download(tickers, start=start_date, end=end_date, group_by="ticker")

  sp500_data = yf.download(tickers, start=start_date, end=end_date, group_by="ticker")
[*********************100%***********************]  474 of 474 completed


In [26]:
df_finance = sp500_data.copy()
df_finance.columns = [f"{col[0]}_{col[1]}" for col in df_finance.columns]

df_finance = df_finance.reset_index()
df_finance = df_finance.set_index('Date')

In [27]:
window_rsi = 14
window_vol = 21

def compute_rsi(series, window):
    delta = series.diff()
    gain = np.where(delta > 0, delta, 0)
    loss = np.where(delta < 0, -delta, 0)
    roll_up = pd.Series(gain, index=series.index).rolling(window=window).mean()
    roll_down = pd.Series(loss, index=series.index).rolling(window=window).mean()
    
    RS = np.where(roll_down == 0, np.inf, roll_up / roll_down)
    RSI = 100.0 - (100.0 / (1.0 + RS))
    
    return RSI

for ticker in tickers:
    close_col = f"{ticker}_Close"
    volume_col = f"{ticker}_Volume"

    if close_col not in df_finance.columns or volume_col not in df_finance.columns:
        continue

    df_finance[f"{ticker}_RSI_14"] = compute_rsi(df_finance[close_col], window=window_rsi)
    
    log_return = np.log(1 + df_finance[close_col].pct_change())
    df_finance[f"{ticker}_Log_Return"] = log_return
    df_finance[f"{ticker}_Volatility_21"] = log_return.rolling(window=window_vol).std()
    
    df_finance[f"{ticker}_Volume_Z"] = (df_finance[volume_col] - df_finance[volume_col].rolling(window_vol).mean()) / df_finance[volume_col].rolling(window_vol).std()
# Drop first row due to NaN values
df_finance = df_finance.iloc[1:]

  df_finance[f"{ticker}_RSI_14"] = compute_rsi(df_finance[close_col], window=window_rsi)
  df_finance[f"{ticker}_Log_Return"] = log_return
  df_finance[f"{ticker}_Volatility_21"] = log_return.rolling(window=window_vol).std()
  df_finance[f"{ticker}_Volume_Z"] = (df_finance[volume_col] - df_finance[volume_col].rolling(window_vol).mean()) / df_finance[volume_col].rolling(window_vol).std()
  df_finance[f"{ticker}_RSI_14"] = compute_rsi(df_finance[close_col], window=window_rsi)
  df_finance[f"{ticker}_Log_Return"] = log_return
  df_finance[f"{ticker}_Volatility_21"] = log_return.rolling(window=window_vol).std()
  df_finance[f"{ticker}_Volume_Z"] = (df_finance[volume_col] - df_finance[volume_col].rolling(window_vol).mean()) / df_finance[volume_col].rolling(window_vol).std()
  df_finance[f"{ticker}_RSI_14"] = compute_rsi(df_finance[close_col], window=window_rsi)
  df_finance[f"{ticker}_Log_Return"] = log_return
  df_finance[f"{ticker}_Volatility_21"] = log_return.rolling(window=window_vol).s

  df_finance[f"{ticker}_Volatility_21"] = log_return.rolling(window=window_vol).std()
  df_finance[f"{ticker}_Volume_Z"] = (df_finance[volume_col] - df_finance[volume_col].rolling(window_vol).mean()) / df_finance[volume_col].rolling(window_vol).std()
  df_finance[f"{ticker}_RSI_14"] = compute_rsi(df_finance[close_col], window=window_rsi)
  df_finance[f"{ticker}_Log_Return"] = log_return
  df_finance[f"{ticker}_Volatility_21"] = log_return.rolling(window=window_vol).std()
  df_finance[f"{ticker}_Volume_Z"] = (df_finance[volume_col] - df_finance[volume_col].rolling(window_vol).mean()) / df_finance[volume_col].rolling(window_vol).std()
  df_finance[f"{ticker}_RSI_14"] = compute_rsi(df_finance[close_col], window=window_rsi)
  df_finance[f"{ticker}_Log_Return"] = log_return
  df_finance[f"{ticker}_Volatility_21"] = log_return.rolling(window=window_vol).std()
  df_finance[f"{ticker}_Volume_Z"] = (df_finance[volume_col] - df_finance[volume_col].rolling(window_vol).mean()) / df_finance[volume_

In [31]:
window_wasserstein = 30
eps = 0.5  
maxdim = 2 
rips = Rips(maxdim=maxdim, verbose=False)

def betti_numbers_at_scale(diagrams, eps):
    bettis = []
    for dgm in diagrams:
        alive = np.sum((dgm[:, 0] <= eps) & (dgm[:, 1] > eps))
        bettis.append(int(alive))
    return bettis

first_ticker = list(tickers)[0]

log_returns = df_finance[f"{first_ticker}_Log_Return"].dropna().values

persistences = []
betti_list = []
wasserstein_list = []

for start in range(0, len(log_returns) - window_wasserstein + 1):
    window_data = log_returns[start:start + window_wasserstein]
    diagrams = rips.fit_transform(window_data)
    persistences.append(diagrams)

    bettis = betti_numbers_at_scale(diagrams, eps=eps)
    while len(bettis) < 3:
        bettis.append(0)
    betti_list.append(bettis)

for i in range(1, len(persistences)):
    dgm_prev = persistences[i - 1][1]
    dgm_curr = persistences[i][1]
    W = persim.wasserstein(dgm_prev, dgm_curr)
    wasserstein_list.append(W)

betti_df = pd.DataFrame(
    betti_list,
    columns=["Betti0", "Betti1", "Betti2"],
    index=df_finance.index[window_wasserstein - 1:]
)
wasserstein_df = pd.DataFrame(
    {"Wasserstein": wasserstein_list},
    index=df_finance.index[window_wasserstein:]
)

df_finance["Wasserstein"] = wasserstein_df.reindex(df_finance.index)["Wasserstein"]

for col in ["Betti0", "Betti1", "Betti2"]:
    df_finance[col] = betti_df.reindex(df_finance.index)[col]

IndexError: tuple index out of range

In [None]:
df_finance = df_finance[sorted(df_finance.columns)]
df_finance

Unnamed: 0_level_0,AAPL_Close,AAPL_High,AAPL_Log_Return,AAPL_Low,AAPL_Open,AAPL_RSI_14,AAPL_Volatility_21,AAPL_Volume,AAPL_Volume_Z,ABBV_Close,...,ZBRA_Volume_Z,ZTS_Close,ZTS_High,ZTS_Log_Return,ZTS_Low,ZTS_Open,ZTS_RSI_14,ZTS_Volatility_21,ZTS_Volume,ZTS_Volume_Z
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-09-16,238.149994,241.220001,0.006107,236.320007,237.179993,,,63421100,,214.712112,...,,147.460007,148.570007,0.00824,146.289993,146.289993,,,2664900,
2025-09-17,238.990005,240.100006,0.003521,237.729996,238.970001,,,46508000,,219.229553,...,,146.929993,149.539993,-0.003601,146.559998,147.5,,,1958300,
2025-09-18,237.880005,241.199997,-0.004655,236.649994,239.970001,,,44249600,,220.401108,...,,147.100006,148.039993,0.001156,145.149994,146.300003,,,2739400,
2025-09-19,245.5,246.300003,0.031531,240.210007,241.229996,,,163741300,,220.87767,...,,145.880005,147.75,-0.008328,145.300003,147.75,,,4455800,
2025-09-22,256.079987,256.640015,0.042193,248.119995,248.300003,,,105517400,,220.996811,...,,144.649994,146.710007,-0.008467,144.350006,146.550003,,,2044700,
2025-09-23,254.429993,257.339996,-0.006464,253.580002,255.880005,,,60275200,,221.016663,...,,142.610001,146.179993,-0.014203,141.529999,143.25,,,3679500,
2025-09-24,252.309998,255.740005,-0.008367,251.039993,255.220001,,,42303700,,218.415421,...,,141.669998,144.009995,-0.006613,140.539993,141.619995,,,4224500,
2025-09-25,256.869995,257.170013,0.017912,251.710007,253.210007,,,55202100,,216.9758,...,,141.130005,142.0,-0.003819,139.339996,141.330002,,,3058500,
2025-09-26,255.460007,257.600006,-0.005504,253.779999,254.100006,,,46076300,,219.030991,...,,143.5,143.789993,0.016654,141.270004,141.860001,,,2575200,
2025-09-29,254.429993,255.0,-0.00404,253.009995,254.559998,,,40127700,,221.562729,...,,143.059998,144.149994,-0.003071,142.5,143.860001,,,2870100,


In [None]:
df_finance.columns = pd.MultiIndex.from_tuples(
    [(col.split("_")[0], "_".join(col.split("_")[1:])) for col in df_finance.columns]
)

train_features = ['Close', 'High', 'Low', 'Open', 'RSI_14', 'Volatility_21', 'Volume', 'Volume_Z', 'Sentiment']

scaler = MinMaxScaler()

In [None]:
df_finance

Unnamed: 0_level_0,ATO,ATO,ATO,ATO,ATO,FIS,FIS,FIS,FIS,FIS,...,BK,BK,BBY,BBY,BBY,BBY,Wasserstein,Betti0,Betti1,Betti2
Unnamed: 0_level_1,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,...,Volatility_21,Volume_Z,RSI_14,Log_Return,Volatility_21,Volume_Z,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2025-09-16,164.610001,165.75,162.899994,163.210007,1182300,66.730003,66.949997,65.660004,66.440002,2989700,...,,,,-0.006865,,,,,,
2025-09-17,163.570007,164.869995,162.529999,163.449997,1052400,66.540001,68.150002,66.32,67.150002,4000800,...,,,,0.006462,,,,,,
2025-09-18,161.130005,163.119995,160.100006,162.440002,1004300,67.25,67.32,66.059998,66.239998,3720900,...,,,,0.002172,,,,,,
2025-09-19,162.779999,163.940002,161.449997,162.929993,1605700,66.599998,66.959999,65.029999,65.110001,6951500,...,,,,-0.020554,,,,,,
2025-09-22,162.720001,163.600006,162.199997,162.960007,778100,64.900002,65.080002,64.260002,64.650002,5799900,...,,,,0.001383,,,,,,
2025-09-23,162.710007,166.490005,162.699997,166.339996,822200,64.580002,64.650002,63.200001,63.369999,4738100,...,,,,0.009357,,,,,,
2025-09-24,166.339996,167.240005,166.080002,166.770004,621500,63.209999,64.300003,63.0,64.129997,4875300,...,,,,0.015763,,,,,,
2025-09-25,166.919998,168.860001,166.0,166.460007,758700,64.120003,65.0,63.650002,63.790001,4264900,...,,,,0.00027,,,,,,
2025-09-26,167.679993,169.960007,167.039993,168.449997,687300,63.889999,64.339996,63.66,64.239998,4640500,...,,,,0.026076,,,,,,
2025-09-29,168.899994,169.699997,167.380005,169.550003,753100,64.480003,66.0,64.110001,65.779999,4817800,...,,,,-0.006455,,,,,,


In [None]:
# Cell 6: Make predictions for each ticker
window_prediction = 30
predictions_dict = {}

for ticker in available_tickers[:10]:  # Process first 10 tickers as example
    try:
        print(f"Processing {ticker}...")
        
        # Get ticker data
        df_ticker_finance = df_finance[ticker]
        
        # Check if all required features are available
        available_features = [f for f in train_features if f in df_ticker_finance.columns]
        
        if len(available_features) < len(train_features):
            print(f"Skipping {ticker}: Missing features {set(train_features) - set(available_features)}")
            continue
            
        df_train = df_ticker_finance[available_features].copy()
        
        # Add Wasserstein distance
        df_train = df_train.merge(df_finance["Wasserstein"].loc[df_train.index], left_index=True, right_index=True)
        
        # Scale features
        df_train[available_features] = scaler.fit_transform(df_train[available_features])
        
        # Prepare sequences for prediction
        if len(df_train) >= window_prediction:
            # Get the most recent sequence
            last_sequence = df_train[available_features].iloc[-window_prediction:].values
            last_sequence = last_sequence.reshape(1, window_prediction, len(available_features))
            
            # Make prediction
            prediction = model.predict(last_sequence, verbose=0)[0][0]
            
            # Get the latest actual log return for comparison
            latest_log_return = df_ticker_finance["Log_Return"].iloc[-1]
            
            predictions_dict[ticker] = {
                'predicted_log_return': prediction,
                'latest_actual_log_return': latest_log_return,
                'prediction_date': df_train.index[-1]
            }
            
            print(f"{ticker}: Predicted = {prediction:.6f}, Actual = {latest_log_return:.6f}")
        else:
            print(f"Skipping {ticker}: Not enough data points")
            
    except Exception as e:
        print(f"Error processing {ticker}: {e}")
        continue

print(f"\nPredictions completed for {len(predictions_dict)} tickers")

In [None]:
# Cell 7: Display results and save predictions
# Create results dataframe
results_df = pd.DataFrame.from_dict(predictions_dict, orient='index')
results_df = results_df.sort_values('predicted_log_return', ascending=False)

print("Top 10 predicted positive returns:")
print(results_df.head(10))

print("\nTop 10 predicted negative returns:")
print(results_df.tail(10))

# Save predictions
results_df.to_csv("latest_predictions.csv")
print(f"\nPredictions saved to latest_predictions.csv")

# Display summary statistics
print(f"\nSummary Statistics:")
print(f"Average predicted return: {results_df['predicted_log_return'].mean():.6f}")
print(f"Standard deviation: {results_df['predicted_log_return'].std():.6f}")
print(f"Min predicted return: {results_df['predicted_log_return'].min():.6f}")
print(f"Max predicted return: {results_df['predicted_log_return'].max():.6f}")