In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
#from google.colab import drive
import pandas as pd
import yfinance

In [None]:
# Input file locations, given relative to the notebook's working directory.
# The trailing comments keep the equivalent Google Drive paths (Colab layout).
# Only file_path_finance is read in the cells visible here.
file_path_tda = 'tda_features.csv' #'/content/drive/My Drive/LSTM/tda_features.csv'
file_path_finance = '../data_ingestion/sp500_data.csv' #'/content/drive/My Drive/LSTM/sp500_data.csv'
file_path_ticker = '../sp500.csv' #'/content/drive/My Drive/LSTM/sp500.csv'

In [None]:
# Read the price table (two header rows -> two-level column labels) and
# discard the row labelled 0.
df_finance = pd.read_csv(file_path_finance, header=[0, 1]).drop(0)

# Remove every ticker that has at least one missing value, e.g. companies
# that IPO'd inside the time window. The first level-0 label is skipped
# (presumably the date column -- confirm against the CSV layout).
temp_tickers = df_finance.columns.get_level_values(0).unique()[1:]
tickers_with_nan = [
    ticker
    for ticker in temp_tickers
    if df_finance[ticker].isnull().any().any()
]
df_finance = df_finance.drop(columns=tickers_with_nan)

# Flatten the two-level column labels into "<level0>_<level1>" strings.
flat_names = []
for col in df_finance.columns:
    flat_names.append('_'.join(col).strip() if isinstance(col, tuple) else col)
df_finance.columns = flat_names

# The flattened "Ticker_Price" column is renamed to "Date" and becomes the index.
df_finance = df_finance.rename(columns={"Ticker_Price": "Date"}).set_index("Date")


In [None]:
"""
#combining tda and finance data

df_tda["Date"] = df_tda["date"].str.replace(' 00:00:00', '', regex=False)
df_tda.drop(columns=["date"], inplace=True)
df_tda.set_index("Date", inplace=True)
"""

In [None]:
#features
import warnings
# Suppresses every Python warning from this point on (process-wide filter),
# not only the ones raised by the feature code in this cell.
warnings.filterwarnings('ignore')

# Rolling-window lengths, in rows: window_rsi feeds compute_rsi, window_vol is
# used for both the log-return volatility and the volume z-score.
window_rsi = 14
window_vol = 21

def compute_rsi(series, window):
    """Relative Strength Index from simple rolling means of gains and losses.

    Parameters
    ----------
    series : pandas.Series
        Price series, ordered in time.
    window : int
        Number of consecutive price changes averaged for each RSI value.

    Returns
    -------
    numpy.ndarray
        RSI values between 0 and 100, positionally aligned with ``series``.
        An entry is NaN whenever its window does not contain ``window``
        defined price changes -- in particular the first ``window`` positions,
        because the very first change is undefined. When the average loss in
        a window is exactly zero the RSI is 100.
    """
    delta = series.diff()

    # Keep undefined changes (the first row, or gaps in the prices) as NaN
    # rather than counting them as a zero move, so a window is only scored
    # once it really holds `window` price changes.
    undefined = delta.isna()
    gain = delta.where(delta > 0, 0.0).mask(undefined)
    loss = (-delta).where(delta < 0, 0.0).mask(undefined)

    roll_up = gain.rolling(window=window).mean().to_numpy()
    roll_down = loss.rolling(window=window).mean().to_numpy()

    # The x/0 entries are overwritten with +inf right below; silence the
    # floating-point warnings locally instead of relying on a global filter.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = roll_up / roll_down
    RS = np.where(roll_down == 0, np.inf, ratio)
    RSI = 100.0 - (100.0 / (1.0 + RS))

    return RSI

# Ticker prefixes that have BOTH a Close and a Volume column. Prefixes without
# them are left out of `tickers` entirely, so later cells that look up
# f"{ticker}_Log_Return" never hit a ticker for which no features were built.
# Sorted so that the ticker order is identical on every run.
column_names = set(df_finance.columns)
tickers = sorted(
    prefix
    for prefix in {col.split("_")[0] for col in df_finance.columns}
    if f"{prefix}_Close" in column_names and f"{prefix}_Volume" in column_names
)

# Gather all new columns first and attach them in a single concat; inserting
# them one at a time fragments the frame (four inserts per ticker).
feature_columns = {}
for ticker in tickers:
    close = df_finance[f"{ticker}_Close"]
    volume = df_finance[f"{ticker}_Volume"]

    feature_columns[f"{ticker}_RSI_14"] = np.asarray(compute_rsi(close, window=window_rsi))

    log_return = np.log(1 + close.pct_change())
    feature_columns[f"{ticker}_Log_Return"] = log_return.to_numpy()
    feature_columns[f"{ticker}_Volatility_21"] = log_return.rolling(window=window_vol).std().to_numpy()

    # Z-score of the volume against its own rolling mean / std.
    volume_window = volume.rolling(window_vol)
    feature_columns[f"{ticker}_Volume_Z"] = ((volume - volume_window.mean()) / volume_window.std()).to_numpy()

features = pd.DataFrame(feature_columns, index=df_finance.index)
# Drop any same-named columns left from an earlier run of this cell so the
# concat replaces them instead of creating duplicates.
df_finance = pd.concat(
    [df_finance.drop(columns=features.columns, errors="ignore"), features],
    axis=1,
)

#drop first row bc log return nan values
df_finance = df_finance.iloc[1:]

In [None]:
"""
#see if any nan in df
for i in range(len(df_finance)):
    row = df_finance.iloc[i]
    nan_cols = row.index[row.isnull()]
    if len(nan_cols) > 0:
        print(f"Row {i} has NaN in columns: {list(nan_cols)}")
"""

# TDA

In [None]:
from ripser import Rips
import persim
import matplotlib.pyplot as plt

In [None]:
# Stack the per-ticker log-return columns, then transpose so that each row is
# one date and each column one ticker.
per_ticker_returns = [df_finance[f"{symbol}_Log_Return"] for symbol in tickers]
log_returns = np.transpose(np.array(per_ticker_returns))

In [None]:
"""
rips = Rips(maxdim = 2) # max betti number value b2

dgm = rips.fit_transform(log_returns[0:50])

plt.figure(figsize=(5, 5), dpi=80)
plt.rcParams.update({'font.size': 10})
#persim.plot_diagrams(dgm, title="Persistence Diagram")

plt.savefig("images/homology_example_persistence-diagram.png", dpi='figure', format=None, metadata=None,
        bbox_inches=None, pad_inches=0.1,
        facecolor='white', edgecolor='auto')
"""

In [None]:
# Preview the first rows of the feature table.
df_finance.head()

In [None]:
# Parameters of the sliding-window TDA features computed below.
window_wasserstein = 30  # number of consecutive log_returns rows per window
eps = 0.5  # scale at which the Betti numbers are read off each diagram
maxdim = 2  # highest homology dimension requested from Rips
rips = Rips(maxdim=maxdim, verbose=False)

def betti_numbers_at_scale(diagrams, eps):
    """Count, per persistence diagram, the features alive at scale ``eps``.

    Each diagram is a 2-column array of (birth, death) pairs; a feature is
    alive when ``birth <= eps < death``. Returns a list holding one plain
    ``int`` per diagram, in input order.
    """
    def _alive_count(pairs):
        born = pairs[:, 0] <= eps
        not_dead_yet = pairs[:, 1] > eps
        return int(np.sum(born & not_dead_yet))

    return [_alive_count(dgm) for dgm in diagrams]

# Sliding-window persistent homology over the rows of log_returns.
# The index slices used for betti_df / wasserstein_df assume that log_returns
# and df_finance describe the same rows; fail early (before the expensive
# loop) with a clear message if that is not the case.
if len(log_returns) != len(df_finance):
    raise ValueError(
        f"log_returns has {len(log_returns)} rows but df_finance has "
        f"{len(df_finance)}; rebuild log_returns from the current df_finance."
    )

n_dims = maxdim + 1  # one diagram / Betti number per homology dimension 0..maxdim

persistences = []
betti_list = []

for start in range(0, len(log_returns) - window_wasserstein + 1):
    window_data = log_returns[start:start + window_wasserstein]
    diagrams = rips.fit_transform(window_data)
    persistences.append(diagrams)

    bettis = betti_numbers_at_scale(diagrams, eps=eps)
    # Pad with zeros if fewer than n_dims diagrams came back.
    bettis += [0] * (n_dims - len(bettis))
    betti_list.append(bettis)

# Compute Wasserstein distances between consecutive windows (H1 diagrams)
wasserstein_list = [
    persim.wasserstein(dgms_prev[1], dgms_curr[1])
    for dgms_prev, dgms_curr in zip(persistences, persistences[1:])
]

# Window k ends at row k + window_wasserstein - 1, so the Betti numbers are
# labelled with the last date of their window.
# NOTE(review): betti_df is built here but only the Wasserstein column is
# merged into df_finance in this cell.
betti_df = pd.DataFrame(
    betti_list,
    columns=[f"Betti{dim}" for dim in range(n_dims)],
    index=df_finance.index[window_wasserstein - 1:]
)
# The first distance compares window 0 with window 1, hence one row later.
wasserstein_df = pd.DataFrame(
    {"Wasserstein": wasserstein_list},
    index=df_finance.index[window_wasserstein:]
)

# Rows before the first available distance get NaN.
df_finance["Wasserstein"] = wasserstein_df.reindex(df_finance.index)["Wasserstein"]

# Clean NaN values

In [None]:
# Discard the leading rows in which at least one rolling feature is still NaN:
# the longest of the three windows decides how many rows are cut.
biggest_cut_off = max(window_rsi, window_vol, window_wasserstein)
df_finance = df_finance.iloc[biggest_cut_off:]

# Put the columns into sorted order.
ordered_columns = sorted(df_finance.columns)
df_finance = df_finance.reindex(columns=ordered_columns)

In [None]:
# Display the Wasserstein column of the trimmed feature table.
df_finance["Wasserstein"]

In [None]:
"""
df_finance.to_csv("market_features_no_sentiment.csv")
"""

In [None]:
# Write the ticker symbols held in `tickers` to valid_tickers.csv as a single
# "Ticker" column (plus the default integer index).
# `tickers` is a plain collection of symbols on the first run and the
# DataFrame built below when this cell is re-run; accept both.
if isinstance(tickers, pd.DataFrame):
    ticker_symbols = sorted(tickers["Ticker"])
else:
    ticker_symbols = sorted(tickers)

# Building the frame from a sorted list gives a deterministic row order and a
# fresh 0..n-1 index (pandas refuses to build a DataFrame from a bare set).
tickers = pd.DataFrame({"Ticker": ticker_symbols})
tickers.to_csv("valid_tickers.csv")