In [None]:
import pandas as pd
import numpy as np

In [None]:
# Wide market-feature table, indexed by the "Date" column.
df_finance = pd.read_csv(
    "../feature_extraction/market_features_no_sentiment.csv"
).set_index("Date")

# Symbols to model, taken from the "Ticker" column as a plain Python list.
tickers = pd.read_csv("../feature_extraction/valid_tickers.csv")["Ticker"].tolist()

# Turn flat "<prefix>_<rest>" column names into a two-level column index by
# splitting at the FIRST underscore only: "A_RSI_14" -> ("A", "RSI_14").
# A name without any underscore becomes (name, "").
# partition() returns (head, sep, tail); [::2] keeps (head, tail).
df_finance.columns = pd.MultiIndex.from_tuples(
    [flat_name.partition("_")[::2] for flat_name in df_finance.columns]
)

# Second-level column names that are windowed into the model input.
train_features = [
    "RSI_14",
    "Close",
    "Volume_Z",
    "Volume",
    "Volatility_21",
]

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Build sliding-window samples per ticker.
#   X_all: one array per ticker, each (n_windows, window_prediction, n_features)
#   y_all: one array per ticker, each (n_windows,)
# Each sample uses `window_prediction` consecutive rows of features; its label
# is the "Log_Return" of the row immediately after that window.
scaler = MinMaxScaler()

X_all, y_all = [], []
window_prediction = 30

for ticker in tickers:
  print(ticker)
  df_ticker_finance = df_finance[ticker]
  df_train = df_ticker_finance[train_features].copy()
  # add wasserstein distance for each ticker
  # NOTE(review): the windows below are sliced from `train_features` columns
  # only, so this merged column reaches X only if it is listed there — confirm
  # whether it is meant to be a model input.
  df_train = df_train.merge(df_finance["Wasserstein"].loc[df_train.index], left_index=True, right_index=True)
  df_train.loc[:, "Target"] = df_ticker_finance["Log_Return"] # predict log return

  # The scaler is refit on every ticker's full history, so features are scaled
  # per ticker; after the loop it only holds the fit of the last ticker.
  df_train[train_features] = scaler.fit_transform(df_train[train_features])

  # Extract the arrays once per ticker. Selecting the columns from the frame
  # inside the window loop would copy the whole frame for every window.
  feature_values = df_train[train_features].to_numpy()
  target_values = df_train["Target"].to_numpy()

  n_windows = len(df_train) - window_prediction
  X = np.array([feature_values[i:i + window_prediction] for i in range(n_windows)])
  y = np.array([target_values[i + window_prediction] for i in range(n_windows)])

  X_all.append(X)
  y_all.append(y)

In [None]:
# Flatten the per-ticker lists into single sample arrays by stacking along the
# first (sample) axis. Each per-ticker X is expected to be
# (n_windows, timesteps, n_features) and each y (n_windows,), giving
#   X_all -> (total_windows, timesteps, n_features), y_all -> (total_windows,).
# Unlike np.array(...).reshape(...), concatenate does not require every ticker
# to contribute the same number of windows.
# The isinstance guard keeps the cell idempotent: once X_all is already an
# ndarray, re-running the cell leaves it untouched instead of failing.
if isinstance(X_all, list):
    X_all = np.concatenate(X_all, axis=0)
    y_all = np.concatenate(y_all, axis=0)

assert X_all.ndim == 3 and len(X_all) == len(y_all), "X_all / y_all are misaligned"

In [None]:
# Persist the windowed inputs and targets as .npy files in the current
# working directory.
for npy_path, array in (("X_all.npy", X_all), ("y_all.npy", y_all)):
    np.save(npy_path, array)

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

from sklearn.model_selection import train_test_split

# Order-preserving split (shuffle=False): the last 10% of rows of X_all / y_all
# become the held-out test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.1, shuffle=False
)

# Two stacked LSTM layers followed by a small dense head with one linear
# output (regression on the target).
# The input shape is declared with an explicit Input layer rather than the
# `input_shape=` layer argument, which newer Keras versions warn about.
model = models.Sequential([
    layers.Input(shape=X_train.shape[1:]),
    layers.LSTM(64, return_sequences=True),
    layers.Dropout(0.2),
    layers.LSTM(32),
    layers.Dense(16, activation='relu'),
    layers.Dense(1)
])

model.compile(optimizer='adam', loss='mse')

# Fit on the training split only. Fitting on X_all with validation_split=0.1
# would validate on the same trailing rows that form X_test, so the test set
# would not be held out. validation_split takes its validation rows from the
# end of the arrays passed in, so validation here is the tail of X_train.
history = model.fit(
    X_train,
    y_train,
    epochs=30,
    batch_size=64,
    validation_split=0.1,
    shuffle=False,
)
