In [None]:
import pandas as pd
import numpy as np

In [None]:
df_finance = pd.read_csv("../feature_extraction/market_features.csv").set_index("Date")
tickers = pd.read_csv("../feature_extraction/valid_tickers.csv")

tickers = tickers["ticker"].tolist()

df_finance.columns = pd.MultiIndex.from_tuples(
    [(col.split("_")[0], "_".join(col.split("_")[1:])) for col in df_finance.columns]
)

train_features = ['Close', 'High', 'Low', 'Open', 'RSI_14', 'Volatility_21', 'Volume', 'Volume_Z', 'Sentiment',]

In [None]:
test_df_finance = df_finance.copy()
available_tickers = df_finance.columns.get_level_values(0).unique()

valid_tickers = []

for ticker in available_tickers:
    if (ticker, 'Sentiment') not in test_df_finance.columns:
      print(f"Ticker {ticker} missing Sentiment column, removing.")
    else:
      valid_tickers.append(ticker)

df_finance = df_finance[valid_tickers + ['Wasserstein']]

In [40]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

scaler = MinMaxScaler()
window_prediction = 30

train_ratio = 0.8

X_train_all, X_test_all, y_train_all, y_test_all = [], [], [], []

all_train_data_for_scaling = []

for ticker in tickers:
  try:
    df_ticker_finance = df_finance[ticker]
    df_train = df_ticker_finance[train_features].copy()

    df_train = df_train.merge(df_finance["Wasserstein"].loc[df_train.index], left_index=True, right_index=True)
    
    train_size = int(len(df_train) * train_ratio)
    df_train_split = df_train.iloc[:train_size]
    df_test_split = df_train.iloc[train_size:]
    
    all_train_data_for_scaling.append(df_train_split[train_features])
    
  except Exception as e:
    print(f"Error collecting data for {ticker}: {e}")
    continue

if all_train_data_for_scaling:
    combined_train_data = pd.concat(all_train_data_for_scaling, ignore_index=True)
    scaler.fit(combined_train_data[train_features])
    print(f"Scaler fitted on {len(combined_train_data)} training samples")
else:
    print("No training data collected for scaling!")

for ticker in tickers:
  try:
    df_ticker_finance = df_finance[ticker]
    df_train = df_ticker_finance[train_features].copy()

    df_train = df_train.merge(df_finance["Wasserstein"].loc[df_train.index], left_index=True, right_index=True)
    df_train.loc[:, "Target"] = df_ticker_finance["Log_Return"]

    train_size = int(len(df_train) * train_ratio)
    df_train_split = df_train.iloc[:train_size]
    df_test_split = df_train.iloc[train_size:]
    
    df_train_split[train_features] = scaler.transform(df_train_split[train_features])
    df_test_split[train_features] = scaler.transform(df_test_split[train_features])
    
  except Exception as e:
    print(f"Error processing {ticker}: {e}")
    continue
  
  # Create sequences for training data
  X_train, y_train = [], []
  for i in range(len(df_train_split) - window_prediction):
    X_train.append(df_train_split[train_features].iloc[i:i+window_prediction].values)
    y_train.append(df_train_split['Target'].iloc[i+window_prediction])
  
  # Create sequences for test data
  X_test, y_test = [], []
  for i in range(len(df_test_split) - window_prediction):
    X_test.append(df_test_split[train_features].iloc[i:i+window_prediction].values)
    y_test.append(df_test_split['Target'].iloc[i+window_prediction])
  
  if len(X_train) > 0 and len(X_test) > 0:
    X_train_all.append(np.array(X_train))
    y_train_all.append(np.array(y_train))
    X_test_all.append(np.array(X_test))
    y_test_all.append(np.array(y_test))

Error collecting data for IBM: 'IBM'
Error collecting data for MMM: 'MMM'
Error collecting data for IEX: 'IEX'
Error collecting data for ZTS: 'ZTS'
Error collecting data for T: 'T'
Scaler fitted on 705845 training samples


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_split[train_features] = scaler.transform(df_train_split[train_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_split[train_features] = scaler.transform(df_test_split[train_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_split[train_features] = scaler.tra

Error processing IBM: 'IBM'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_split[train_features] = scaler.transform(df_train_split[train_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_split[train_features] = scaler.transform(df_test_split[train_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_split[train_features] = scaler.tra

Error processing MMM: 'MMM'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_split[train_features] = scaler.transform(df_train_split[train_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_split[train_features] = scaler.transform(df_test_split[train_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_split[train_features] = scaler.tra

Error processing IEX: 'IEX'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_split[train_features] = scaler.transform(df_train_split[train_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_split[train_features] = scaler.transform(df_test_split[train_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_split[train_features] = scaler.tra

Error processing ZTS: 'ZTS'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_split[train_features] = scaler.transform(df_train_split[train_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_split[train_features] = scaler.transform(df_test_split[train_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_split[train_features] = scaler.tra

Error processing T: 'T'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_split[train_features] = scaler.transform(df_train_split[train_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_split[train_features] = scaler.transform(df_test_split[train_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_split[train_features] = scaler.tra

In [41]:
X_train_all = np.array(X_train_all)
X_test_all = np.array(X_test_all)
y_train_all = np.array(y_train_all)
y_test_all = np.array(y_test_all)

X_train_all = X_train_all.reshape(-1, X_train_all.shape[2], X_train_all.shape[3])
X_test_all = X_test_all.reshape(-1, X_test_all.shape[2], X_test_all.shape[3])
y_train_all = y_train_all.reshape(-1)
y_test_all = y_test_all.reshape(-1)

In [44]:
np.save("X_all.npy", X_train_all)
np.save("X_test_all.npy", X_test_all)
np.save("y_all.npy", y_train_all)
np.save("y_test_all.npy", y_test_all)

import joblib

joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']

In [49]:
np.mean(y_train_all)


np.float64(0.00042740508848008877)