In [5]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore", UserWarning)

import sys

# Add the parent of the current working directory to sys.path so that the
# project packages imported by this notebook can be resolved.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Only append when missing: re-running this cell would otherwise add the same
# entry to sys.path again on every execution.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [8]:
from data.data_fetcher import get_stock_df
from feature.label import compute_labels
from feature.feature import create_batch_feature
from model.train import split_train_test_data
from config.config import label_names

In [3]:
# Date range of the pre-downloaded training dataset; both values are embedded
# in the CSV file name below.
start_date = "2023-04-01"
end_date = "2025-03-31"
csv_file = f"../data/dataset/stock_training_{start_date}_{end_date}.csv"

# Fail fast and name the missing path so the reader knows exactly which file
# (and therefore which date range) has to be downloaded.
if not os.path.isfile(csv_file):
    raise FileNotFoundError(
        f"Dataset not found: {csv_file}. "
        "Please run data_fetcher.py to download the data first.")

df_all = pd.read_csv(csv_file)
df_all['Date'] = pd.to_datetime(df_all['Date'])
# Reassign rather than mutate in place so the cell stays easy to reason about.
df_all = df_all.set_index('Date')
# Replace the datetime index with plain datetime.date objects.
df_all.index = df_all.index.date

# Distinct values of the 'stock' column, in order of first appearance.
stocks = df_all['stock'].unique()

In [6]:
# Build the full sample set by running label computation and feature extraction
# for every stock, skipping any stock that fails or yields non-finite values.
all_features, all_labels, all_dates = None, None, None

# Per-stock results are collected in lists and concatenated once after the
# loop; concatenating inside the loop would recopy the accumulated arrays on
# every iteration.
feature_chunks, label_chunks, date_chunks = [], [], []
for stock in stocks:
    print(">>>>>>stock: ", stock)
    try:
        df = get_stock_df(df_all, stock)
        # create labels and add them into the dataframe
        df = compute_labels(df)
        features, labels, dates = create_batch_feature(df)
        if np.isnan(features).any() or np.isnan(labels).any():
            print(f"NaN detected in {stock}")
            continue
        if np.isinf(features).any() or np.isinf(labels).any():
            print(f"INF detected in {stock}")
            continue
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C / kernel interrupts
        # still stop the loop, and report the cause instead of hiding it.
        print(f"Error in processing {stock}: {type(exc).__name__}: {exc}")
        continue
    feature_chunks.append(features)
    label_chunks.append(labels)
    date_chunks.append(dates)

# Without this guard an empty result would surface later as an opaque
# AttributeError on None.
if not feature_chunks:
    raise RuntimeError(
        "No stock produced usable features; see the per-stock messages above.")

all_features = np.concatenate(feature_chunks, axis=0)
all_labels = np.concatenate(label_chunks, axis=0)
all_dates = np.concatenate(date_chunks)
print("total # of data samples: ", all_features.shape[0])

>>>>>>stock:  HD
daily change percentile:  0.03
>>>>>>stock:  MTB
daily change percentile:  0.05
>>>>>>stock:  SLB
daily change percentile:  0.04
>>>>>>stock:  CZR
daily change percentile:  0.07
>>>>>>stock:  MELI
daily change percentile:  0.06
>>>>>>stock:  CHD
daily change percentile:  0.03
>>>>>>stock:  ULTA
daily change percentile:  0.05
>>>>>>stock:  DGX
daily change percentile:  0.04
>>>>>>stock:  PLTR
daily change percentile:  0.10
>>>>>>stock:  FDS
daily change percentile:  0.03
>>>>>>stock:  MAR
daily change percentile:  0.03
>>>>>>stock:  STE
daily change percentile:  0.03
>>>>>>stock:  CHRW
daily change percentile:  0.04
>>>>>>stock:  DOW
daily change percentile:  0.04
>>>>>>stock:  HLT
daily change percentile:  0.03
>>>>>>stock:  AMZN
daily change percentile:  0.04
>>>>>>stock:  ADI
daily change percentile:  0.04
>>>>>>stock:  CSX
daily change percentile:  0.03
>>>>>>stock:  EXE
daily change percentile:  0.04
>>>>>>stock:  K
daily change percentile:  0.03
>>>>>>stock:  KR
d

In [9]:
# Split the accumulated samples with the project helper; it returns three
# objects, unpacked here as the training loader, the test dataset and the
# test indices.
train_loader, test_dataset, idx_test = split_train_test_data(
    all_features,
    all_labels,
    batch_size=128,
)

sample size before: 131096, after: 131096
sample size before: 32774, after: 32774


In [15]:
# Wrap the label array in a DataFrame, one column per configured label name,
# so per-label statistics can be computed with pandas.
df_all_labels = pd.DataFrame(data=all_labels, columns=label_names)

In [23]:
# Print how often each distinct value occurs in every label column.
for col in df_all_labels:
    column_counts = df_all_labels[col].value_counts()
    print(f'Column: {col}')
    print(column_counts)

Column: trend_5days
0.0    64997
1.0    52419
2.0    46454
Name: trend_5days, dtype: int64
Column: trend_10days
0.0    93885
1.0    38195
2.0    31790
Name: trend_10days, dtype: int64
Column: trend_20days
0.0    116647
1.0     26942
2.0     20281
Name: trend_20days, dtype: int64
Column: trend_30days
0.0    124037
1.0     23459
2.0     16374
Name: trend_30days, dtype: int64
