In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore", UserWarning)

from scipy.stats import chi2_contingency
from sklearn.feature_selection import mutual_info_classif

# Switch the working directory to the parent folder. Later cells use the
# relative path "./data/dataset/..." and import project packages, so this is
# presumably the project root -- TODO confirm.
# NOTE(review): not idempotent -- re-running this cell moves up one more level.
os.chdir('..')

In [2]:
from data.utils import get_stock_df
from feature.label import compute_labels
from config.config import buy_sell_signals, label_names, feature_names

In [3]:
def load_data_file(csv_file):
    """Load a stock CSV file into a DataFrame indexed by its ``Date`` column.

    Parameters
    ----------
    csv_file : str or path-like
        Location of the CSV file. The file must contain a ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        The file contents with ``Date`` converted by ``pd.to_datetime`` and
        used as the index.

    Raises
    ------
    FileNotFoundError
        If ``csv_file`` does not exist. The message names the missing path.
    """
    if not os.path.exists(csv_file):
        # Include the path so the reader knows which file is missing.
        raise FileNotFoundError(
            f"{csv_file} not found. "
            "Please run data_fetcher.py to download the data first.")
    df = pd.read_csv(csv_file)
    df['Date'] = pd.to_datetime(df['Date'])
    # Return a new frame instead of mutating in place.
    return df.set_index('Date')

# Date range of the dataset; both bounds are embedded in the CSV file name.
start_date, end_date = "2023-04-01", "2025-03-31"
csv_file = (
    f"./data/dataset/stock_training_{start_date}_{end_date}.csv"
)
# Full dataset, indexed by date (see load_data_file).
df_all = load_data_file(csv_file)

In [4]:
stocks = df_all['stock'].unique()

# Columns kept for the association analysis: signals, labels and the ticker.
corr_columns = buy_sell_signals + label_names + ["stock"]

# Collect one frame per stock and concatenate once at the end; calling
# pd.concat inside the loop would re-copy the accumulated rows every iteration.
stock_frames = []
for stock in stocks:
    print(">>>>>>stock: ", stock)
    df = get_stock_df(df_all, stock)
    # create labels and add them into the dataframe
    df, daily_change_perc = compute_labels(df)
    stock_frames.append(df[corr_columns])

# Stack all stocks row-wise and drop rows with any missing value.
df_corr = pd.concat(stock_frames, axis=0).dropna()

>>>>>>stock:  HD
>>>>>>stock:  MTB
>>>>>>stock:  SLB
>>>>>>stock:  CZR
>>>>>>stock:  MELI
>>>>>>stock:  CHD
>>>>>>stock:  ULTA
>>>>>>stock:  DGX
>>>>>>stock:  PLTR
>>>>>>stock:  FDS
>>>>>>stock:  MAR
>>>>>>stock:  STE
>>>>>>stock:  CHRW
>>>>>>stock:  DOW
>>>>>>stock:  HLT
>>>>>>stock:  AMZN
>>>>>>stock:  ADI
>>>>>>stock:  CSX
>>>>>>stock:  EXE
>>>>>>stock:  K
>>>>>>stock:  KR
>>>>>>stock:  INTU
>>>>>>stock:  PYPL
>>>>>>stock:  ATO
>>>>>>stock:  MU
>>>>>>stock:  FRT
>>>>>>stock:  APD
>>>>>>stock:  DLR
>>>>>>stock:  ADBE
>>>>>>stock:  GEV
>>>>>>stock:  HUM
>>>>>>stock:  EBAY
>>>>>>stock:  WAT
>>>>>>stock:  SW
>>>>>>stock:  EMN
>>>>>>stock:  UHS
>>>>>>stock:  WST
>>>>>>stock:  APTV
>>>>>>stock:  ALGN
>>>>>>stock:  DUK
>>>>>>stock:  BX
>>>>>>stock:  BLK
>>>>>>stock:  DD
>>>>>>stock:  EG
>>>>>>stock:  AZO
>>>>>>stock:  EVRG
>>>>>>stock:  ELV
>>>>>>stock:  MNST
>>>>>>stock:  WSM
>>>>>>stock:  HSY
>>>>>>stock:  PCG
>>>>>>stock:  GILD
>>>>>>stock:  IR
>>>>>>stock:  GPC
>>>>>>stock:  VLTO
>>>>>

In [5]:
# Cramér’s V
def cramers_v(x, y):
    """Compute Cramér's V between two categorical variables.

    The statistic is ``sqrt((chi2 / n) / min(r - 1, k - 1))`` where ``chi2``
    is the Pearson chi-squared statistic of the ``r x k`` contingency table
    of ``x`` against ``y`` and ``n`` is the number of observations.

    Parameters
    ----------
    x, y : array-like or pandas.Series
        Equal-length categorical observations, cross-tabulated with
        ``pd.crosstab``.

    Returns
    -------
    float
        Cramér's V. ``nan`` is returned when the table has fewer than two
        rows or two columns (including empty input), because the statistic
        is undefined there.
    """
    table = pd.crosstab(x, y)
    n_rows, n_cols = table.shape
    min_dim = min(n_rows, n_cols) - 1
    if min_dim < 1:
        # A constant (or empty) variable has no measurable association; the
        # formula would divide by zero, so report "undefined" explicitly.
        return np.nan
    # Disable Yates' continuity correction: scipy applies it to 2x2 tables by
    # default, which would cap V below 1 even for perfectly associated data.
    chi2 = chi2_contingency(table, correction=False)[0]
    n_obs = table.to_numpy().sum()
    return np.sqrt(chi2 / n_obs / min_dim)

# Print the association of each buy/sell signal with the 10-day trend label.
trend_labels = df_corr["trend_10days"]
for col in buy_sell_signals:
    signal_values = df_corr[col]
    score = cramers_v(signal_values, trend_labels)
    print(f"{col}: {score:.3f}")


MA_5_20_Crossover_Signal: 0.156
MA_5_10_Crossover_Signal: 0.194
MA_5_50_Crossover_Signal: 0.116
MA_10_20_Crossover_Signal: 0.121
MA_20_50_Crossover_Signal: 0.070
MACD_Crossover_Signal: 0.196
VWAP_Crossover_Signal: 0.111


In [6]:
# Mutual Information
# Score every buy/sell signal against the 10-day trend label; all signal
# columns are passed as discrete features.
y = df_corr["trend_10days"]
X = df_corr[buy_sell_signals]

mi_scores = mutual_info_classif(X, y, discrete_features=True)

# Pair each score with its column name for printing.
for feature, score in pd.Series(mi_scores, index=X.columns).items():
    print(f"{feature}: {score:.3f}")

MA_5_20_Crossover_Signal: 0.012
MA_5_10_Crossover_Signal: 0.019
MA_5_50_Crossover_Signal: 0.007
MA_10_20_Crossover_Signal: 0.007
MA_20_50_Crossover_Signal: 0.003
MACD_Crossover_Signal: 0.018
VWAP_Crossover_Signal: 0.006


In [7]:
# Group Frequency Tables
# For each signal, show the share of each trend label per signal value
# (normalize="index" makes every row sum to 1).
trend_column = df_corr["trend_10days"]
for col in df_corr[buy_sell_signals].columns:
    row_shares = pd.crosstab(df_corr[col], trend_column, normalize="index")
    print(f"\n=== {col} ===")
    print(row_shares)



=== MA_5_20_Crossover_Signal ===
trend_10days                   0.0       1.0       2.0
MA_5_20_Crossover_Signal                              
-1                        0.854661  0.002534  0.142805
 0                        0.959012  0.023192  0.017796
 1                        0.823108  0.175606  0.001286

=== MA_5_10_Crossover_Signal ===
trend_10days                   0.0       1.0       2.0
MA_5_10_Crossover_Signal                              
-1                        0.866307  0.006124  0.127569
 0                        0.964336  0.019799  0.015865
 1                        0.824541  0.172142  0.003316

=== MA_5_50_Crossover_Signal ===
trend_10days                   0.0       1.0       2.0
MA_5_50_Crossover_Signal                              
-1                        0.856291  0.003311  0.140397
 0                        0.955748  0.024871  0.019381
 1                        0.817851  0.181157  0.000992

=== MA_10_20_Crossover_Signal ===
trend_10days                    0.0   