In [1]:
import pandas as pd
from os import path

from regimetry.config import Config
from regimetry.logger_manager import LoggerManager


In [2]:

# Logger for this notebook, obtained from the project's LoggerManager.
# NOTE(review): the variable name `logging` matches the standard-library module
# name; a later `import logging` in this kernel would rebind it. Consider `logger`.
logging = LoggerManager.get_logger("positional_encoding.ipynb")

In [3]:
# Instantiate the project configuration and echo the directories it resolves to,
# so the reader can see which artifact tree this run reads from and writes to.
cfg = Config()

for label, attribute in (
    ("PROJECT ROOT", "PROJECT_ROOT"),
    ("BASE DIR", "BASE_DIR"),
    ("RAW DATA", "RAW_DATA_DIR"),
    ("PROCESSED DATA", "PROCESSED_DATA_DIR"),
    ("EMBEDDINGS DATA", "EMBEDDINGS_DIR"),
    ("REPORTS DATA", "REPORTS_DIR"),
):
    print(f"{label}:", getattr(cfg, attribute))

PROJECT ROOT: /Users/kenneth/Public/projects/python/ai/regimetry
BASE DIR: /Users/kenneth/Public/projects/python/ai/regimetry/artifacts
RAW DATA: /Users/kenneth/Public/projects/python/ai/regimetry/artifacts/data/raw
PROCESSED DATA: /Users/kenneth/Public/projects/python/ai/regimetry/artifacts/data/processed
EMBEDDINGS DATA: /Users/kenneth/Public/projects/python/ai/regimetry/artifacts/embeddings
REPORTS DATA: /Users/kenneth/Public/projects/python/ai/regimetry/artifacts/reports


## 🧩 Step 1: Load + Compute Per-Cluster Means

### 📥 Load:

In [4]:
# Cluster assignments produced for the CAD_CHF / ws5 / learnable80 / nc12 run.
cluster_path = path.join(
    cfg.REPORTS_DIR, "CAD_CHF_ws5_learnable80_nc12", "cluster_assignments.csv"
)

# Read the file and normalise Cluster_ID in a single chain:
#   1. coerce to numeric (unparseable entries become NaN),
#   2. drop the rows where Cluster_ID is NaN,
#   3. store the remaining ids as integers.
df = (
    pd.read_csv(cluster_path)
    .assign(Cluster_ID=lambda frame: pd.to_numeric(frame["Cluster_ID"], errors="coerce"))
    .dropna(subset=["Cluster_ID"])
    .astype({"Cluster_ID": int})
)


### 📊 Group + Aggregate:

In [5]:
# Collect the names of every column stored with an object or categorical dtype;
# these are summarised by mode/dominance instead of mean/std.
categorical_view = df.select_dtypes(include=["object", "category"])
cat_columns = list(categorical_view.columns)
print("Categorical columns detected:", cat_columns)

Categorical columns detected: ['ML_Trade_Direction', 'ML_Signal_Quality', 'Prevailing_Trend', 'Baseline_Aligned', 'Trend_Agreement', 'Entry_Trigger', 'Entry_Confirmed', 'RHD_Bull', 'RHD_Bear']


In [6]:
# Show the categorical columns on their own (all rows, selected columns).
df.loc[:, cat_columns]

Unnamed: 0,ML_Trade_Direction,ML_Signal_Quality,Prevailing_Trend,Baseline_Aligned,Trend_Agreement,Entry_Trigger,Entry_Confirmed,RHD_Bull,RHD_Bear
4,Sell,Weak,Bearish,Flat,Flat,Flat,Flat,Strong,Strong
5,Buy,Strong,Flat,Flat,Flat,Flat,Flat,Strong,Strong
6,Flat,Moderate,Flat,Flat,Flat,Flat,Flat,Strong,Weak
7,Buy,Strong,Bullish,Flat,Flat,Flat,Flat,Strong,Strong
8,Flat,Weak,Bullish,Bullish,Bullish,Bullish,Flat,Strong,Strong
...,...,...,...,...,...,...,...,...,...
1869,Buy,Strong,Bullish,Flat,Flat,Flat,Flat,Strong,Strong
1870,Sell,Strong,Bullish,Bullish,Bullish,Bullish,Bullish,Strong,Strong
1871,Flat,Weak,Flat,Flat,Flat,Flat,Flat,Strong,Strong
1872,Sell,Strong,Flat,Flat,Flat,Flat,Flat,Weak,Strong


In [7]:
def _modal_value(values):
    """Return the most frequent non-null value of ``values``.

    Returns NaN when ``values`` has no non-null entries: ``value_counts()``
    drops missing values, so the counts are then empty and ``idxmax()``
    would raise ``ValueError``.
    """
    counts = values.value_counts()
    return counts.idxmax() if len(counts) else float("nan")


def _modal_share(values):
    """Return the fraction of non-null entries of ``values`` held by its most frequent value.

    Returns NaN when ``values`` has no non-null entries (``.iloc[0]`` on the
    empty result would otherwise raise ``IndexError``).
    """
    shares = values.value_counts(normalize=True)
    # value_counts sorts in descending order, so position 0 is the modal share.
    return shares.iloc[0] if len(shares) else float("nan")


# Categorical columns together with the grouping key.
cat_df = df[["Cluster_ID"] + cat_columns]
by_cluster = cat_df.groupby("Cluster_ID")

# <column>_mode: dominant label per cluster.
cat_modes = by_cluster.agg(_modal_value).add_suffix("_mode").reset_index()
# <column>_dominance: share of the cluster's non-null rows carrying that label.
cat_dominance = by_cluster.agg(_modal_share).add_suffix("_dominance").reset_index()

# One row per cluster: all *_mode columns followed by all *_dominance columns.
cat_summary = cat_modes.merge(cat_dominance, on="Cluster_ID")

In [8]:
# Display the per-cluster categorical summary (one *_mode and one *_dominance column per categorical feature).
cat_summary

Unnamed: 0,Cluster_ID,ML_Trade_Direction_mode,ML_Signal_Quality_mode,Prevailing_Trend_mode,Baseline_Aligned_mode,Trend_Agreement_mode,Entry_Trigger_mode,Entry_Confirmed_mode,RHD_Bull_mode,RHD_Bear_mode,ML_Trade_Direction_dominance,ML_Signal_Quality_dominance,Prevailing_Trend_dominance,Baseline_Aligned_dominance,Trend_Agreement_dominance,Entry_Trigger_dominance,Entry_Confirmed_dominance,RHD_Bull_dominance,RHD_Bear_dominance
0,0,Flat,Strong,Flat,Flat,Flat,Flat,Flat,Strong,Strong,0.448161,0.431438,0.488294,0.668896,0.769231,0.83612,0.963211,0.866221,0.9699
1,1,Flat,Weak,Flat,Flat,Flat,Flat,Flat,Strong,Strong,0.48374,0.414634,0.556911,0.747967,0.813008,0.849593,0.96748,0.894309,0.939024
2,2,Sell,Weak,Flat,Flat,Flat,Flat,Flat,Strong,Strong,0.347826,0.478261,0.434783,0.869565,0.913043,0.913043,0.956522,0.826087,0.956522
3,3,Flat,Strong,Flat,Flat,Flat,Flat,Flat,Strong,Strong,0.444915,0.444915,0.563559,0.822034,0.872881,0.851695,0.961864,0.90678,0.932203
4,4,Flat,Strong,Flat,Flat,Flat,Flat,Flat,Strong,Strong,0.443983,0.460581,0.593361,0.751037,0.804979,0.958506,0.987552,0.995851,0.829876
5,5,Flat,Weak,Flat,Flat,Flat,Flat,Flat,Strong,Strong,0.455882,0.411765,0.566176,0.875,0.926471,0.955882,0.985294,0.926471,0.977941
6,6,Flat,Weak,Flat,Flat,Flat,Flat,Flat,Strong,Strong,0.4,0.43,0.51,0.72,0.81,0.88,1.0,0.96,0.86
7,7,Flat,Strong,Flat,Flat,Flat,Flat,Flat,Strong,Strong,0.366667,0.477778,0.577778,0.755556,0.822222,0.883333,0.95,0.888889,0.95
8,8,Flat,Weak,Bullish,Flat,Flat,Flat,Flat,Strong,Strong,0.454545,0.454545,0.454545,0.636364,0.818182,0.818182,1.0,0.818182,1.0
9,9,Flat,Weak,Flat,Flat,Flat,Flat,Flat,Strong,Strong,0.375,0.5,0.625,0.666667,0.875,0.875,0.958333,0.875,0.958333


In [9]:
# Names of the columns whose dtype is strictly bool.
bool_columns = list(df.select_dtypes(include=["bool"]).columns)

# Report what was found (may be an empty list).
print("Boolean columns detected:", bool_columns)

# Cast the flags to float (True -> 1.0, False -> 0.0) so they are picked up by
# the numeric aggregation: the per-cluster mean of such a column is then the
# fraction of rows in the cluster where the flag was set.
df[bool_columns] = df[bool_columns].astype(float)



Boolean columns detected: []


In [10]:
# Restrict to numeric columns; Cluster_ID is an integer column, so the grouping
# key is kept alongside any bool columns that were cast to float.
df_numeric = df.select_dtypes(include="number")

# Per-cluster mean and standard deviation of every numeric column, rounded to 4 decimals.
cluster_summary = (
    df_numeric
    .groupby("Cluster_ID")
    .agg(["mean", "std"])
    .round(4)
)

# Flatten the (column, statistic) MultiIndex into "<column>_<statistic>" names.
cluster_summary.columns = ["_".join(pair).strip() for pair in cluster_summary.columns]

# Turn Cluster_ID back into a regular column so it can be used as a merge key.
cluster_summary = cluster_summary.reset_index()

In [11]:
# Display the per-cluster mean/std table computed for the numeric columns.
cluster_summary

Unnamed: 0,Cluster_ID,Open_mean,Open_std,High_mean,High_std,Low_mean,Low_std,Close_mean,Close_std,Volume_mean,...,Classic_Divergence_Bull_mean,Classic_Divergence_Bull_std,Classic_Divergence_Bear_mean,Classic_Divergence_Bear_std,Day_Of_Week_mean,Day_Of_Week_std,Month_mean,Month_std,Year_mean,Year_std
0,0,0.7444,0.0133,0.7469,0.0131,0.7419,0.0134,0.7445,0.0133,23815.0234,...,0.1672,0.3738,0.0334,0.1801,2.4013,2.0397,7.1806,3.3729,2019.5251,1.2778
1,1,0.6398,0.0171,0.642,0.017,0.6374,0.0173,0.6397,0.0172,40736.1585,...,0.1382,0.3458,0.061,0.2398,2.3943,2.0906,8.1423,3.3734,2023.7114,0.6955
2,2,0.7297,0.0044,0.7326,0.0043,0.7269,0.0045,0.7296,0.0045,128983.5217,...,0.1739,0.3876,0.1304,0.3444,2.2174,2.044,10.3043,0.4705,2022.0,0.0
3,3,0.7466,0.014,0.7491,0.0139,0.744,0.0139,0.7466,0.0139,24610.6822,...,0.0932,0.2914,0.1356,0.3431,2.4576,2.1304,5.7712,3.2444,2019.678,1.3738
4,4,0.743,0.0138,0.7455,0.0137,0.74,0.014,0.7426,0.0139,29557.0747,...,0.0083,0.0909,0.1701,0.3765,2.3776,2.0255,6.2282,3.3244,2019.6349,1.3962
5,5,0.6508,0.0177,0.6529,0.0176,0.6491,0.0176,0.6512,0.0176,36331.0588,...,0.0735,0.262,0.0368,0.1889,2.4412,2.0322,4.6691,3.0961,2023.9632,0.6712
6,6,0.7491,0.0146,0.7524,0.0144,0.7455,0.0139,0.7489,0.0143,98109.33,...,0.06,0.2387,0.15,0.3589,2.4,2.0695,5.39,1.9535,2022.0,0.0
7,7,0.6931,0.007,0.6955,0.0072,0.6906,0.0068,0.6931,0.0072,18749.7556,...,0.1222,0.3285,0.0611,0.2402,2.4333,2.077,7.8722,3.3006,2020.25,0.6591
8,8,0.7212,0.0047,0.725,0.004,0.7172,0.0038,0.7215,0.0049,137175.0,...,0.1818,0.4045,0.0,0.0,2.7273,2.3277,9.5455,0.5222,2022.0,0.0
9,9,0.5955,0.0051,0.5985,0.0046,0.5927,0.0049,0.596,0.0048,43563.1667,...,0.125,0.3378,0.0833,0.2823,2.25,2.0054,4.4583,0.509,2025.0,0.0


In [12]:
# Join the numeric statistics with the categorical mode/dominance columns,
# matching rows on Cluster_ID (default inner join).
full_cluster_profile = pd.merge(cluster_summary, cat_summary, on="Cluster_ID")

In [13]:
# Order the profile by ascending cluster id so rows are easy to look up.
full_cluster_profile = full_cluster_profile.sort_values(by="Cluster_ID")

# Show the first five clusters as a quick sanity check.
full_cluster_profile.head(5)

Unnamed: 0,Cluster_ID,Open_mean,Open_std,High_mean,High_std,Low_mean,Low_std,Close_mean,Close_std,Volume_mean,...,RHD_Bear_mode,ML_Trade_Direction_dominance,ML_Signal_Quality_dominance,Prevailing_Trend_dominance,Baseline_Aligned_dominance,Trend_Agreement_dominance,Entry_Trigger_dominance,Entry_Confirmed_dominance,RHD_Bull_dominance,RHD_Bear_dominance
0,0,0.7444,0.0133,0.7469,0.0131,0.7419,0.0134,0.7445,0.0133,23815.0234,...,Strong,0.448161,0.431438,0.488294,0.668896,0.769231,0.83612,0.963211,0.866221,0.9699
1,1,0.6398,0.0171,0.642,0.017,0.6374,0.0173,0.6397,0.0172,40736.1585,...,Strong,0.48374,0.414634,0.556911,0.747967,0.813008,0.849593,0.96748,0.894309,0.939024
2,2,0.7297,0.0044,0.7326,0.0043,0.7269,0.0045,0.7296,0.0045,128983.5217,...,Strong,0.347826,0.478261,0.434783,0.869565,0.913043,0.913043,0.956522,0.826087,0.956522
3,3,0.7466,0.014,0.7491,0.0139,0.744,0.0139,0.7466,0.0139,24610.6822,...,Strong,0.444915,0.444915,0.563559,0.822034,0.872881,0.851695,0.961864,0.90678,0.932203
4,4,0.743,0.0138,0.7455,0.0137,0.74,0.014,0.7426,0.0139,29557.0747,...,Strong,0.443983,0.460581,0.593361,0.751037,0.804979,0.958506,0.987552,0.995851,0.829876


In [14]:
# Persist the merged profile next to the cluster assignments it was derived from,
# instead of an extension-less file in whatever the kernel's working directory is.
profile_path = path.join(path.dirname(cluster_path), "full_cluster_profile.csv")

# index=False: Cluster_ID is already a regular column, so the row index is
# redundant and would otherwise be written as an extra unnamed first column.
full_cluster_profile.to_csv(profile_path, index=False)