# Clustering

- DATE: 2026-01-05 ~ 2026-01-09
- DATA: 2508150300_2512181300_3000x6x446.parquet
- OBJS: CRYPTO

In [None]:
# LOAD SOURCE DATA
# Reads the parquet snapshot from ./archive, resolved against the current
# working directory, using the pyarrow engine.
import pandas as pd
import os

file = '2508150300_2512181300_3000x6x446.parquet'
archive_path = os.path.join(os.getcwd(), f'archive/{file}')
data = pd.read_parquet(archive_path, engine='pyarrow')

# Bare expression: shown as the cell output when run in a notebook.
data

In [None]:
# ADD INDICATORS
# Wraps the raw frame in a pylabwons BackTester and attaches the return
# column(s) plus a set of technical indicators, then stacks the result.
# NOTE(review): the semantics of every BackTester method below come from
# pylabwons, which is not visible here; the descriptions are inferred from the
# method names and from how later cells use the output. Confirm against the
# library before relying on them.
import pylabwons as lw

tester = lw.BackTester(data)
# Presumably adds return columns over a 5-step horizon; later cells select
# columns by the 'return' prefix and use 'return5High' as the target.
tester.calc_return(5)
# Indicator calls; the keyword arguments are the look-back settings passed to
# pylabwons. Keep the call order: it presumably determines the column order of
# the stacked frame, which later cells index positionally -- TODO confirm.
tester.add_bollinger_band('close', window=20, std=2)
tester.add_macd('close', window_slow=26, window_fast=12, window_sign=9)
tester.add_average_true_range(window=10)
tester.add_volume_roc(window=7)
tester.add_obv_slope(window=12)
tester.add_rsi(window=9)
# Presumably reshapes to a long frame indexed by (time, ticker), which is the
# layout extract_windows documents as its input -- TODO confirm.
test_stack = tester.stack(level=0, future_stack=True)

In [None]:
# SCALE FEATURES, RE-ATTACH RETURNS
# Every column whose name does not start with 'return' is treated as a feature
# and passed through RobustScaler; the 'return*' columns are joined back
# unscaled. Rows containing any NaN are dropped at the end.
from sklearn.preprocessing import RobustScaler

return_cols = [col for col in test_stack.columns if col.startswith('return')]
features = [col for col in test_stack.columns if not col.startswith('return')]

scaler = RobustScaler()
scaled_values = scaler.fit_transform(test_stack[features])

scaled = pd.DataFrame(data=scaled_values, index=test_stack.index, columns=features)
scaled = scaled.join(test_stack[return_cols])
scaled = scaled.dropna()

# Bare expression: shown as the cell output when run in a notebook.
scaled

In [None]:
from tqdm.auto import tqdm
from typing import Tuple
import numpy as np
import pandas as pd


def extract_windows(
        df: pd.DataFrame,
        ans_col: str,
        ans_val: float,
        window: int = 12,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Collect fixed-length look-back windows ending at every row whose answer
    value meets a threshold.

    For each row where ``df[ans_col] >= ans_val``, the ``window`` consecutive
    rows of the same ticker ending at that row (T-(window-1) .. T) are taken
    from all columns except ``ans_col``. Rows that do not have ``window - 1``
    earlier rows for their ticker are skipped.

    (Approach originally suggested by a Google LLM.)

    NOTE: windows are positional. Gaps in the time axis (for example rows that
    were dropped upstream) are not detected. If the index is regularly spaced,
    continuity could be verified by checking that the first and last
    timestamps of a block are ``window - 1`` steps apart.

    :param      df : normalized, stacked dataframe; index is (time, ticker)
    :param ans_col : name of the answer (return) column used as the mask
    :param ans_val : threshold for the mask; rows with ans_col >= ans_val are kept
    :param  window : number of time steps per extracted window (default 12)
    :return        : (windows, targets) where windows has shape
                     (N, window, n_features) and targets has shape (N,).
                     With no qualifying rows, windows has shape
                     (0, window, n_features).
    :raises ValueError: if window is smaller than 1
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")

    # The progress bar is cosmetic; fall back to plain iteration without tqdm.
    try:
        from tqdm.auto import tqdm as progress
    except ImportError:
        def progress(iterable, **_kwargs):
            return iterable

    feature_cols = [c for c in df.columns if c != ans_col]
    grouped = df[feature_cols].groupby(level=1)  # one group per ticker

    # Qualifying rows with their answer values, in the original row order.
    hits = df.loc[df[ans_col] >= ans_val, ans_col]

    # ticker -> (index, values) of that ticker's full series, or None when the
    # ticker has no group. Fetched once per ticker instead of once per hit.
    series_cache = {}
    windows, final_targets = [], []

    for (time, ticker), target in progress(hits.items(), total=len(hits)):
        if ticker not in series_cache:
            try:
                unit = grouped.get_group(ticker)
            except KeyError:
                series_cache[ticker] = None
            else:
                series_cache[ticker] = (unit.index, unit.to_numpy())

        cached = series_cache[ticker]
        if cached is None:
            continue
        unit_index, unit_values = cached

        # Position of the current time step within the ticker's series.
        try:
            curr = unit_index.get_loc((time, ticker))
        except KeyError:
            continue

        # Need window - 1 earlier rows before the current one.
        if curr < window - 1:
            continue

        windows.append(unit_values[curr - (window - 1): curr + 1])
        final_targets.append(target)

    if not windows:
        # Keep the trailing dimensions so callers can still read .shape[1:].
        return np.empty((0, window, len(feature_cols))), np.empty((0,))

    return np.array(windows), np.array(final_targets)

# Build the clustering input: 12-step windows ending at rows where
# 'return5High' is at least 0.05.
x_train, y_train = extract_windows(
    scaled,
    ans_col='return5High',
    ans_val=0.05,
    window=12,
)
print(f"최종 추출된 윈도우 형태: {x_train.shape}")  # final window array shape: (N, 12, Feature_Count)

In [None]:
# FLATTEN WINDOWS, EMBED, CLUSTER
# Method: UMAP (Uniform Manifold Approximation and Projection) followed by KMeans.
# - The windows are high-dimensional and numerous, so the dimensionality is
#   reduced first and clustering is run on the embedding.
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import umap

# One row per window: (N, window, features) -> (N, window * features).
x_flat = x_train.reshape(len(x_train), -1)

reducer = umap.UMAP(
    n_neighbors=30,
    min_dist=0.0,
    n_components=2,
    random_state=42,
)
x_embed = reducer.fit_transform(x_flat)

n_clusters = 8
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
clusters = kmeans.fit_predict(x_embed)

# Scatter of the 2-D embedding, coloured by cluster label.
embed_x, embed_y = x_embed[:, 0], x_embed[:, 1]
plt.figure()
plt.scatter(embed_x, embed_y, c=clusters, cmap='Spectral', s=5)
plt.colorbar()
plt.show()

In [None]:
import math

import matplotlib.pyplot as plt

# x_train: (N, 12, Features); clusters: KMeans label for each window.
# Choose the indicator to inspect here. @name = {feature name}
name = 'bb_width'

# NOTE: x_train's feature axis is scaled.columns with the answer column
# removed, so this positional lookup is only valid while `name` comes before
# that column in scaled.columns.
feature_idx = scaled.columns.get_loc(name)

# Size the grid from n_clusters instead of assuming exactly 8 panels
# (8 clusters still gives the original 2x4 grid at 20x10).
n_cols = 4
n_rows = max(1, math.ceil(n_clusters / n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False)
axes = axes.flatten()

for i in range(n_clusters):
    # Windows assigned to cluster i.
    cluster_samples = x_train[clusters == i]

    # Mean of the selected feature at each time step across those windows.
    avg_pattern = cluster_samples[:, :, feature_idx].mean(axis=0)

    axes[i].plot(avg_pattern, marker='o', color='blue')
    axes[i].set_title(f"Cluster {i} Pattern (n={len(cluster_samples)})")
    axes[i].grid(True)

# Hide panels left over when n_clusters does not fill the grid.
for unused_ax in axes[n_clusters:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()