# Clustering

- DATE: 2026-01-09
- DATA:
  * NAME : 2508150300_2512181300_3000x6x446.parquet
  * DATE : 2025-08-15 03:00 ~ 2025-12-18 13:00
  * COUNT : 446 TICKERS
  * CONTAINS : open, high, low, close, amount, volume
- OBJS: CRYPTO

In [1]:
# READ RESOURCE DATA
import pandas as pd
import os

file = '2508150300_2512181300_3000x6x446.parquet'
data = pd.read_parquet(
    os.path.join(os.getcwd(), f'archive/{file}'),
    engine='pyarrow'
)


In [2]:
# ADD INDICATORS
import pylabwons as lw

N_TICK = 5
N_WINDOW = 18

tester = lw.BackTester(data)
tester.calc_return(N_TICK)
tester.calc_log_return()
tester.add_typical_price()
tester.add_bollinger_band('close', window=20, std=2)
tester.add_macd('close', window_slow=26, window_fast=12, window_sign=9)
tester.add_average_true_range(window=10)
tester.add_volume_roc(window=7)
tester.add_obv_slope(window=12)
tester.add_rsi(window=9, offset=-50)

for col in [
    'open', 'high', 'low', 'close', 'tp',
    'amount', 'volume',
    'bb_lower', 'bb_lower_trend', 'bb_middle',
    'bb_upper', 'bb_upper_trend',
    'obv',
    'macd', 'macd_signal',
    'return5High', 'return5Low', 'return5Mid',
]:
    del tester[col]

test_stack = tester.stack(level=0, future_stack=True)
# tester
tester['KRW-BTC'].columns

  import pkg_resources


Index(['atr', 'bb_pct_b', 'bb_width', 'log_return', 'macd_diff', 'obv_slope',
       'return5', 'rsi', 'v_roc'],
      dtype='str')

In [3]:
# NORMALIZE, FEATURING
from sklearn.preprocessing import RobustScaler

answer = 'return5'
features = [col for col in test_stack.columns if not col == answer]
scaler = RobustScaler()
scaled = pd.DataFrame(
    columns=features,
    data=scaler.fit_transform(test_stack[features]),
    index=test_stack.index
) \
    .join(test_stack[[answer]]) \
    .dropna()

# scaled

In [4]:
from tqdm.auto import tqdm
from typing import Tuple
import numpy as np
import pandas as pd


def extract_windows(
        df:pd.DataFrame,
        ans_col:str,
        ans_val:float,
        window:int=18,
) -> Tuple[np.array, np.array, np.array]:
    """
    * Suggested by GOOGLE LLM
    :param      df : 정규화 완료된 dataframe (Architecture: Stacked; Index: (Time, Ticker))
    :param ans_col : 수익률 정답지 mask column 이름
    :param ans_val : 수익률 정답지 mask 조건 값 (>=)
    :param  window : 추출할 시간 길이 (18)
    """
    windows, final_targets, indices_for_verify = [], [], []
    indices = df[df[ans_col] >= ans_val].index
    # 대상(Ticker) 단위 그룹화
    grouped = df[[c for c in df.columns if c != ans_col]].groupby(level=1)  # Ticker 기준 그룹화


    for time, ticker in tqdm(indices):
        try:
            # 대상(ticker)의 전체 시계열 조회
            unit = grouped.get_group(ticker)

            # 현재 시점(time)의 위치(순번) 찾기
            curr = unit.index.get_loc((time, ticker))

            # 직전 {window} 데이터가 존재하는지 확인 (위치가 {window - 1}보다 커야 함)
            if curr >= window - 1:
                # {window} Sample 슬라이싱 (T-11 ~ T)
                block = unit.iloc[curr - (window - 1): curr + 1].values

                # [Option] 데이터 연속성 확인 (중간에 시간이 비어있는지 체크)
                # 만약 index가 정시 단위라면, 첫 행과 끝 행의 시간 차이가 {window - 1}인지 확인
                windows.append(block)
                final_targets.append(df.loc[(time, ticker), ans_col])
                indices_for_verify.append((time, ticker))
        except KeyError:
            continue

    return np.array(windows), np.array(final_targets), np.array(indices_for_verify)

x_train, y_train, idx = extract_windows(scaled, ans_col='return5', ans_val=0.05, window=N_WINDOW)
print(f"최종 추출된 윈도우 형태: {x_train.shape}")  # (N, 18, Feature_Count)

  0%|          | 0/26919 [00:00<?, ?it/s]

최종 추출된 윈도우 형태: (26721, 18, 8)


In [5]:
# DATA FLATTEN AND CLUSTERING
# CLUSTERING METHOD: UMAP(UNIFORM MANIFOLD APPROXIMATION AND PROJECTION) / KMEANS
# - 고차원 데이터 / 대량이므로 차원 축소 후 클러스터링 진행
# - 시간 소요 많음
from sklearn.cluster import KMeans
import umap

x_flat = x_train.reshape(x_train.shape[0], -1)

reducer = umap.UMAP(n_neighbors=30, min_dist=0.0, n_components=2, random_state=42)
x_embed = reducer.fit_transform(x_flat)

n_clusters = 8
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
clusters = kmeans.fit_predict(x_embed)

  warn(


In [6]:
from plotly.graph_objs import Figure, Scatter

fig = Figure()
fig.update_layout(
    template="plotly_dark",
    margin=dict(t=10, b=10, l=10, r=10),
    xaxis=dict(showline=True, linecolor='black'),
    yaxis=dict(showline=True, linecolor='black'),
)
fig.add_trace(
    Scatter(
        x=x_embed[:, 0],
        y=x_embed[:, 1],
        mode='markers',
        meta=clusters,
        customdata=[f'N={n} / {ticker}@{time}' for n, (time, ticker) in enumerate(idx)],
        marker=dict(
            size=5,
            color=clusters,
            colorscale='Viridis',
            showscale=True
        ),
        # hovertemplate='cluster: %{meta} @(%{x:.2f}, %{y:.2f})<extra></extra>'
        hovertemplate='#%{meta}: %{customdata}<extra></extra>'
    )
)
fig.show()

In [7]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# 1. 서브플롯 레이아웃 설정 (2행 4열)
n_rows = 2
n_cols = 4
fig = make_subplots(
    rows=n_rows, cols=n_cols,
    subplot_titles=[f"Cluster {i}" for i in range(n_clusters)],
    vertical_spacing=0.1
)

name = 'log_return'
for i in range(n_clusters):
    cluster_samples = x_train[clusters == i]
    avg_pattern = cluster_samples[:, :, scaled.columns.get_loc(name)].mean(axis=0)
    fig.add_trace(
        go.Scatter(
            y=avg_pattern,
            mode='lines+markers',
            name=f"Cluster {i}",
            line=dict(color='royalblue'),
            showlegend=False
        ),
        row=(i // n_cols) + 1, col=(i % n_cols) + 1
    )

    fig.layout.annotations[i].text = f"Cluster {i} [n={len(cluster_samples)}]"

fig.update_annotations(font_size=10)
fig.update_layout(
    template="plotly_dark",
    height=400,
    margin=dict(t=20, b=10, l=10, r=10),
    showlegend=False
)
fig.update_xaxes(
    zerolinecolor='lightgrey',
    zerolinewidth=1,
    tickfont=dict(size=8),
    gridcolor='lightgrey'
)
fig.update_yaxes(
    tickfont=dict(size=8),
    gridcolor='lightgrey'
)

fig.show()


In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# 1. 서브플롯 레이아웃 설정 (2행 4열)
user_features = ['log_return', 'bb_width', 'bb_pct_b',
                 'macd_diff', 'atr', 'obv_slope',
                 'rsi', 'v_roc']

n_rows = 2
n_cols = 4
fig = make_subplots(
    rows=n_rows, cols=n_cols,
    subplot_titles=user_features,
    vertical_spacing=0.1
)

n_cluster = 0
cluster = x_train[clusters == n_cluster]
for i, feature in enumerate(user_features):
    avg_pattern = cluster[:, :, scaled.columns.get_loc(feature)].mean(axis=0)
    if i == 0:
        # print(avg_pattern)
        print(100 * np.exp(np.cumsum(avg_pattern)))
    fig.add_trace(
        go.Scatter(
            y=avg_pattern,
            mode='lines+markers',
            line=dict(color='royalblue'),
            showlegend=False,
            hovertemplate='(%{x}, %{y})<extra></extra>'
        ),
        row=(i // n_cols) + 1, col=(i % n_cols) + 1
    )

fig.update_annotations(font_size=10)
fig.update_layout(
    template="plotly_dark",
    height=600,
    margin=dict(t=20, b=10, l=10, r=10),
    showlegend=False
)
fig.update_xaxes(
    zerolinecolor='lightgrey',
    zerolinewidth=1,
    tickfont=dict(size=8),
    gridcolor='lightgrey'
)
fig.update_yaxes(
    tickfont=dict(size=8),
    gridcolor='lightgrey'
)

fig.show()


[-0.03953428 -0.01393185 -0.01027834 -0.01579003 -0.04567368 -0.02147523
 -0.01662269  0.01205694  0.07683004  0.07202803  0.056295    0.03122872
 -0.03164423 -0.1373243  -0.03017624  0.05029605  0.13988368  0.17960537]
[ 96.12370046  94.79380527  93.82447226  92.35461602  88.23132041
  86.3567333   84.93311678  85.96334834  92.82825454  99.76117655
 105.53831895 108.88614778 105.49447562  91.95821453  89.22471221
  93.82713406 107.91434058 129.14594712]


In [28]:
N = 23769
time, ticker = idx[N]
print(f'{ticker}@{time}')

ohlcv = data[ticker]
n_target = ohlcv.index.get_loc(time)
x_start = ohlcv.index[n_target - N_WINDOW * 3]
x_window = ohlcv.index[n_target - (N_WINDOW - 1)]

tickerObj = lw.Ticker(ohlcv)
tickerObj.add_bollinger_band('close', window=20, std=2)
tickerObj.add_average_true_range(window=10)
tickerObj.add_rsi(window=9, offset=-50)

view = tickerObj.viewer
view.template = "plotly_dark"
view.margin = 10
# view.height = 700
view.update_layout(height=700)

view.add_row('bb_width')
# view.add_row('atr')
view.add_row('rsi')
view.auto_scale([x_start, time])

for n in range(len(view.data)):
    

    view.add_vrect(x0=x_window, x1=time, row=n+1, col=1, fillcolor="rgba(255, 255, 0, 0.1)", line_width=0)
view()

KRW-IOTA@2025-12-02T21:00:00
