In [1]:
from tslearn.clustering import TimeSeriesKMeans
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import mplfinance as mpf
from joblib import dump, load
from tqdm import tqdm
import pandas as pd
import numpy as np
import os

# Run configuration read by the cells below.
year = "2000"        # interpolated into the input and output directory paths
n_clusters = 60      # passed to TimeSeriesKMeans as the number of clusters
batch_size = 1_000   # number of series fitted per batch in the batched loop

In [2]:
# Input directory with the CSV files for the configured year, plus the output
# directories for fitted models and their charts. All three are created if
# they do not exist yet.
k_dir = rf'/data/stock_csv_data_by_year/5-2/{year}/'
k_means_dir = rf'/data/k_means/5-2/{n_clusters}-{batch_size}/{year}/'
k_means_chart_dir = rf'/data/k_means/5-2/{n_clusters}-{batch_size}/{year}/chart/'
os.makedirs(k_dir, exist_ok=True)
os.makedirs(k_means_dir, exist_ok=True)
os.makedirs(k_means_chart_dir, exist_ok=True)

# os.listdir() yields entries in arbitrary order. Sort the paths so the order
# of the collected samples (and of the index-based batches built from them
# later) is the same on every run and every machine.
csv_files = sorted(os.path.join(k_dir, f) for f in os.listdir(k_dir) if f.endswith('.csv'))
time_series_data = []
len(csv_files)

1057

In [4]:
# Build one sample per (file, trading day): the day's OHLC bars expressed as
# percentage change relative to the previous day's last close.

# Start from an empty list so re-running this cell never appends duplicates
# on top of the results of an earlier run.
time_series_data = []
price_cols = ['open', 'high', 'low', 'close']
bars_per_day = 48  # only days with exactly this many bars are kept (presumably a full session of 5-minute bars; confirm)

with tqdm(total=len(csv_files)) as progress_bar:
    for csv_file in csv_files:
        progress_bar.set_postfix({'csv_file': csv_file,})
        df = pd.read_csv(csv_file, usecols=['time', 'open', 'high', 'low', 'close', 'volume'])

        # Discard bars without any traded volume; skip files with nothing left.
        df = df[df['volume'] != 0]
        if df.empty:
            progress_bar.update(1)
            continue

        # Work on an explicit copy so the column assignment below does not
        # write into a slice of the filtered frame.
        df = df[['time'] + price_cols].copy()
        df['time'] = pd.to_datetime(df['time'], format='%Y%m%d%H%M%S%f')

        # Sort chronologically (stable, so equal timestamps keep file order)
        # and group by calendar day. The last row of each group must be the
        # day's final bar for the "previous close" logic to be correct.
        df = df.set_index('time').sort_index(kind='mergesort')
        grouped = df.groupby(df.index.date)

        prev_close = None  # raw (un-normalised) last close of the previous day
        for group_date, group_data in grouped:
            # Capture the raw close BEFORE normalising. The original code
            # overwrote group_data in place and then read the already
            # normalised close, so every later day was divided by a ratio
            # instead of a price.
            raw_close = group_data['close'].iloc[-1]
            if prev_close is not None and prev_close != 0:
                # Percentage change relative to the previous day's close.
                pct_change = group_data[price_cols] / prev_close - 1
                time_series_data.append(pct_change.values)
            prev_close = raw_close

        progress_bar.update(1)

# Keep only complete days that contain no NaN or infinite values.
time_series_data = [
    ts for ts in time_series_data
    if ts.shape[0] == bars_per_day and not np.isnan(ts).any() and not np.isinf(ts).any()
]

100%|██████████| 1057/1057 [09:15<00:00,  1.90it/s, csv_file=/data/stock_csv_data_by_year/5-2/2000/sz.001896_2000.csv]


In [5]:
# Normalise the time series data (target mean 0, standard deviation 1),
# then persist both the scaler and the scaled data next to the models.
scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
time_series_data_scaled = scaler.fit_transform(time_series_data)

scaler_path = os.path.join(k_means_dir, "scaler.joblib")
scaled_data_path = os.path.join(k_means_dir, "time_series_data_scaled.joblib")
dump(scaler, scaler_path)
dump(time_series_data_scaled, scaled_data_path)

['/data/k_means/5-2/60-1000/2000/time_series_data_scaled.joblib']

In [3]:
# Reload the scaled series from disk so the cells below can run without
# repeating the CSV loading and scaling steps.
scaled_data_path = os.path.join(k_means_dir, "time_series_data_scaled.joblib")
time_series_data_scaled = load(scaled_data_path)

In [4]:
# Fit a single DTW k-means model on the full scaled data set and persist it.
# k-means initialisation is randomised, so random_state is pinned to make the
# resulting clusters reproducible across kernel restarts.
model = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw", verbose=True, n_jobs=-1, random_state=0)
model.fit(time_series_data_scaled)
dump(model, os.path.join(k_means_dir, f"cluster_model_{n_clusters}_all.joblib"))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 333 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 616 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 981 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 1426 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 1953 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 2560 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 3249 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 4018 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 4869 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 5800 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 6813 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 7906 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 9081 tasks   

In [None]:
# Fit one independent DTW k-means model per consecutive batch of `batch_size`
# series and persist each model under its batch index.
total = len(time_series_data_scaled)
for i in range(0, total, batch_size):
    # Clamp the upper bound so the progress line never reports indices past
    # the end of the data on the final, shorter batch.
    stop = min(i + batch_size, total)
    print(f"{i} - {stop} / {total}")
    batch = time_series_data_scaled[i:stop]

    # The trailing batch can hold fewer series than clusters, in which case
    # n_clusters non-empty clusters cannot be formed. Skip it explicitly
    # instead of failing (or fitting a degenerate model) at the very end of a
    # long run.
    if len(batch) < n_clusters:
        print(f"skipping batch {i//batch_size}: {len(batch)} series < {n_clusters} clusters")
        continue

    # random_state pinned so each batch model is reproducible.
    model = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw", verbose=2, n_jobs=-1, random_state=0)
    model.fit(batch)

    dump(model, os.path.join(k_means_dir, f"cluster_model_{i//batch_size}_{n_clusters}_{batch_size}.joblib"))

In [47]:
# NOTE(review): disabled plotting cell, kept for reference. It renders each
# cluster centre as a candlestick chart and saves it under k_means_chart_dir.
# It references `second_level_model2`, which is not defined anywhere in the
# visible part of this notebook — define or load that model before re-enabling.
# with tqdm(total=len(range(len(second_level_model2.cluster_centers_)))) as progress_bar:
#     for i in range(len(second_level_model2.cluster_centers_)):
#         df = pd.DataFrame(second_level_model2.cluster_centers_[i], columns=['open', 'high', 'low', 'close'])
#         df['Date'] = pd.date_range(start='1/1/2000', periods=48, freq='B')
#         df['volume'] = 0
#         df = df.set_index('Date')
#         mpf.plot(df, type='candle', style='charles', volume=True, figsize=(24, 16), savefig=dict(fname=os.path.join(k_means_chart_dir, f"{i}.png"), dpi=100))
#         progress_bar.update(1)

  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_yli