In [None]:
# Use the %pip magic rather than !pip: %pip installs into the environment of
# the running kernel, whereas !pip runs whichever `pip` is first on the shell
# PATH, which may belong to a different interpreter.
%pip install python-weka-wrapper3
%pip install fastdtw
%pip install tslearn
%pip install torch
%pip install joblib
%pip install pandas
%pip install numpy
%pip install h5py
%pip install dtaidistance
%pip install matplotlib

In [None]:
%cd /JISC-Net/jiscnet

In [None]:
import pickle


# --- Experiment selection: these values are spliced into the artifact file name.
variable = 'cvr'   # e.g. c, v, r, cv, ...
n_cluster = '6'
init = 'trad'      # 'kmpp' or 'trad'
trad = '6'         # reset to '' below when init == 'kmpp'

if init == 'kmpp':
    trad = ''

base_path = '/SIMPC/res/BTC'
file_name = f'simpc_{variable}_{n_cluster}_18_22_30_BTC_{trad}_k{n_cluster}_l18-22_dba_{init}'


def _load_artifact(suffix):
    """Unpickle ``{base_path}/{file_name}_{suffix}.pkl`` and return the stored object."""
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only load artifacts that were produced by a trusted run.
    with open(f'{base_path}/{file_name}_{suffix}.pkl', 'rb') as f:
        return pickle.load(f)


# Cluster centroids; the recorded run printed shape (6, 22, 3).
centroid = _load_artifact('centroids')
print('centroids shape: ', centroid.shape)

# One cluster label per subsequence; the recorded run had labels {0, ..., 5}.
labels = _load_artifact('labels')
cluster_num = len(set(labels))
print('cluster_num: ', cluster_num)

# Segmentation; the recorded run printed shape (566,).
# Per the original note these are segment start indices -- TODO confirm.
segmentation = _load_artifact('segmentation')
print('segmentation shape: ', segmentation.shape)

# Multivariate time-series subsequences, one entry per segment;
# the recorded run printed shape (566,).
subsequences = _load_artifact('subsequences')
print('subsequence shape: ', subsequences.shape)

centroids shape:  (6, 22, 3)
cluster_num:  6
segmentation shape:  (566,)
subsequence shape:  (566,)


In [4]:
# Interpolation and 0-1 (min-max) normalization

import numpy as np
import pandas as pd
from scipy.interpolate import interp1d


def interpolate_normalize_subsequences(subsequences, target_length=100):
    """
    Linearly resample every series to ``target_length`` points and min-max
    scale each resampled series to the range [0, 1].

    Every (instance, variable) series is treated independently: it is placed
    on an evenly spaced grid over [0, 1], linearly interpolated onto a new
    evenly spaced grid of ``target_length`` points, and then rescaled with its
    own minimum and maximum. A constant series (including a series with a
    single sample) becomes all zeros.

    :param subsequences: sequence of instances; ``subsequences[i][j]`` is the
        1-D series of variable ``j`` in instance ``i``. Series lengths may
        differ; the number of variables is taken from the first instance.
    :param target_length: number of points after resampling (default 100).
    :return: numpy array of shape ``(num_instances, num_variables, target_length)``;
        ``(0, 0, target_length)`` when ``subsequences`` is empty.
    :raises ValueError: if ``target_length`` is smaller than 1 or a series is empty.
    """
    if target_length < 1:
        raise ValueError("target_length must be a positive integer")

    num_instances = len(subsequences)
    print('num_instances', num_instances)
    # Guard the lookup of the first instance so empty input yields an empty result.
    num_variables = len(subsequences[0]) if num_instances else 0
    print('num_variables',num_variables)

    # Output buffer for the resampled, normalized series.
    interpolated_data = np.zeros((num_instances, num_variables, target_length))

    # The target grid does not depend on the series, so build it once.
    x_new = np.linspace(0, 1, target_length)

    for i in range(num_instances):
        for j in range(num_variables):
            series = np.asarray(subsequences[i][j], dtype=float)
            original_length = len(series)
            if original_length == 0:
                raise ValueError(
                    f"series for instance {i}, variable {j} is empty and cannot be interpolated"
                )

            # Source grid: the original samples spread evenly over [0, 1].
            x_old = np.linspace(0, 1, original_length)

            # np.interp does piecewise-linear interpolation and, unlike
            # scipy's interp1d, also accepts a single-sample series.
            interpolated_series = np.interp(x_new, x_old, series)

            # Min-max scaling; a flat series has zero range and maps to zeros.
            min_val = np.min(interpolated_series)
            value_range = np.max(interpolated_series) - min_val
            if value_range == 0:
                normalized_series = np.zeros_like(interpolated_series)
            else:
                normalized_series = (interpolated_series - min_val) / value_range

            interpolated_data[i, j] = normalized_series
    return interpolated_data


# Number of points every subsequence is resampled to.
target_length = 100

# Transpose each stored subsequence so that indexing it yields one series per
# variable, which is the layout interpolate_normalize_subsequences iterates over.
subsequences_list = [segment.T for segment in subsequences]

interpolated_result = interpolate_normalize_subsequences(
    subsequences_list,
    target_length=target_length,
)
# Result is (num_instances, num_variables, target_length).
print("Interpolated Data Shape:", interpolated_result.shape)

num_instances 566
num_variables 3
Interpolated Data Shape: (566, 3, 100)


In [5]:
#test/train split
from sklearn.model_selection import train_test_split

# Put each label next to its resampled subsequence so both are split together.
data = {'Label': labels, 'Subsequence': pd.Series(interpolated_result.tolist())}
df = pd.DataFrame(data)

# 80/20 split, stratified on the label column, with a fixed seed.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Label'],
)

# Check the result: absolute per-label counts in each partition.
print("Train label distribution:\n", train_df['Label'].value_counts())
print("Test label distribution:\n", test_df['Label'].value_counts())

# Convert the partitions back into plain numpy arrays.
train = np.array(train_df['Subsequence'].tolist())
test = np.array(test_df['Subsequence'].tolist())
train_labels = train_df['Label'].to_numpy()
test_labels = test_df['Label'].to_numpy()

Train label distribution:
 Label
1    93
0    85
5    73
3    69
4    66
2    66
Name: count, dtype: int64
Test label distribution:
 Label
1    24
0    21
3    18
5    18
4    17
2    16
Name: count, dtype: int64


In [6]:
# Sanity check: report the shape of each partition and of its label vector.
print(
    "\n".join(
        f"Shape of {label}: {array.shape}"
        for label, array in (
            ("train", train),
            ("train_labels", train_labels),
            ("test", test),
            ("test_labels", test_labels),
        )
    )
)

Shape of train: (452, 3, 100)
Shape of train_labels: (452,)
Shape of test: (114, 3, 100)
Shape of test_labels: (114,)


In [None]:
!nvidia-smi

In [None]:
import os
import json
import torch
import timeit
import numpy as np
import wrappers
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Settings are assigned directly here instead of being parsed from command-line arguments
class Args:
    """Run settings, written out by hand in place of parsed command-line options."""

    # Path where the model is saved.
    save_path = '/JISC-Net/jiscnet/result/BTC_6_out64'

    # Path of the hyperparameter file.
    hyper = '/JISC-Net/jiscnet/default_parameters.json'

    # Whether to load a model.
    load = False

    # Whether to train the classifier.
    fit_classifier = True



def fit_parameters(file, train, train_labels, test, test_labels, save_path, cluster_num,
                        save_memory=False):
    """
    Creates a classifier from the set of hyperparameters stored in the input
    file, fits it and returns the result of the fit.

    The hyperparameters are read from ``file``, the ``in_channels`` entry is
    overwritten with ``train.shape[1]``, and the resulting dictionary is
    passed to the classifier's ``set_params`` before calling ``fit``.

    @param file Path of a JSON file containing a set of hyperparameters.
    @param train Training set; ``train.shape[1]`` is used as the number of
           input channels (variables).
    @param train_labels Labels for the training set.
    @param test Test set, forwarded unchanged to the classifier's ``fit``.
    @param test_labels Labels for the test set, forwarded unchanged to ``fit``.
    @param save_path Save location, forwarded unchanged to ``fit``.
    @param cluster_num Number of clusters, forwarded unchanged to ``fit``.
    @param save_memory Forwarded to ``fit``. Per the original documentation:
           if True, save GPU memory by propagating gradients after each loss
           term, instead of doing it after computing the whole loss.
    @return Whatever ``CausalCNNEncoderClassifier.fit`` returns.
    """
    classifier = wrappers.CausalCNNEncoderClassifier()

    # Load the hyperparameters; the context manager closes the file even if
    # the JSON is malformed and json.load raises.
    with open(file, 'r') as hf:
        params = json.load(hf)
    print('params: ', params)

    # The number of input channels always follows the data (number of variables).
    params['in_channels'] = train.shape[1]
    classifier.set_params(**params)
    return classifier.fit(
        train, train_labels, test, test_labels, save_path, cluster_num, save_memory=save_memory, verbose=True
    )
    
    

if __name__ == '__main__':
    # Train a new classifier or load a saved one, then report the elapsed time.
    start = timeit.default_timer()
    args = Args()

    if torch.cuda.is_available():
        print("CUDA is available, proceeding with it...")
    else:
        print("CUDA is not available, proceeding without it...")
        args.cuda = False

    if not args.load and args.fit_classifier:
        # Train the model and the classifier from the hyperparameter file.
        classifier = fit_parameters(
            args.hyper, train, train_labels, test, test_labels, args.save_path, cluster_num,
            save_memory=False
        )
    else:
        # Loading needs the name of the saved model, which Args does not
        # define by default. Fail up-front with an actionable message instead
        # of an opaque AttributeError after the classifier has been built.
        dataset = getattr(args, 'dataset', None)
        if dataset is None:
            raise AttributeError(
                "Args has no 'dataset' attribute; set Args.dataset to the name of "
                "the saved model under save_path before loading"
            )

        classifier = wrappers.CausalCNNEncoderClassifier()
        # Read the hyperparameters from the configured file; the context
        # manager closes it even if parsing fails.
        with open(args.hyper, 'r') as hf:
            hp_dict = json.load(hf)
        classifier.set_params(**hp_dict)
        classifier.load(os.path.join(args.save_path, dataset))

    end = timeit.default_timer()
    # Elapsed wall-clock time in minutes.
    print("All time: ", (end- start)/60)

In [None]:
import os
import re

folder_path = "/pic/STOCK"

# Regular expression for file names of the form plot_<digits>_<UPPERCASE>_...;
# the captured group is the run of uppercase letters.
pattern = re.compile(r"plot_\d+_([A-Z]+)_")

# Match every directory entry from the start of its name, drop the entries
# that do not match, and keep the distinct captured groups.
matches = filter(None, map(pattern.match, os.listdir(folder_path)))
ticker_set = {found.group(1) for found in matches}

print(f"Ticker 개수: {len(ticker_set)}")

Ticker 개수: 494
