In [1]:
import ast
import pickle

import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split

In [None]:
def create_mon_data(
    csv_path="mon_standard_dataset.csv",
    output_path="mon_data.pkl",
    target_length=10000,
    padding_value=-1,
    label_min=0,
    label_max=94,
):
    """Build a fixed-length direction array from the raw CSV and pickle it.

    Rows whose ``Label`` lies in ``[label_min, label_max]`` are kept. Each
    ``Direction_Size_Sequence`` cell is parsed as a Python literal, every entry
    is mapped to ``1`` when it is positive and ``-1`` otherwise, and the result
    is truncated or right-padded to ``target_length`` entries.

    Only the sequences are saved; the labels are used for filtering and are
    not written to ``output_path``.

    Args:
        csv_path: CSV file providing the ``Direction_Size_Sequence`` and
            ``Label`` columns.
        output_path: Destination file for the pickled array.
        target_length: Number of entries kept per sequence.
        padding_value: Fill value for sequences shorter than ``target_length``.
        label_min: Smallest label kept (inclusive).
        label_max: Largest label kept (inclusive).

    Returns:
        np.ndarray: float32 array of shape ``(n_rows, target_length, 1)``,
        the same object that is pickled to ``output_path``.

    Raises:
        ValueError, SyntaxError: If a sequence cell is not a plain Python
            literal (raised by ``ast.literal_eval``).
    """
    # NOTE(review): the default padding value (-1) is the same value used to
    # encode non-positive entries, so padding cannot be told apart from real
    # data downstream. It is kept for backward compatibility with previously
    # saved files; pass e.g. ``padding_value=0`` to make padding distinct.
    data = pd.read_csv(csv_path, usecols=['Direction_Size_Sequence', 'Label'])

    # Filter on the label first so that rows which are discarded anyway are
    # never parsed.
    in_label_range = data['Label'].between(label_min, label_max)
    raw_sequences = data.loc[in_label_range, 'Direction_Size_Sequence']

    # Pre-fill with the padding value; each row then only overwrites its prefix.
    # Allocating the 2-D array up front also keeps the shape well-defined when
    # no row survives the filter.
    mon_data = np.full(
        (len(raw_sequences), target_length), padding_value, dtype='float32'
    )

    for row, raw in enumerate(raw_sequences):
        # literal_eval only accepts Python literals, so a crafted CSV cell
        # cannot execute code the way eval() would.
        values = ast.literal_eval(raw)
        if isinstance(values, (int, float)):
            # A cell holding a single number is treated as a one-entry sequence.
            values = [values]
        directions = [1 if v > 0 else -1 for v in values[:target_length]]
        mon_data[row, :len(directions)] = directions

    # Add a trailing axis: (n_rows, target_length) -> (n_rows, target_length, 1)
    mon_data = mon_data[:, :, np.newaxis]

    # Verify the shape
    print("Shape of sequence_array:", mon_data.shape)

    # Save the processed data
    with open(output_path, "wb") as f:
        pickle.dump(mon_data, f)

    print("Data processed and saved successfully.")
    return mon_data
    

In [11]:
def split_data(data, labels, label_start, label_end, train_ratio=0.8):
    """
    Keep the samples whose label lies in a range and split them into
    stratified train and test sets.

    Args:
        data (np.ndarray): Input features.
        labels (np.ndarray): Label of each sample in ``data``.
        label_start (int): First label of the selected range (inclusive).
        label_end (int): Last label of the selected range (inclusive).
        train_ratio (float): Fraction of the selected samples assigned to the
            training set (default 0.8).

    Returns:
        X_train (np.ndarray): Training features.
        y_train (np.ndarray): Training labels.
        X_test (np.ndarray): Test features.
        y_test (np.ndarray): Test labels.
    """
    # Select only the samples whose label falls inside the requested range.
    selected = (labels >= label_start) & (labels <= label_end)
    kept_features = data[selected]
    kept_labels = labels[selected]

    # The test share is whatever the training share leaves over.
    holdout_fraction = 1 - train_ratio

    # Stratifying on the labels applies stratified sampling to the split;
    # the fixed seed makes it repeatable.
    parts = train_test_split(
        kept_features,
        kept_labels,
        test_size=holdout_fraction,
        random_state=42,
        stratify=kept_labels,
    )
    X_train, X_test, y_train, y_test = parts

    # Features and labels come back already separated; only the return order
    # differs from train_test_split's.
    return X_train, y_train, X_test, y_test
