In [1]:
import pandas as pd
import os
import string
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.metrics import precision_score, recall_score
from sklearn.preprocessing import StandardScaler
import string
from concurrent.futures import ProcessPoolExecutor
from time import time
from numpy.lib.stride_tricks import sliding_window_view
from collections import defaultdict
from sklearn.model_selection import GroupShuffleSplit
import warnings


In [2]:

warnings.filterwarnings("ignore", category=RuntimeWarning)

base_dir = '/kaggle/input/stock-data'

stockdata_list = []
for path in os.listdir(base_dir):
    stockdata_list.append(os.path.join(base_dir, path))
    stockdata_list = sorted(stockdata_list, key = lambda x: int(x.split('_')[-1].split('.')[0]))

stock_df_list = []
for path in stockdata_list:
    file_df = pd.read_csv(path, encoding = 'utf-8-sig', index_col = 0)
    stock_df_list.append(file_df)


In [3]:


def process_df(df):
    final_group_list = []
    df['일자'] = pd.to_datetime(df['일자'], format='%Y-%m-%d')
    numeric_cols = ['시가', '고가', '저가', '현재가', '거래량']
    for col in numeric_cols:
        if df[col].isnull().any():
            df[col] = df[col].fillna(0).astype(float)

    grouped_dfs = [group for _, group in df.groupby('ID')]
    for group in grouped_dfs:
            
        group.dropna(inplace= True)
        group = group.sort_values(by='일자', ascending=True).copy()
        group.reset_index(drop=True, inplace=True)
    
    
        group['End Change'] = (group['현재가'] - group['현재가'].shift(1)) / group['현재가'].shift(1) * 100
        group.dropna(subset= ['End Change'], inplace = True)
        group.reset_index(drop=True, inplace=True)
        condition_1 = (group['End Change'] > 29) & (group['End Change'] <= 31)
        condition_2 = (
                    (group['고가'].shift(-1) >= group['현재가'] * 1.29) | 
                    (group['고가'].shift(-2) >= group['현재가'] * 1.29)
                )
        valid_rows = condition_1 & condition_2
        valid_indices = group.index[valid_rows]
        start_indices = np.maximum(0, valid_indices - 35)
        end_indices = np.minimum(valid_indices + 25, len(df))
        
        filtered_list = [
            group.iloc[start:end]
            for start, end in zip(start_indices, end_indices)
        ]
    
        if filtered_list:
            grouped_df = pd.concat(filtered_list, axis=0).drop_duplicates()
            grouped_df = grouped_df.sort_values(by='일자', ascending=True).reset_index(drop=True)
        else:
            grouped_df = pd.DataFrame()  
    
        
        if not grouped_df.empty:
            grouped_df['Start Change'] = (grouped_df['시가'] - grouped_df['현재가'].shift(1)) / grouped_df['현재가'].shift(1) * 100
            grouped_df['High Change'] = (grouped_df['고가'] - grouped_df['시가']) / grouped_df['시가'] * 100
            grouped_df['Low Change'] = (grouped_df['저가'] - grouped_df['시가']) / grouped_df['시가'] * 100
            grouped_df['5 Day MA'] = grouped_df['현재가'].rolling(window=5).mean()
            grouped_df['20 Day MA'] = grouped_df['현재가'].rolling(window=20).mean()
            grouped_df['5 Day Diff'] = np.clip( (grouped_df['5 Day MA'] - grouped_df['현재가']) / grouped_df['현재가'] * 100, -30, 30 )
            grouped_df['20 Day Diff'] = np.clip( (grouped_df['20 Day MA'] - grouped_df['현재가']) / grouped_df['현재가'] * 100, -30, 30)
    
            thresholds = grouped_df['현재가'] * 1.12
            high_array = grouped_df['고가'].to_numpy()
            windows = sliding_window_view(high_array[1:], window_shape = 3)
            rolling_max = np.max(windows, axis = 1) 
            rolling_max = np.concatenate([rolling_max, [np.nan] * 3])
            grouped_df['Rolling Max'] = rolling_max
            grouped_df['Target'] = (rolling_max >= thresholds).astype(int)
        
    
            condition_1 = (grouped_df['End Change'] > 29) & (grouped_df['End Change'] <= 31)
            condition_2 = (
                                (grouped_df['고가'].shift(-1) >= grouped_df['현재가'] * 1.29) | 
                                (grouped_df['고가'].shift(-2) >= grouped_df['현재가'] * 1.29)
                            )
            valid_rows = condition_1 & condition_2
            valid_indices = grouped_df.index[valid_rows]
            start_indices = np.maximum(0, valid_indices - 5)
            end_indices = np.minimum(valid_indices + 15, len(grouped_df))
            
            subgroup_list = [
                grouped_df.iloc[start:end+1] 
                for start, end in zip(start_indices, end_indices)
            ]
        
           
        
            if len(subgroup_list) >= 2:
                i = 0
                while i < len(subgroup_list) - 1:
                    df_1 = subgroup_list[i]
                    df_2 = subgroup_list[i+1]
                    if not df_1.merge(df_2, how = 'inner').empty:
                        new_df = pd.concat([df_1, df_2], axis = 0 ).drop_duplicates()
                        subgroup_list[i] = new_df
                        subgroup_list.pop(i+1)
        
                    else:
                        i += 1
            updated_subgroup_list = []
            uppercase = list(string.ascii_uppercase)
            lowercase = list(string.ascii_lowercase)
            i, j = 0, 0
            for subgroup in subgroup_list:
                subgroup = subgroup.copy()
                subgroup['Sub ID'] = f"{subgroup['ID'].iloc[0]}-{uppercase[i]}-{lowercase[j]}"
                updated_subgroup_list.append(subgroup)
                i += 1
                if i == 25:
                    i = 0
                    j += 1
        
        
            final_df = pd.concat(updated_subgroup_list, axis=0) if updated_subgroup_list else pd.DataFrame()
            final_df = final_df[final_df['일자'] >= pd.to_datetime('20150701')]
            final_df.reset_index(drop=True, inplace=True)
            '''
            subgrouped_dfs = []
            for sub_id, group in final_df.groupby('Sub ID'): 
                group = group.copy()  
                ss_scaler = StandardScaler()  
                group['Trade Amount'] = ss_scaler.fit_transform(group[['거래량']]) * 10  
                subgrouped_dfs.append(group)  
                
            final_df = pd.concat(subgrouped_dfs, axis=0)
            '''
            columns_to_round = ['End Change', 'Start Change', 'High Change', 'Low Change', '5 Day Diff', '20 Day Diff']
            final_df.loc[:, columns_to_round] = final_df.loc[:, columns_to_round].round(2)
            final_df.drop(columns = ['Rolling Max','시가', '고가','저가','현재가', 'ID', '5 Day MA', '20 Day MA'], inplace = True)
            final_group_list.append(final_df)
    return final_group_list




In [4]:
def process_df_parallel(stock_df_list):
    with ProcessPoolExecutor() as executor:
        processed_df_list = list(executor.map(process_df, stock_df_list))
    return processed_df_list

processed_df_list = process_df_parallel(stock_df_list)

In [5]:
temp_list = [ item for sublist in processed_df_list for item in sublist]
scaled_df = pd.concat(temp_list, axis = 0)
na_df = scaled_df[scaled_df['Start Change'].isna()]
groups_with_na = scaled_df.groupby('Sub ID').apply(lambda group: group.isna().any().any())
groups_with_na = groups_with_na[groups_with_na].index
cleaned_df = scaled_df[~scaled_df['Sub ID'].isin(groups_with_na)]
scaled_df = cleaned_df.copy()
#test_df = scaled_df[scaled_df['일자'] >= pd.to_datetime('2024-10-01')]
#scaled_df = scaled_df[scaled_df['일자'] < pd.to_datetime('2024-10-01')]
scaled_size= scaled_df.groupby('Sub ID').size()
#test_size= test_df.groupby('Sub ID').size()
scaled_idx = scaled_size[scaled_size >= 12].index
#test_idx = test_size[test_size >= 12].index
scaled_df = scaled_df.loc[scaled_df['Sub ID'].isin(scaled_idx)]
#test_df = test_df.loc[test_df['Sub ID'].isin(test_idx)]
#scaled_df.drop(columns = ['일자', '종목'], inplace = True)
#dropped_test_df = test_df.drop(columns = ['일자', '종목'])

  groups_with_na = scaled_df.groupby('Sub ID').apply(lambda group: group.isna().any().any())


In [6]:
print(len(scaled_df['Sub ID'].unique()))
print(scaled_df['Target'].value_counts())


1282
Target
0    14828
1    13580
Name: count, dtype: int64


In [7]:
scaled_df.to_csv('what I need.csv', encoding = 'utf-8-sig')

In [58]:
def generate_grouped_sequences(df, group_col, target_col, min_seq_length=7):
    sequences = []
    grouped = df.groupby(group_col)

    for sub_id, group in grouped:
        group = group.reset_index(drop=True)

        # 최소 길이가 충족되면 한 번은 반드시 실행
        for seq_end in range(min_seq_length, len(group) + 1):  # ✅ +1 추가로 최소 길이도 포함
            seq = group.iloc[seq_end - min_seq_length:seq_end].drop(columns=[group_col, target_col]).copy()

            if '거래량' in seq.columns:
                ss_scaler = StandardScaler()
                seq['Trade Amount'] = ss_scaler.fit_transform(seq[['거래량']]) * 10  
                seq['Trade Amount'] = seq['Trade Amount'].round(2)
                seq.drop(columns=['거래량'], inplace=True)

            seq = seq.to_numpy(dtype=np.float32)

            target = group.iloc[seq_end - 1][target_col]  # ✅ 올바른 target 위치 지정

            sequences.append((seq, target, sub_id))

    return sequences



def split_data_by_group(sequences, group_col):

    groups = [seq[2] for seq in sequences] 
    gss = GroupShuffleSplit(test_size = 0.2, n_splits =1, random_state = 42)

    splits = []

    for train_indices, val_indices in gss.split(sequences, groups = groups):
        train_data = [sequences[i] for i in train_indices]
        val_data = [sequences[i] for i in val_indices]
        splits.append((train_data, val_data))
    
    return splits


class SubIDGroupedDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        seq, target, sub_id = self.data[idx]
        return torch.tensor(seq, dtype=torch.float32), torch.tensor(target, dtype=torch.float32)


def collate_fn(batch):
    inputs, targets = zip(*batch)
    padded_inputs = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True)
    targets = torch.stack(targets)
    return padded_inputs, targets


class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, dropout=0.3):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)  # (batch_size, seq_length, hidden_dim)
        x = lstm_out[:, -1, :]  # 마지막 시퀀스만 사용
        x = self.fc(x)  # 최종 출력
        return x


class FocalLoss(nn.Module):
    def __init__(self, alpha=2, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, outputs, targets):
        bce_loss = nn.BCEWithLogitsLoss(reduction='none')(outputs, targets)
        pt = torch.exp(-bce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * bce_loss
        return focal_loss.mean()


In [59]:
sequences = generate_grouped_sequences(
    scaled_df, group_col='Sub ID', target_col='Target', min_seq_length=7
)
splits = split_data_by_group(sequences, group_col = 'Sub ID')


In [60]:
train_data, val_data = splits[0]
train_dataset = SubIDGroupedDataset(train_data)
val_dataset = SubIDGroupedDataset(val_data)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

input_dim = scaled_df.shape[1] - 2 
hidden_dim = 512
num_layers = 5
output_dim = 1  # 이진 분류

model = LSTMModel(input_dim, hidden_dim, num_layers, output_dim, dropout=0)

# 손실 함수 및 옵티마이저 설정
criterion = FocalLoss() 
optimizer = optim.Adam(model.parameters(), lr=1e-4 )
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 15
model_save_dir = "saved_models_1"
os.makedirs(model_save_dir, exist_ok=True)  # 모델 저장 폴더 생성

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0

    for batch in train_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)
        targets = targets.float()

        optimizer.zero_grad()
        outputs = model(inputs).squeeze(1)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    # 검증
    model.eval()
    total_val_loss = 0
    all_preds, all_targets = [], []
    with torch.no_grad():
        for batch in val_loader:
            inputs, targets = batch
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs).squeeze(1)
            loss = criterion(outputs, targets)
            total_val_loss += loss.item()
            preds = (torch.sigmoid(outputs) > 0.65).cpu().numpy()
            all_preds.extend(preds)
            all_targets.extend(targets.cpu().numpy())

    precision = precision_score(all_targets, all_preds, zero_division=1)
    recall = recall_score(all_targets, all_preds, zero_division=1)
    scheduler.step(total_val_loss)

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {total_train_loss:.4f}, "
          f"Val Loss: {total_val_loss:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

    # 모델 가중치 저장
    model_save_path = os.path.join(model_save_dir, f"model_epoch_{epoch+1}.pth")
    torch.save(model.state_dict(), model_save_path)

print("\n===== 모든 에포크 모델 저장 완료 =====")




Epoch 1/15, Train Loss: 156.5791, Val Loss: 38.3304, Precision: 1.0000, Recall: 0.0000
Epoch 2/15, Train Loss: 151.1154, Val Loss: 37.6584, Precision: 0.8947, Recall: 0.0120
Epoch 3/15, Train Loss: 149.7340, Val Loss: 37.4354, Precision: 0.8791, Recall: 0.0565
Epoch 4/15, Train Loss: 148.7586, Val Loss: 37.3810, Precision: 1.0000, Recall: 0.0014
Epoch 5/15, Train Loss: 148.3853, Val Loss: 37.5712, Precision: 0.9000, Recall: 0.0381
Epoch 6/15, Train Loss: 147.4862, Val Loss: 37.8461, Precision: 0.8788, Recall: 0.0614
Epoch 7/15, Train Loss: 146.7066, Val Loss: 37.6591, Precision: 0.8519, Recall: 0.0974
Epoch 8/15, Train Loss: 144.9141, Val Loss: 37.4829, Precision: 0.8692, Recall: 0.0797
Epoch 9/15, Train Loss: 143.7939, Val Loss: 37.5530, Precision: 0.8661, Recall: 0.0776
Epoch 10/15, Train Loss: 142.8494, Val Loss: 38.1004, Precision: 0.8509, Recall: 0.0685
Epoch 11/15, Train Loss: 140.6478, Val Loss: 38.6033, Precision: 0.7460, Recall: 0.0995
Epoch 12/15, Train Loss: 139.4359, Val Lo

In [61]:
train_data, val_data = splits[0]
train_dataset = SubIDGroupedDataset(train_data)
val_dataset = SubIDGroupedDataset(val_data)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

input_dim = scaled_df.shape[1] - 2 
hidden_dim =128
num_layers =4
output_dim = 1  # 이진 분류

model = LSTMModel(input_dim, hidden_dim, num_layers, output_dim, dropout=0.2)

# 손실 함수 및 옵티마이저 설정
criterion = FocalLoss() 
optimizer = optim.Adam(model.parameters(), lr=1e-4 *2)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 15
model_save_dir = "saved_models_2"
os.makedirs(model_save_dir, exist_ok=True)  # 모델 저장 폴더 생성

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0

    for batch in train_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)
        targets = targets.float()

        optimizer.zero_grad()
        outputs = model(inputs).squeeze(1)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    # 검증
    model.eval()
    total_val_loss = 0
    all_preds, all_targets = [], []
    with torch.no_grad():
        for batch in val_loader:
            inputs, targets = batch
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs).squeeze(1)
            loss = criterion(outputs, targets)
            total_val_loss += loss.item()
            preds = (torch.sigmoid(outputs) > 0.65).cpu().numpy()
            all_preds.extend(preds)
            all_targets.extend(targets.cpu().numpy())

    precision = precision_score(all_targets, all_preds, zero_division=1)
    recall = recall_score(all_targets, all_preds, zero_division=1)
    scheduler.step(total_val_loss)

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {total_train_loss:.4f}, "
          f"Val Loss: {total_val_loss:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

    # 모델 가중치 저장
    model_save_path = os.path.join(model_save_dir, f"model_epoch_{epoch+1}.pth")
    torch.save(model.state_dict(), model_save_path)

print("\n===== 모든 에포크 모델 저장 완료 =====")





Epoch 1/15, Train Loss: 309.9163, Val Loss: 75.2983, Precision: 0.8302, Recall: 0.0311
Epoch 2/15, Train Loss: 300.7168, Val Loss: 75.2464, Precision: 0.8846, Recall: 0.0487
Epoch 3/15, Train Loss: 298.7589, Val Loss: 74.8806, Precision: 0.8679, Recall: 0.0325
Epoch 4/15, Train Loss: 296.6977, Val Loss: 74.3986, Precision: 0.9348, Recall: 0.0303
Epoch 5/15, Train Loss: 294.7629, Val Loss: 75.4754, Precision: 0.8417, Recall: 0.0713
Epoch 6/15, Train Loss: 293.0445, Val Loss: 75.0407, Precision: 0.8852, Recall: 0.0381
Epoch 7/15, Train Loss: 290.5242, Val Loss: 75.8473, Precision: 0.8750, Recall: 0.0148
Epoch 8/15, Train Loss: 286.9823, Val Loss: 76.0934, Precision: 0.7698, Recall: 0.0755
Epoch 9/15, Train Loss: 285.0146, Val Loss: 76.5017, Precision: 0.8182, Recall: 0.0572
Epoch 10/15, Train Loss: 283.4439, Val Loss: 76.5853, Precision: 0.7610, Recall: 0.0854
Epoch 11/15, Train Loss: 280.5492, Val Loss: 76.7328, Precision: 0.8049, Recall: 0.0699
Epoch 12/15, Train Loss: 278.2431, Val Lo

In [64]:
scaled_df

Unnamed: 0,거래량,End Change,Start Change,High Change,Low Change,5 Day Diff,20 Day Diff,Target,Sub ID
0,356243.0,-2.94,-0.32,1.37,-2.95,6.80,23.85,1,373110-A-a
1,215563.0,0.76,0.00,2.71,-0.33,3.42,20.78,1,373110-A-a
2,2638761.0,10.11,0.00,15.70,0.00,-6.07,8.25,1,373110-A-a
3,672884.0,-0.98,1.56,2.88,-5.77,-4.50,7.63,1,373110-A-a
4,255844.0,-2.96,-0.99,1.39,-2.39,-0.91,9.35,1,373110-A-a
...,...,...,...,...,...,...,...,...,...
16,39306.0,0.48,-0.94,2.39,-1.91,1.70,11.70,0,2420-A-a
17,33877.0,2.83,-0.47,4.27,-1.42,-1.28,7.96,0,2420-A-a
18,25727.0,-2.29,0.46,0.00,-4.11,0.28,9.69,0,2420-A-a
19,53288.0,-1.88,0.00,0.46,-4.23,1.72,10.88,0,2420-A-a


In [62]:
dropped_test_df.reset_index(drop = True, inplace = True)
def generate_grouped_sequences(df, group_col, target_col, min_seq_length=7):
    sequences = []
    indices = []  # ✅ 예측할 데이터의 원본 인덱스 저장
    
    df = df.copy()  # 원본 DataFrame 유지
    df["original_index"] = df.index  # ✅ 원래 인덱스를 새로운 열로 저장
    
    grouped = df.groupby(group_col)

    for sub_id, group in grouped:
        group = group.reset_index(drop=True)  # ✅ 여기서 reset_index() 하면 original_index는 유지됨
        
        if len(group) >= min_seq_length:
            for seq_end in range(min_seq_length, len(group) + 1):  
                # ✅ 필요 없는 컬럼 제외하고, 정확하게 7개의 Feature만 사용
                if '거래량' in group.columns:
                    ss_scaler = StandardScaler()
                    group['Trade Amount'] = ss_scaler.fit_transform(group[['거래량']]) * 10  
                    group['Trade Amount'] = group['Trade Amount'].round(2)

                feature_cols = ['End Change', 'Start Change', 'High Change', 'Low Change', '5 Day Diff', '20 Day Diff', 'Trade Amount']
                seq = group.iloc[seq_end - min_seq_length:seq_end][feature_cols].copy()
                seq = seq.to_numpy(dtype=np.float32)

                target = group.iloc[seq_end - 1][target_col]  
                original_index = group.iloc[seq_end - 1]['original_index']  # ✅ 원래 dropped_test_df의 인덱스를 가져옴

                sequences.append((seq, target, sub_id))
                indices.append(original_index)

    # ✅ input_dim을 강제적으로 7로 맞추었는지 확인
    input_dim = sequences[0][0].shape[-1] if sequences else 0
    print(f"🔹 최종 입력 데이터 차원 (input_dim): {input_dim}")

    return sequences, indices, input_dim




# ✅ 2. 테스트 데이터셋 준비 (수정된 `generate_grouped_sequences` 적용)
sequences, indices, input_dim = generate_grouped_sequences(dropped_test_df, group_col='Sub ID', target_col='Target', min_seq_length=7)
test_dataset = SubIDGroupedDataset(sequences)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

input_dim = 7  # ✅ 저장된 모델과 일치하도록 변경
hidden_dim =128
num_layers =4 #✅ 저장된 모델과 일치하도록 변경
output_dim = 1  

model = LSTMModel(input_dim, hidden_dim, num_layers, output_dim, dropout=0.2).to(device)

# ✅ 4. 모델 가중치 불러오기
model_path = "/kaggle/working/saved_models_2/model_epoch_9.pth"
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

# ✅ 5. 모델 평가 (예측 확률 포함)
all_preds, all_targets, all_probs = [], [], []
validation_results = []

with torch.no_grad():
    for batch in test_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs).squeeze(1)

        probs = torch.sigmoid(outputs).cpu().numpy()  # 확률 계산
        preds = (probs > 0.65).astype(int)  # 예측 결과

        all_preds.extend(preds)
        all_probs.extend(probs)
        all_targets.extend(targets.cpu().numpy())

        for true, pred, prob in zip(targets.cpu().numpy(), preds, probs):
            validation_results.append([true, pred, prob])

# ✅ 6. Precision, Recall 계산
precision = precision_score(all_targets, all_preds, zero_division=1)
recall = recall_score(all_targets, all_preds, zero_division=1)

# ✅ 7. 결과 출력
print(f"Test Precision: {precision:.4f}, Test Recall: {recall:.4f}")

# ✅ 8. 예측 결과 DataFrame 생성
test_results_df = pd.DataFrame(validation_results, columns=['Actual Target', 'Predicted Target', 'Probability'])
# ✅ 6. 예측한 데이터 위치만 업데이트 (indices 활용)
print(f"🔹 예측할 위치 개수: {len(indices)}")
print(f"🔹 예측된 데이터 개수: {len(test_results_df)}")

# ✅ 크기 불일치 확인 및 예측값 보정
if len(indices) != len(test_results_df):
    print("⚠️ `indices`와 `test_results_df` 크기가 다릅니다! 데이터 매칭 오류 가능성 있음.")
    print(f"🔹 `indices` 크기: {len(indices)}, `test_results_df` 크기: {len(test_results_df)}")

    # ✅ 부족한 부분을 NaN으로 채움
    while len(test_results_df) < len(indices):
        test_results_df.loc[len(test_results_df)] = [None, None, None]

🔹 최종 입력 데이터 차원 (input_dim): 7
Test Precision: 0.8200, Test Recall: 0.0790
🔹 예측할 위치 개수: 1301
🔹 예측된 데이터 개수: 1301


  model.load_state_dict(torch.load(model_path, map_location=device))


In [63]:
# ✅ 1. 기존 dropped_test_df 복사 (1366개의 행 유지)
merged_test_df = dropped_test_df.copy()

# ✅ 2. 'Predicted Target'과 'Probability' 열을 NaN으로 초기화
merged_test_df['Predicted Target'] = np.nan
merged_test_df['Probability'] = np.nan

# ✅ 3. indices에 해당하는 행들만 업데이트
merged_test_df.loc[indices, ['Predicted Target', 'Probability']] = test_results_df[['Predicted Target', 'Probability']].values

cols= ['End Change', '거래량', 'Target', 'Start Change', 'High Change', 'Low Change', '5 Day Diff',	'20 Day Diff',	'Sub ID']
merged_test_df = merged_test_df.merge(test_df, on = cols, how = 'inner')


# ✅ 4. 최종 결과 확인
display(merged_test_df)

merged_test_df.to_csv('result_df.csv', encoding = 'utf-8-sig' )

Unnamed: 0,거래량,End Change,Start Change,High Change,Low Change,5 Day Diff,20 Day Diff,Target,Sub ID,Predicted Target,Probability,일자,종목
0,297786.0,-14.06,-4.17,2.07,-10.76,20.85,30.00,1,199480-A-a,,,2024-12-09,뱅크웨어글로벌
1,201581.0,8.48,-0.61,10.00,0.00,6.50,23.54,1,199480-A-a,,,2024-12-10,뱅크웨어글로벌
2,212906.0,9.50,0.00,9.50,0.00,-4.33,10.95,1,199480-A-a,,,2024-12-11,뱅크웨어글로벌
3,83053.0,-1.22,0.71,0.00,-3.75,-4.38,11.04,1,199480-A-a,,,2024-12-12,뱅크웨어글로벌
4,849076.0,15.29,-2.27,17.97,0.00,-14.27,-4.01,1,199480-A-a,,,2024-12-13,뱅크웨어글로벌
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1794,39221617.0,6.72,5.72,5.92,-4.72,-4.39,-18.96,1,1470-E-a,0.0,0.599347,2024-11-25,삼부토건
1795,59168257.0,6.04,3.40,12.25,-2.14,-7.31,-20.43,0,1470-E-a,0.0,0.487884,2024-11-26,삼부토건
1796,28058840.0,-0.80,5.05,0.38,-8.78,-2.41,-16.91,0,1470-E-a,0.0,0.419720,2024-11-27,삼부토건
1797,23920873.0,-6.95,0.24,1.05,-9.60,2.75,-7.96,0,1470-E-a,0.0,0.415517,2024-11-28,삼부토건
