In [2]:
import os
import glob
import random
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict
import warnings

# Silence UserWarning messages attributed to torch.nn.utils.rnn (the module that
# provides pad_sequence / pack_padded_sequence) so they do not flood cell output.
warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.utils.rnn")


In [3]:

# --- 1. Basic setup and data-loading paths ---

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")


Using device: cuda


In [4]:

# `base` must be the same path as the one defined in LSTM_modeling.ipynb.
# Adjust it to match your own environment.
base = os.path.join(os.getcwd(), 'five_fold_utils')
base


'/root/Deep_learning_Project/Battery_model/Nature23_battery_model/five_fold_utils'

In [5]:

# Directory for saved model checkpoints (created if it does not exist yet).
# The .npz.npy data files are located via `base`, not via this path.
model_dir = os.path.join(os.getcwd(), 'models')
os.makedirs(model_dir, exist_ok=True)


In [6]:

# Load all_car_dict (data for every vehicle).
# NOTE(review): allow_pickle=True unpickles arbitrary objects - only load trusted files.
all_car_dict_path = os.path.join(base, 'all_car_dict.npz.npy')
all_car_dict = np.load(all_car_dict_path, allow_pickle=True).item()


In [7]:

# Load ind_odd_dicts (normal / abnormal vehicle IDs for each brand).
# All three dictionaries (1, 2 and 3) are loaded, keyed by brand number.
# NOTE(review): allow_pickle=True unpickles arbitrary objects - only load trusted files.
ind_odd_dicts = {
    i: np.load(os.path.join(base, f'ind_odd_dict{i}.npz.npy'), allow_pickle=True).item()
    for i in (1, 2, 3)
}

# Print the contents as a quick sanity check.
for i in ind_odd_dicts:
    d = ind_odd_dicts[i]
    print(f"=== brand{i} ind/ood 리스트 ===")
    print("  정상 차량:", d['ind_sorted'])
    print("  비정상 차량:", d['ood_sorted'])

=== brand1 ind/ood 리스트 ===
  정상 차량: [129, 158, 152, 79, 95, 114, 34, 177, 99, 138, 163, 54, 45, 115, 66, 87, 47, 57, 31, 195, 36, 102, 72, 173, 94, 51, 92, 61, 153, 125, 103, 3, 50, 10, 7, 146, 166, 48, 75, 86, 15, 175, 64, 2, 110, 13, 23, 93, 116, 62, 8, 41, 22, 6, 24, 101, 46, 187, 198, 142, 131, 18, 160, 56, 29, 141, 148, 168, 71, 53, 104, 120, 154, 20, 17, 111, 133, 63, 35, 83, 5, 88, 159, 145, 176, 127, 77, 118, 52, 81, 121, 59, 38, 80, 109, 179, 28, 123, 44, 180, 149, 135, 164, 74, 40, 14, 65, 69, 42, 193, 12, 60, 73, 126, 161, 188, 32, 30, 170, 128, 167, 9, 155, 156, 43, 100, 33, 90, 139, 1, 112, 25, 4, 16, 189, 147, 124, 178, 55, 85, 122, 96, 162, 132, 89, 19, 27, 84, 39, 151, 67, 26, 172, 76, 37, 143, 58, 165, 97, 134, 82, 113, 137, 144, 70, 11, 117, 106]
  비정상 차량: [91, 192, 169, 130, 140, 171, 190, 186, 105, 49, 181, 157, 183, 185, 194, 98, 191, 136, 119, 196, 107, 68, 108, 78, 182, 150, 174, 21, 184, 197]
=== brand2 ind/ood 리스트 ===
  정상 차량: [214, 231, 233, 234, 218, 201, 211

In [None]:

# --- 2. Model, dataset, and utility definitions (taken from LSTM_modeling.ipynb) ---

class LSTMModel(nn.Module):
    """LSTM classifier for batches of zero-padded, variable-length sequences.

    The padded batch is packed so the LSTM skips padding, and the LSTM output at
    each sequence's last valid time step is passed through a linear layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of output logits.
        dropout_rate: Dropout between stacked LSTM layers; ignored (set to 0)
            when ``num_layers`` is 1.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_rate=0.5):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout_rate if num_layers > 1 else 0,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, lengths):
        """Return logits of shape ``(batch, output_size)``.

        Args:
            x: Padded batch laid out batch-first, i.e. ``(batch, time, features)``.
            lengths: Valid length of every sequence in ``x`` (tensor or sequence
                of ints, on any device, in any order).
        """
        # Coerce to int64 once: gather() requires an int64 index, and this also
        # lets callers pass a plain list or an int32 tensor.
        lengths = torch.as_tensor(lengths, dtype=torch.long)

        # pack_padded_sequence needs the lengths on the CPU.
        packed_input = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        # Pick the output at the last valid step of each sequence. The index must
        # live on the same device as `output`; lengths usually arrive on the CPU
        # while the model runs on the GPU, which made gather() fail before.
        last_step = (lengths.to(output.device) - 1).view(-1, 1, 1).expand(-1, 1, output.size(2))
        last_output = output.gather(1, last_step).squeeze(1)

        return self.fc(last_output)

class BatteryDS(Dataset):
    """Map-style dataset pairing each sample in ``data`` with its entry in ``labels``."""

    def __init__(self, data, labels):
        # Both containers are stored as-is and indexed in parallel.
        self.data, self.labels = data, labels

    def __len__(self):
        """Number of samples, taken from ``data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        sample = self.data[idx]
        target = self.labels[idx]
        return sample, target

def collate_fn(batch):
    """Collate ``(sequence, label)`` items into a padded, length-sorted batch.

    Returns:
        A tuple ``(padded_sequences, lengths, labels)`` where the sequences are
        zero-padded batch-first, and all three tensors are ordered by
        descending sequence length.
    """
    sequences, targets = [], []
    for item in batch:
        sequences.append(item[0])
        targets.append(item[1])

    labels = torch.tensor(targets, dtype=torch.long)
    # Length is the size of each sequence's first dimension.
    lengths = torch.tensor([seq.shape[0] for seq in sequences], dtype=torch.long)

    # Zero-pad every sequence up to the longest one in the batch.
    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)

    # Reorder everything so the longest sequence comes first.
    lengths, order = torch.sort(lengths, descending=True)
    return padded_sequences[order], lengths, labels[order]

def train_model(model, train_loader, val_loader, optimizer, criterion, epochs, device, model_path_prefix, patience=5):
    """Train ``model`` with early stopping on the validation loss.

    Each epoch runs one optimisation pass over ``train_loader`` and one
    gradient-free pass over ``val_loader``. Whenever the mean validation loss
    improves, the model's state dict is written to
    ``f'{model_path_prefix}_best.pth'``. Training stops once ``patience``
    consecutive epochs bring no improvement.

    Args:
        model: Module called as ``model(data.float(), lengths)``.
        train_loader: Iterable of ``(data, lengths, labels)`` batches; must support ``len``.
        val_loader: Same batch format as ``train_loader``, used for validation.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss called as ``criterion(outputs, labels)``.
        epochs: Maximum number of epochs.
        device: Device that ``data`` and ``labels`` are moved to.
        model_path_prefix: Path prefix for the best-model checkpoint.
        patience: Number of non-improving epochs tolerated before stopping.

    Returns:
        ``defaultdict(list)`` with per-epoch ``train_loss``, ``train_accuracy``,
        ``val_loss`` and ``val_accuracy`` (loss is the mean over batches).
    """

    def run_pass(loader, training):
        # One sweep over `loader`; parameters are updated only when `training` is set.
        loss_sum = 0
        hits = 0
        seen = 0
        for data, lengths, labels in loader:
            data, labels = data.to(device), labels.to(device)

            if training:
                optimizer.zero_grad()
            outputs = model(data.float(), lengths)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            seen += labels.size(0)
            hits += (predicted == labels).sum().item()
        return loss_sum / len(loader), hits / seen

    metrics = defaultdict(list)
    best_val_loss = float('inf')
    epochs_no_improve = 0

    for epoch in range(epochs):
        model.train()
        avg_train_loss, train_accuracy = run_pass(train_loader, training=True)

        # Validation phase
        model.eval()
        with torch.no_grad():
            avg_val_loss, val_accuracy = run_pass(val_loader, training=False)

        print(f'Epoch [{epoch+1}/{epochs}], '
              f'Train Loss: {avg_train_loss:.4f}, Train Acc: {train_accuracy:.4f}, '
              f'Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.4f}')

        for key, value in (
            ('train_loss', avg_train_loss),
            ('train_accuracy', train_accuracy),
            ('val_loss', avg_val_loss),
            ('val_accuracy', val_accuracy),
        ):
            metrics[key].append(value)

        # Checkpoint on improvement; otherwise count towards early stopping.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_no_improve = 0
            torch.save(model.state_dict(), f'{model_path_prefix}_best.pth')
            print(f"Saved best model with Val Loss: {best_val_loss:.4f}")
            continue

        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

    return metrics



In [9]:
# Collect the in-distribution / out-of-distribution cars of one brand that are
# present in all_car_dict, then flatten their per-car file lists.
brand = 2
brand_lists = ind_odd_dicts[brand]
ind_cars = [car for car in brand_lists['ind_sorted'] if car in all_car_dict]
ood_cars = [car for car in brand_lists['ood_sorted'] if car in all_car_dict]
all_cars = [*ind_cars, *ood_cars]

file_list = [path for car in all_cars for path in all_car_dict[car]]


In [11]:
def make_folds(ind_cars, ood_cars, car_dict, K=5):
    """Split cars into K folds and return the train/test file lists of each fold.

    ``ind_cars`` and ``ood_cars`` are each cut into K consecutive chunks
    (``np.array_split`` semantics, so chunk sizes differ by at most one).
    Fold ``k`` holds out the k-th chunk of both lists as test cars; every
    other car is a training car.

    File lists are emitted in a deterministic order (cars in input order, with
    ``ind`` before ``ood``) rather than set-iteration order, so folds are
    reproducible across runs regardless of the key type.

    Args:
        ind_cars: Sequence of car IDs of the first group.
        ood_cars: Sequence of car IDs of the second group.
        car_dict: Mapping from car ID to an iterable of that car's files.
        K: Number of folds.

    Returns:
        List of K dicts with keys ``'train_files'`` and ``'test_files'``.
    """
    ind_cars = list(ind_cars)
    ood_cars = list(ood_cars)
    # dict.fromkeys drops duplicates while keeping first-seen order.
    all_cars = list(dict.fromkeys(ind_cars + ood_cars))

    # Split positions rather than values so the original ID objects are kept
    # and never coerced by a numpy conversion.
    ind_chunks = np.array_split(np.arange(len(ind_cars)), K)
    ood_chunks = np.array_split(np.arange(len(ood_cars)), K)

    folds = []
    for ind_idx, ood_idx in zip(ind_chunks, ood_chunks):
        test_cars = list(dict.fromkeys(
            [ind_cars[i] for i in ind_idx] + [ood_cars[i] for i in ood_idx]
        ))
        held_out = set(test_cars)
        train_cars = [car for car in all_cars if car not in held_out]

        folds.append({
            'train_files': [path for car in train_cars for path in car_dict[car]],
            'test_files':  [path for car in test_cars for path in car_dict[car]],
        })

    return folds

In [None]:
def get_brand_file_lists(brand_idx, base_dir, seed=42, train_ratio=0.8):
    """Return ``(train_files, test_files)`` for one brand directory.

    Looks under ``<base_dir>/battery_brand<brand_idx>``:

    * if both ``train/`` and ``test/`` sub-directories exist, their ``*.pkl``
      files are returned as-is (sorted by path);
    * otherwise the ``*.pkl`` files in ``data/`` are shuffled with ``seed`` and
      split so the first ``train_ratio`` fraction becomes the training set.

    Args:
        brand_idx: Brand number used in the directory name.
        base_dir: Directory containing the ``battery_brand*`` folders.
        seed: Seed for the shuffle used in the fallback split.
        train_ratio: Fraction of files assigned to the training set in the
            fallback split.

    Returns:
        Tuple ``(train_files, test_files)`` of path lists.
    """
    brand_dir = os.path.join(base_dir, f"battery_brand{brand_idx}")
    train_dir = os.path.join(brand_dir, 'train')
    test_dir  = os.path.join(brand_dir, 'test')

    # `glob` is imported as a module, so the function is glob.glob. Results are
    # sorted because glob's order is filesystem-dependent.
    if os.path.isdir(train_dir) and os.path.isdir(test_dir):
        train_files = sorted(glob.glob(os.path.join(train_dir, '*.pkl')))
        test_files  = sorted(glob.glob(os.path.join(test_dir,  '*.pkl')))
    else:
        data_dir = os.path.join(brand_dir, 'data')
        all_files = sorted(glob.glob(os.path.join(data_dir, '*.pkl')))
        # A private RNG gives the same permutation as seeding the global one,
        # without disturbing random state used elsewhere in the notebook.
        random.Random(seed).shuffle(all_files)
        cut = int(len(all_files) * train_ratio)
        train_files, test_files = all_files[:cut], all_files[cut:]

    return train_files, test_files

In [12]:

def get_folds_for_brand(brand_idx, all_car_dict, ind_odd_dicts, K=5):
    """Build the K folds of one brand, using only cars present in ``all_car_dict``.

    The brand's ``'ind_sorted'`` and ``'ood_sorted'`` car lists are filtered
    down to cars that have an entry in ``all_car_dict``; those cars and their
    entries are then handed to ``make_folds``.
    """
    brand_lists = ind_odd_dicts[brand_idx]

    def known(cars):
        # Keep only the cars that actually have data.
        return [car for car in cars if car in all_car_dict]

    ind_cars = known(brand_lists['ind_sorted'])
    ood_cars = known(brand_lists['ood_sorted'])

    brand_car_dict = {}
    for car in ind_cars + ood_cars:
        brand_car_dict[car] = all_car_dict[car]

    return make_folds(ind_cars, ood_cars, brand_car_dict, K)

In [13]:
# Build the 5-fold split for brand 2.
folds = get_folds_for_brand(brand_idx=2, all_car_dict=all_car_dict, ind_odd_dicts=ind_odd_dicts, K=5)

In [None]:
def load_snippets_and_labels(file_list):
    """Load every file in ``file_list`` and return ``(snippets, labels)``.

    Each file is read with ``torch.load`` and must unpack into
    ``(snippet, meta)``. The snippet is converted to a float32 tensor, and the
    label is 0 when ``meta['label'] == '00'`` and 1 otherwise.

    NOTE(review): torch.load unpickles file contents - only load trusted files.
    """
    snippets, labels = [], []
    for path in file_list:
        snippet, meta = torch.load(path, map_location='cpu')

        # Tensors are cast directly; anything else is treated as a numpy array.
        if isinstance(snippet, torch.Tensor):
            as_tensor = snippet.float()
        else:
            as_tensor = torch.from_numpy(snippet).float()

        snippets.append(as_tensor)
        labels.append(0 if meta['label'] == '00' else 1)
    return snippets, labels

In [18]:
from tqdm import tqdm

In [None]:

# Cross-validated training for one brand (change `brand` to train another one).
brand = 2
folds = get_folds_for_brand(brand, all_car_dict, ind_odd_dicts, K=5)

fold_metrics = []
for fold_idx, fold in tqdm(list(enumerate(folds)), desc="Cross-validation"):
    # The header is derived from `brand` and the real fold count instead of
    # hard-coded values, so it stays correct when either changes.
    print(f"\n=== Brand{brand} - Fold {fold_idx + 1}/{len(folds)} ===")

    # Load data
    # NOTE(review): the held-out fold doubles as the validation set that drives
    # early stopping and checkpoint selection inside train_model.
    X_tr, y_tr = load_snippets_and_labels(fold['train_files'])
    X_val, y_val = load_snippets_and_labels(fold['test_files'])

    # DataLoader
    train_loader = DataLoader(BatteryDS(X_tr, y_tr), batch_size=128, shuffle=True, collate_fn=collate_fn, num_workers=0)
    val_loader   = DataLoader(BatteryDS(X_val, y_val), batch_size=128, shuffle=False, collate_fn=collate_fn, num_workers=0)

    # Model definition: a fresh model, optimizer and loss for every fold.
    model = LSTMModel(input_size=8, hidden_size=128, num_layers=3, output_size=2, dropout_rate=0.3).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()

    # Training: the checkpoint prefix includes the brand so that runs for
    # different brands do not overwrite each other's files.
    fold_prefix = os.path.join(model_dir, f'brand{brand}_fold{fold_idx+1}')
    metrics = train_model(model, train_loader, val_loader, optimizer, criterion, epochs=20, device=device, model_path_prefix=fold_prefix)
    fold_metrics.append(metrics)

0it [00:00, ?it/s]


=== Brand2 - Fold 1/5 ===





TypeError: BatteryDS.__init__() missing 1 required positional argument: 'labels'