In [1]:
import pandas as pd
from src.train import train_loop
from src.model import CNNLSTMModel
from src.dataset import InverterTimeSeriesDataset
from src.preprocess import *
import torch

inverter_data = load_parquet_data('data/inverter_data')
failure_sessions = load_failure_sessions('data/failure_sessions.csv', min_days=3)

Loaded 15 parquet files → 6126272 rows
Kept 61 sessions longer than 3 days


In [2]:
#inverter_data = inverter_data[inverter_data['device_name']=='INV 51']

In [3]:
labeled_df = prepare_dataset(inverter_data, failure_sessions)
labeled_df.fillna(0, inplace=True)

Total pre-failure rows: 82486
Total rows: 5905370


In [4]:
feature_cols = [
    "metric.AC_POWER.MEASURED",
#    "metric.DC_POWER.MEASURED",
#    "metric.AC_CURRENT_A.MEASURED",
#    "metric.AC_CURRENT_B.MEASURED",
#    "metric.AC_CURRENT_C.MEASURED",
#    "metric.DC_CURRENT.MEASURED",
#    "metric.DC_CURRENT_AVG.MEASURED",
#    "metric.DC_CURRENT_MAX.MEASURED",
#    "metric.FREQUENCY.MEASURED",
#    "metric.POWER_FACTOR.MEASURED",
#    "metric.HEARTBEAT.MEASURED",
#    "metric.COMM_LINK.MEASURED",
#    "metric.STATUS_WARNING_WORD.MEASURED",
#    "metric.STATUS_FAULT_WORD.MEASURED",
#    "metric.STATUS_IGBT_MAX_TEMP.MEASURED",
#    "metric.STATUS_INTERNAL_TEMP.MEASURED",
#    "metric.STATUS_INTERNAL_HUMIDITY.MEASURED"
]

train_df, test_df = train_test_split_on_time(labeled_df, 0.2)
val_df, test_df = train_test_split_on_time(test_df, 0.5)

from torch.utils.data import DataLoader

train_ds = InverterTimeSeriesDataset(train_df, feature_cols)
val_ds   = InverterTimeSeriesDataset(val_df,   feature_cols)
test_ds  = InverterTimeSeriesDataset(test_df,  feature_cols)

Train set size: 4724296 Train set time range: 2021-12-02 00:00:00 to 2024-10-27 01:00:00
Test set size: 1181074 Test set time range: 2024-10-27 01:00:00 to 2025-07-23 23:35:00
Train set size: 590537 Train set time range: 2024-10-27 01:00:00 to 2025-03-04 04:40:00
Test set size: 590537 Test set time range: 2025-03-04 04:40:00 to 2025-07-23 23:35:00


Processing devices: 100%|██████████| 16/16 [00:13<00:00,  1.22it/s]
Processing devices: 100%|██████████| 16/16 [00:01<00:00,  8.17it/s]
Processing devices: 100%|██████████| 16/16 [00:01<00:00,  8.96it/s]


In [5]:
batch_size = 2**15
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,  num_workers=6, pin_memory=True)
val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, num_workers=6, pin_memory=True)
test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False, num_workers=6, pin_memory=True)

In [6]:
model = CNNLSTMModel(num_features=len(feature_cols))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.MSELoss()

In [7]:
import torch
from torch.utils.data import DataLoader

def train_loop(model,
               train_loader: DataLoader,
               val_loader: DataLoader = None,
               device='cuda',
               optimizer=None,
               criterion=None,
               num_epochs=10,
               scheduler=None,
               log_interval=100):
    """
    通用 PyTorch 訓練迴圈，適用 CNN+LSTM 二分類模型
    """

    model = model.to(device)
    print(f"Model moved to {device}")

    for epoch in range(1, num_epochs + 1):
        model.train()
        print(f"🔁 Starting epoch {epoch}/{num_epochs}")
        total_loss = 0

        for batch_idx, (X, y) in enumerate(train_loader):
            #print(X.shape, y.shape)  # Debugging shape
            X = X.to(device, non_blocking=True)  # 建議加 non_blocking=True
            y = y.to(device, non_blocking=True)
            optimizer.zero_grad()
            output = model(X)
            output = output.squeeze()  # [B] if needed

            loss = criterion(output, y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            if batch_idx % log_interval == 0:
                print(f"[Epoch {epoch}/{num_epochs}] Step {batch_idx}/{len(train_loader)} - Loss: {loss.item():.4f}")

        avg_train_loss = total_loss / len(train_loader)
        print(f"🔁 Epoch {epoch} finished. Avg Train Loss: {avg_train_loss:.4f}")

        # 驗證階段
        if val_loader is not None:
            model.eval()
            val_loss = 0
            correct = 0
            total = 0

            with torch.no_grad():
                for X_val, y_val in val_loader:
                    X_val, y_val = X_val.to(device), y_val.to(device).float()
                    output = model(X_val).squeeze()
                    val_loss += criterion(output, y_val).item()

                    pred = torch.sigmoid(output) > 0.5
                    correct += (pred == y_val).sum().item()
                    total += y_val.size(0)

            avg_val_loss = val_loss / len(val_loader)
            accuracy = correct / total
            print(f"✅ Validation Loss: {avg_val_loss:.4f} | Accuracy: {accuracy:.2%}")

        # scheduler step if used
        if scheduler is not None:
            scheduler.step()

    print("🏁 Training completed.")

In [8]:
train_loop(model, train_loader, log_interval=10, num_epochs=10, optimizer=optimizer, criterion=criterion)

Model moved to cuda
🔁 Starting epoch 1/10
[Epoch 1/10] Step 0/145 - Loss: 0.2636
[Epoch 1/10] Step 10/145 - Loss: 0.2244
[Epoch 1/10] Step 20/145 - Loss: 0.1674
[Epoch 1/10] Step 30/145 - Loss: 0.0791
[Epoch 1/10] Step 40/145 - Loss: 0.0224
[Epoch 1/10] Step 50/145 - Loss: 0.0155
[Epoch 1/10] Step 60/145 - Loss: 0.0148
[Epoch 1/10] Step 70/145 - Loss: 0.0152
[Epoch 1/10] Step 80/145 - Loss: 0.0136
[Epoch 1/10] Step 90/145 - Loss: 0.0136
[Epoch 1/10] Step 100/145 - Loss: 0.0127
[Epoch 1/10] Step 110/145 - Loss: 0.0134
[Epoch 1/10] Step 120/145 - Loss: 0.0136
[Epoch 1/10] Step 130/145 - Loss: 0.0130
[Epoch 1/10] Step 140/145 - Loss: 0.0148
🔁 Epoch 1 finished. Avg Train Loss: 0.0531
🔁 Starting epoch 2/10
[Epoch 2/10] Step 0/145 - Loss: 0.0141


KeyboardInterrupt: 