In [None]:
import os
import cv2
import time
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from torch.amp import GradScaler, autocast
from unet_model import UNet

In [None]:
# Colab-only step: attach Google Drive so the dataset paths used by the
# later cells resolve under the mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
class DefectDataset(Dataset):
    """Grayscale image / multi-channel mask dataset for defect segmentation.

    Every file name found in ``image_dir`` is one sample. For each sample the
    mask with the same file name is read from every directory in
    ``mask_dirs`` and the masks are stacked along the channel axis.

    Args:
        image_dir: Directory containing the input images.
        mask_dirs: Directories holding the per-class masks; their order
            defines the channel order of the returned mask tensor.
        img_size: Side length in pixels that images and masks are resized to.
    """

    def __init__(self, image_dir, mask_dirs, img_size=128):
        self.image_dir = image_dir
        self.mask_dirs = mask_dirs
        # Sorted so that the index -> file mapping is deterministic.
        self.image_filenames = sorted(os.listdir(image_dir))
        self.img_size = img_size

    def __len__(self):
        """Return the number of entries listed in ``image_dir``."""
        return len(self.image_filenames)

    def _load_sample(self, idx):
        """Read, resize and normalise the image and masks at position ``idx``.

        Returns:
            ``(image, mask)`` float32 tensors scaled to [0, 1]; ``image`` has
            shape ``(1, img_size, img_size)`` and ``mask`` has shape
            ``(len(mask_dirs), img_size, img_size)``.

        Raises:
            RuntimeError: If the image or any of its masks cannot be read.
        """
        img_name = self.image_filenames[idx]
        img_path = os.path.join(self.image_dir, img_name)

        # Load image
        image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            raise RuntimeError(f"Failed to load image: {img_path}")

        image = cv2.resize(image, (self.img_size, self.img_size))
        image = image.astype(np.float32) / 255.0
        image = np.expand_dims(image, axis=0)  # add the channel axis

        # Load masks
        # NOTE(review): cv2.resize defaults to bilinear interpolation, so mask
        # edges become fractional after resizing. Consider INTER_NEAREST if
        # strictly binary targets are wanted.
        masks = []
        for mask_dir in self.mask_dirs:
            mask_path = os.path.join(mask_dir, img_name)
            mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
            if mask is None:
                raise RuntimeError(f"Failed to load mask: {mask_path}")
            mask = cv2.resize(mask, (self.img_size, self.img_size))
            masks.append(mask)

        mask = np.stack(masks, axis=0).astype(np.float32) / 255.0
        return torch.tensor(image, dtype=torch.float32), torch.tensor(mask, dtype=torch.float32)

    def __getitem__(self, idx):
        """Return the sample at ``idx``, falling back to the next loadable one.

        A sample that fails to load is reported and skipped; the following
        indices are tried in order (wrapping around) until one loads. Each
        index is tried at most once, so the search always terminates.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
            RuntimeError: If no sample in the dataset can be loaded.
        """
        total = len(self)
        # Out-of-range access must surface as IndexError (sequence protocol)
        # instead of being treated as a corrupt sample and wrapped around.
        if not -total <= idx < total:
            raise IndexError(f"Index {idx} out of range for dataset of size {total}")

        start = idx % total  # normalise negative indices
        for offset in range(total):
            candidate = (start + offset) % total
            try:
                return self._load_sample(candidate)
            except Exception as e:
                print(f"[WARNING] Skipping sample at index {candidate}: {e}")

        raise RuntimeError(f"No loadable sample found in {self.image_dir}")

In [None]:
# Dataset root on the mounted Drive; each split lives in its own sub-folder.
base_dir = "/content/drive/MyDrive/Info_Project/Defect_Detection/DataSets/Data.Splitting/After_Melting_Defect_Detection"
train_dir, val_dir = (os.path.join(base_dir, split) for split in ("train", "val"))

In [None]:
# Load datasets
# Defect classes to segment; one mask folder ("Defect_Class<id>") per class,
# in this order, becomes one channel of the target mask.
DEFECT_CLASS_IDS = [0, 5, 8, 9, 10, 11]


def _make_split_dataset(split_dir):
    """Build the DefectDataset for one split directory (train or val)."""
    return DefectDataset(
        image_dir=os.path.join(split_dir, "Img.After.Melting"),
        mask_dirs=[os.path.join(split_dir, f"Defect_Class{i}") for i in DEFECT_CLASS_IDS],
    )


train_dataset = _make_split_dataset(train_dir)
val_dataset = _make_split_dataset(val_dir)

# Create DataLoaders
# os.cpu_count() may return None when the CPU count cannot be determined;
# fall back to 1 so the arithmetic below cannot raise TypeError.
num_workers = min(4, (os.cpu_count() or 1) // 2)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=num_workers, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=num_workers, pin_memory=True)

In [None]:
# Model setup: choose the compute device first, then build the network on it.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Let cuDNN benchmark convolution algorithms and keep the fastest per input shape.
torch.backends.cudnn.benchmark = True

# Single-channel (grayscale) input, six output channels.
model = UNet(in_channels=1, out_channels=6)
model = model.to(device)

# Loss and optimizer: BCE on raw logits, Adam with a fixed learning rate.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
# Training loop with mixed precision and best-checkpoint tracking
num_epochs = 6
best_val_loss = float("inf")

# Mixed precision is only enabled on CUDA. On CPU both autocast and the
# scaler are disabled, so the same loop runs in full precision instead of
# requesting a CUDA context that does not exist.
use_amp = device.type == "cuda"
scaler = GradScaler(device.type, enabled=use_amp)

# Create the checkpoint folder up front so the first save cannot fail after
# an epoch of training has already been spent.
checkpoint_dir = os.path.join(base_dir, "trained_model")
os.makedirs(checkpoint_dir, exist_ok=True)


def _run_epoch(loader, desc, train):
    """Run one pass over ``loader`` and return the mean per-batch loss.

    Args:
        loader: DataLoader yielding ``(images, masks)`` batches.
        desc: Label shown on the progress bar.
        train: When True the model is put in train mode and the optimizer is
            stepped; otherwise the model is in eval mode and no gradients
            are computed.
    """
    model.train(train)
    total_loss = 0.0
    progress = tqdm(loader, desc=desc)

    with torch.set_grad_enabled(train):
        # Count steps explicitly: tqdm's `n` attribute is display state and
        # can lag behind the true step count between refreshes.
        for step, (images, masks) in enumerate(progress, start=1):
            images = images.to(device, non_blocking=True)
            masks = masks.to(device, non_blocking=True)

            if train:
                optimizer.zero_grad()

            with autocast(device.type, enabled=use_amp):
                outputs = model(images)
                loss = criterion(outputs, masks)

            if train:
                # Scale the loss so fp16 gradients do not underflow.
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()

            total_loss += loss.item()
            progress.set_postfix(avg_loss=total_loss / step)  # running mean loss

    return total_loss / len(loader)


for epoch in range(num_epochs):
    start_time = time.time()

    # Training Phase
    avg_train_loss = _run_epoch(train_loader, f"Epoch {epoch+1}/{num_epochs} [Training]", train=True)

    # Validation Phase
    avg_val_loss = _run_epoch(val_loader, f"Epoch {epoch+1}/{num_epochs} [Validation]", train=False)

    # Save best model
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), os.path.join(checkpoint_dir, "best_unet_model.pth"))
        print("Best model saved!")

    # Print Progress Summary
    epoch_time = time.time() - start_time
    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Time: {epoch_time:.2f}s")
    print(f"Learning Rate: {optimizer.param_groups[0]['lr']}")

    # Release cached GPU blocks once per epoch. Doing this after every batch
    # only forces the allocator to re-request the same memory.
    torch.cuda.empty_cache()

# Save final model
torch.save(model.state_dict(), os.path.join(checkpoint_dir, "after_melting_unet_model_2.pth"))
print("Final model saved!")

Epoch 1/6 [Training]:  54%|█████▍    | 85/157 [1:12:18<56:07, 46.77s/it, avg_loss=0.233]



Epoch 1/6 [Training]: 100%|██████████| 157/157 [2:11:18<00:00, 50.18s/it, avg_loss=0.149]
Epoch 1/6 [Validation]: 100%|██████████| 34/34 [29:36<00:00, 52.26s/it, avg_loss=0.0335]


Best model saved!
Epoch 1/6 - Train Loss: 0.1490, Val Loss: 0.0335, Time: 9664.82s
Learning Rate: 0.001


Epoch 2/6 [Training]:  22%|██▏       | 35/157 [01:29<05:07,  2.52s/it, avg_loss=0.0272]



Epoch 2/6 [Training]: 100%|██████████| 157/157 [06:32<00:00,  2.50s/it, avg_loss=0.0189]
Epoch 2/6 [Validation]: 100%|██████████| 34/34 [01:21<00:00,  2.41s/it, avg_loss=0.0126]


Best model saved!
Epoch 2/6 - Train Loss: 0.0189, Val Loss: 0.0126, Time: 474.87s
Learning Rate: 0.001


Epoch 3/6 [Training]:  15%|█▌        | 24/157 [01:03<05:35,  2.52s/it, avg_loss=0.0115]



Epoch 3/6 [Training]:  65%|██████▍   | 102/157 [04:17<02:16,  2.49s/it, avg_loss=0.0102]



Epoch 3/6 [Training]: 100%|██████████| 157/157 [07:03<00:00,  2.70s/it, avg_loss=0.00937]
Epoch 3/6 [Validation]: 100%|██████████| 34/34 [01:38<00:00,  2.91s/it, avg_loss=0.00716]


Best model saved!
Epoch 3/6 - Train Loss: 0.0094, Val Loss: 0.0072, Time: 522.99s
Learning Rate: 0.001


Epoch 4/6 [Training]:  86%|████████▌ | 135/157 [05:40<00:54,  2.50s/it, avg_loss=0.00657]



Epoch 4/6 [Training]: 100%|██████████| 157/157 [06:33<00:00,  2.51s/it, avg_loss=0.00655]
Epoch 4/6 [Validation]: 100%|██████████| 34/34 [01:22<00:00,  2.41s/it, avg_loss=0.00561]


Best model saved!
Epoch 4/6 - Train Loss: 0.0066, Val Loss: 0.0056, Time: 476.36s
Learning Rate: 0.001


Epoch 5/6 [Training]:  30%|██▉       | 47/157 [01:58<04:35,  2.51s/it, avg_loss=0.00544]



Epoch 5/6 [Training]: 100%|██████████| 157/157 [06:29<00:00,  2.48s/it, avg_loss=0.00532]
Epoch 5/6 [Validation]: 100%|██████████| 34/34 [01:23<00:00,  2.44s/it, avg_loss=0.00483]


Best model saved!
Epoch 5/6 - Train Loss: 0.0053, Val Loss: 0.0048, Time: 472.94s
Learning Rate: 0.001


Epoch 6/6 [Training]:  44%|████▍     | 69/157 [02:52<03:40,  2.50s/it, avg_loss=0.00455]



Epoch 6/6 [Training]: 100%|██████████| 157/157 [06:28<00:00,  2.47s/it, avg_loss=0.00489]
Epoch 6/6 [Validation]: 100%|██████████| 34/34 [01:22<00:00,  2.42s/it, avg_loss=0.00406]


Best model saved!
Epoch 6/6 - Train Loss: 0.0049, Val Loss: 0.0041, Time: 471.16s
Learning Rate: 0.001
Final model saved!
