In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import math
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
import numpy as np
from sklearn.model_selection import train_test_split
from torch.cuda.amp import autocast, GradScaler
import time

In [2]:
# Define LoRA module
class LoRA(nn.Module):
    def __init__(self, in_features, out_features, rank=4, alpha=1):
        super().__init__()
        self.rank = rank
        self.alpha = alpha
        self.scaling = self.alpha / self.rank

        # Low-rank decomposition matrices
        self.lora_A = nn.Parameter(torch.zeros((rank, in_features)))
        self.lora_B = nn.Parameter(torch.zeros((out_features, rank)))

        # Initialize weights
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        nn.init.zeros_(self.lora_B)

    def forward(self, x):
        # Low-rank adaptation
        return self.scaling * (x @ self.lora_A.T @ self.lora_B.T)


In [3]:
# Modify a linear layer with LoRA
class LinearWithLoRA(nn.Module):
    def __init__(self, linear_layer, rank=4, alpha=1):
        super().__init__()
        self.linear = linear_layer
        self.lora = LoRA(
            linear_layer.in_features,
            linear_layer.out_features,
            rank=rank,
            alpha=alpha
        )

    def forward(self, x):
        return self.linear(x) + self.lora(x)

In [4]:
# Custom dataset for furniture frames
class FurnitureFramesDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.samples = []

        # Dictionary to map class names to indices
        self.class_to_idx = {}

        # Load all image paths and labels
        folders = [f for f in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, f))]
        for idx, folder in enumerate(folders):
            self.class_to_idx[folder] = idx
            folder_path = os.path.join(root_dir, folder)

            # For each subfolder (01, etc.)
            subfolders = [sf for sf in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, sf))]
            for subfolder in subfolders:
                subfolder_path = os.path.join(folder_path, subfolder)
                image_files = [f for f in os.listdir(subfolder_path) if f.endswith('.jpg')]

                for img_file in image_files:
                    img_path = os.path.join(subfolder_path, img_file)
                    self.samples.append((img_path, idx))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_path, label = self.samples[idx]
        image = Image.open(img_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image, label

In [5]:
def apply_lora_to_model(model, rank=4, alpha=1):
    """Apply LoRA to all linear layers in the model"""
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            parent_name = '.'.join(name.split('.')[:-1])
            child_name = name.split('.')[-1]

            if parent_name:
                parent = model
                for attr in parent_name.split('.'):
                    parent = getattr(parent, attr)
                setattr(parent, child_name, LinearWithLoRA(module, rank=rank, alpha=alpha))
            else:
                setattr(model, child_name, LinearWithLoRA(module, rank=rank, alpha=alpha))

    return model

In [6]:
def print_gpu_info():
    """Print GPU information if available"""
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory Allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
        print(f"GPU Memory Reserved: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")
        print(f"CUDA Version: {torch.version.cuda}")


In [7]:
# measure the model size
def get_model_size(model):
    # Method 1: Calculate memory footprint from parameters
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()

    size_all_mb = (param_size + buffer_size) / 1024**2
    print(f"Model size: {size_all_mb:.3f} MB")

    # Method 2: Calculate LoRA adapter size (trainable parameters only)
    lora_param_size = 0
    for name, param in model.named_parameters():
        if 'lora' in name:
            lora_param_size += param.nelement() * param.element_size()

    lora_size_mb = lora_param_size / 1024**2
    print(f"LoRA adapter size: {lora_size_mb:.3f} MB")
    print(f"LoRA is {lora_size_mb/size_all_mb*100:.2f}% of full model size")

    # Method 3: Save and check file sizes
    import os
    # Save full model
    torch.save(model.state_dict(), "temp_full_model.pth")
    full_saved_size = os.path.getsize("temp_full_model.pth") / 1024**2

    # Save only LoRA weights
    lora_state_dict = {name: param for name, param in model.state_dict().items() if 'lora' in name}
    torch.save(lora_state_dict, "temp_lora_only.pth")
    lora_saved_size = os.path.getsize("temp_lora_only.pth") / 1024**2

    print(f"Saved full model size: {full_saved_size:.3f} MB")
    print(f"Saved LoRA weights size: {lora_saved_size:.3f} MB")
    print(f"Saved LoRA is {lora_saved_size/full_saved_size*100:.2f}% of saved full model")

    # Clean up temporary files
    os.remove("temp_full_model.pth")
    os.remove("temp_lora_only.pth")

    return {
        'full_model_mb': size_all_mb,
        'lora_adapter_mb': lora_size_mb,
        'saved_full_model_mb': full_saved_size,
        'saved_lora_only_mb': lora_saved_size
    }

In [9]:
def main():
    # Set device
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
        print(f"CUDA Device Count: {torch.cuda.device_count()}")
        # Set a specific GPU if multiple are available
        # torch.cuda.set_device(0) # Use first GPU
    else:
        device = torch.device("cpu")
        print("CUDA is not available. Using CPU instead.")

    # Data augmentation and normalization
    data_transforms = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    # Load dataset
    dataset = FurnitureFramesDataset(
        root_dir='data/masked_furniture_frames',
        transform=data_transforms
    )

    # Print dataset statistics
    print(f"Total samples: {len(dataset)}")
    print(f"Classes: {dataset.class_to_idx}")

    # Split dataset into training and validation sets
    train_indices, val_indices = train_test_split(
        range(len(dataset)),
        test_size=0.2,
        stratify=[label for _, label in dataset.samples],
        random_state=42
    )

    train_dataset = torch.utils.data.Subset(dataset, train_indices)
    val_dataset = torch.utils.data.Subset(dataset, val_indices)

    # Create data loaders with GPU acceleration
    batch_size = 16
    num_workers = 4
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                             num_workers=num_workers, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size,
                           num_workers=num_workers, pin_memory=True)

    # Load pre-trained model
    model = models.resnet50(pretrained=True)
    num_classes = len(dataset.class_to_idx)
    model.fc = nn.Linear(model.fc.in_features, num_classes)

    # Apply LoRA to the model
    lora_rank = 4  # Low-rank dimension
    model = apply_lora_to_model(model, rank=lora_rank, alpha=8)
    model = model.to(device)

    # Print model and GPU details
    print_gpu_info()
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()

    # Only train LoRA parameters
    lora_params = []
    for name, param in model.named_parameters():
        if 'lora' in name:
            param.requires_grad = True
            lora_params.append(param)
        else:
            param.requires_grad = False

    optimizer = optim.AdamW(lora_params, lr=0.002, weight_decay=0.01)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

    # Enable mixed precision training (faster on GPU)
    scaler = GradScaler()

    # Training loop
    num_epochs = 25
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1}/{num_epochs}')
        print('-' * 10)

        start_time = time.time()

        # Training phase
        model.train()
        running_loss = 0.0
        running_corrects = 0

        for inputs, labels in train_loader:
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            optimizer.zero_grad()

            # Use mixed precision for forward pass
            with autocast():
                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)

            # Scale gradients and optimize
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)

        scheduler.step()

        epoch_loss = running_loss / len(train_dataset)
        epoch_acc = running_corrects.double() / len(train_dataset)
        time_elapsed = time.time() - start_time

        print(f'Train Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')
        print(f'Time: {time_elapsed:.2f}s')

        # Validation phase
        model.eval()
        running_loss = 0.0
        running_corrects = 0

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs = inputs.to(device, non_blocking=True)
                labels = labels.to(device, non_blocking=True)

                with autocast():
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

        epoch_loss = running_loss / len(val_dataset)
        epoch_acc = running_corrects.double() / len(val_dataset)

        print(f'Val Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

        # Print GPU memory usage
        if torch.cuda.is_available():
            print(f"GPU Memory Allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
            print(f"GPU Memory Reserved: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")

        # Save the best model
        if epoch_acc > best_acc:
            best_acc = epoch_acc
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'accuracy': epoch_acc,
            }, 'furniture_lora_best_model.pth')

    print(f'Best validation accuracy: {best_acc:.4f}')

    # Save LoRA weights separately (for easier deployment)
    lora_state_dict = {}
    for name, param in model.named_parameters():
        if 'lora' in name:
            lora_state_dict[name] = param.data

    torch.save(lora_state_dict, 'furniture_lora_weights.pth')
    print("Training complete. LoRA weights saved.")

    # Measure inference time
    model.eval()
    dummy_input = torch.randn(1, 3, 224, 224, device=device)

    # Warm up
    with torch.no_grad():
        for _ in range(10):
            _ = model(dummy_input)

    # Benchmark
    torch.cuda.synchronize()
    start_time = time.time()
    with torch.no_grad():
        for _ in range(100):
            _ = model(dummy_input)
    torch.cuda.synchronize()
    end_time = time.time()

    inference_time = (end_time - start_time) / 100
    print(f"Average inference time: {inference_time * 1000:.2f} ms")

    # Print model size information
    model_size_info = get_model_size(model)

    # You can also compare different LoRA ranks
    print("\n--- LoRA Size Comparison ---")
    print(f"Current LoRA rank: {lora_rank}")
    print(f"Current LoRA size: {model_size_info['lora_adapter_mb']:.3f} MB")
    print(f"Estimated size with rank=2: {model_size_info['lora_adapter_mb'] * 2/lora_rank:.3f} MB")
    print(f"Estimated size with rank=8: {model_size_info['lora_adapter_mb'] * 8/lora_rank:.3f} MB")
    print("----------------------------\n")

if __name__ == '__main__':
    main()

CUDA is not available. Using CPU instead.
Total samples: 2550
Classes: {'kitchen_table': 0, 'drawer': 1, 'bench': 2, 'bed': 3, 'desk': 4, 'cushion': 5, 'armchair': 6, 'recliner': 7, 'ottoman': 8, 'stool': 9, 'coffee_table': 10, 'cabinet': 11, 'pendulum': 12, 'footstool': 13, 'chair': 14, 'beanbag': 15, 'sofa': 16, 'table': 17, 'folding_chair': 18, 'dining_table': 19}


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /Users/hannahkim/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:01<00:00, 63.0MB/s]


Total parameters: 23,557,284
Trainable parameters: 23,557,284
Epoch 1/25
----------


Traceback (most recent call last):
Traceback (most recent call last):
  File "<string>", line 1, in <module>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/lib/python3.11/multiprocessing/spawn.py", line 122, in spawn_main
  File "/opt/anaconda3/lib/python3.11/multiprocessing/spawn.py", line 122, in spawn_main
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/lib/python3.11/multiprocessing/spawn.py", line 122, in spawn_main
  File "/opt/anaconda3/lib/python3.11/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
    exitcode = _main(fd, parent_sentinel)
        exitcode = _main(fd, parent_sentinel)exitcode = _main(fd, parent_sentinel)

                        ^ ^ ^ ^  ^ ^^  ^  ^ ^ ^  ^  ^  ^  ^  ^ ^ ^  ^ ^ ^ ^ ^ ^^ ^^^^^ ^^ ^^ ^ ^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/multiprocessin

RuntimeError: DataLoader worker (pid(s) 19888, 19889, 19890, 19891) exited unexpectedly