In [16]:
import torch
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models.detection.faster_rcnn import TwoMLPHead
from torchvision.models import resnet50
from torchvision import transforms
from collections import OrderedDict
from torchvision.models.detection.image_list import ImageList

#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device("cuda:2")  # Replace 7 with your desired device index


transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((640, 480))  # Resize images to a specific size
])

train_data = torchvision.datasets.CocoDetection(root='data/train2017', annFile='data/annotations/instances_train2017.json', transform=transform)
test_data = torchvision.datasets.CocoDetection(root='data/val2017', annFile='data/annotations/instances_val2017.json', transform=transform)

# Define data loaders with transform
train_loader = torch.utils.data.DataLoader(
    train_data, batch_size=4, shuffle=True,
    collate_fn=lambda x: tuple(zip(*x)) if x else []
)
test_loader = torch.utils.data.DataLoader(
    test_data, batch_size=4, shuffle=False,
    collate_fn=lambda x: tuple(zip(*x)) if x else []
)

loading annotations into memory...
Done (t=26.73s)
creating index...
index created!
loading annotations into memory...
Done (t=1.08s)
creating index...
index created!


In [127]:
def print_cuda_device_properties():
    device_count = torch.cuda.device_count()
    print(f"Found {device_count} CUDA device(s)")

    for device_idx in range(device_count):
        print(device_idx)
        device_props = torch.cuda.get_device_properties(device_idx)
        print(f"Device {device_idx}: {device_props.name}")

print_cuda_device_properties()

Found 8 CUDA device(s)
0
Device 0: NVIDIA RTX A6000
1
Device 1: NVIDIA RTX A6000
2
Device 2: NVIDIA RTX A6000
3
Device 3: NVIDIA RTX A6000
4
Device 4: NVIDIA RTX A6000
5
Device 5: NVIDIA RTX A6000
6
Device 6: NVIDIA RTX A6000
7
Device 7: NVIDIA RTX A6000


In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'


import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import torch
import torch.nn as nn
import torchvision
from torchvision.models import resnet50
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.faster_rcnn import TwoMLPHead
from torchvision import transforms
from collections import OrderedDict
from torchvision.models.detection.image_list import ImageList

#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device("cuda:7")  # Replace 7 with your desired device index


transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((640, 480))  # Resize images to a specific size
])

train_data = torchvision.datasets.CocoDetection(root='data/train2017', annFile='data/annotations/instances_train2017.json', transform=transform)
test_data = torchvision.datasets.CocoDetection(root='data/val2017', annFile='data/annotations/instances_val2017.json', transform=transform)

def collate_fn(batch):
    images = [item[0] for item in batch]
    targets = [item[1] for item in batch]

    images = torch.stack(images)

    return images, targets



# Define data loaders with transform
train_loader = torch.utils.data.DataLoader(
    train_data, batch_size=1, shuffle=True,
    collate_fn=collate_fn)
test_loader = torch.utils.data.DataLoader(
    test_data, batch_size=1, shuffle=False,
    collate_fn=collate_fn)

# Define the loss function
criterion = nn.CrossEntropyLoss()

class SimpleModel(nn.Module):
    def __init__(self, backbone, num_classes):
        super(SimpleModel, self).__init__()
        self.backbone = backbone
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        feature_size = 2048  # Specify the desired feature size
        self.fc = nn.Linear(feature_size, num_classes)
    
    def forward(self, x):
        features = self.backbone(x)
        features = features.unsqueeze(2).unsqueeze(3)  # Reshape the tensor to (batch_size, 2048, 1, 1)
        features = self.avgpool(features)
        features = features.view(features.size(0), -1)
        out = self.fc(features)
        return out


device = torch.device("cuda:7")  # Replace 7 with your desired device index

# Load the ResNet-50 backbone
backbone = models.resnet50(pretrained=True)

# Modify the last layer of the backbone
backbone.fc = nn.Identity()

# Create the model with the modified ResNet-50 backbone
model = SimpleModel(backbone, num_classes=80).to(device)

# Define anchor generator
anchor_sizes = [(32.0 * 2 ** i,) for i in range(5)]  # Updated to tuples
aspect_ratios = [(0.5, 1.0, 2.0) for _ in range(5)]  # Updated to tuples
anchor_generator = AnchorGenerator(sizes=anchor_sizes, aspect_ratios=aspect_ratios)

# Configure quantization-aware training
model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
model_qat = torch.quantization.prepare_qat(model)

# Define the optimizer, learning rate scheduler, and number of epochs
optimizer = torch.optim.SGD(model_qat.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
num_epochs = 80

transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor()
])

for epoch in range(num_epochs):
    model_qat.train()
    for images, targets in train_loader:
        images = torch.stack([transform(image.squeeze()) for image in images]).to(device)
        max_segmentation_length = max(len(seg) for image_targets in targets for target in image_targets for seg in target.get('segmentation', []))

        targets = [
            [
                {
                    k: [torch.tensor(vv + [0] * (max_segmentation_length - len(vv))) if isinstance(vv, list) else vv for vv in v]
                    if k == 'segmentation' else v
                    for k, v in target.items()
                }
                for target in image_targets
            ]
            for image_targets in targets
        ]


        optimizer.zero_grad()
        outputs = model_qat(images)
        # Rest of the training code
        
        # Extract category IDs from the labels
        category_ids = [target['category_id'] for image_targets in targets for target in image_targets]
        num_predictions = outputs.size(0)
        # Convert the category IDs to a tensor
        labels = torch.tensor(category_ids, dtype=torch.long).to(device)
        labels = labels[:num_predictions]

        target_batch_size = len([target['category_id'] for image_targets in targets for target in image_targets])

        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()
    lr_scheduler.step()

    # ...

    
    
    model_int8 = torch.quantization.convert(model_qat.eval())


loading annotations into memory...
Done (t=25.87s)
creating index...
index created!
loading annotations into memory...
Done (t=1.07s)
creating index...
index created!


  return torch.fused_moving_avg_obs_fake_quant(
  return torch.fused_moving_avg_obs_fake_quant(
../aten/src/ATen/native/cuda/Loss.cu:257: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [0,0,0] Assertion `t >= 0 && t < n_classes` failed.


RuntimeError: CUDA error: device-side assert triggered

In [3]:
outputs

RuntimeError: CUDA error: device-side assert triggered