In [1]:
import torch
from torch.utils.data import Dataset
from PIL import Image
import json
import os


class CustomDataset(Dataset):
    def __init__(self, image_folder, annotation_file, transform=None):
        self.image_folder = image_folder
        self.transform = transform

        with open(annotation_file, "r") as f:
            self.annotations = json.load(f)
        

    def __len__(self):
        return len(self.annotations["images"])

    def __getitem__(self, idx):
        img_name = self.annotations["images"][idx]["file_name"]
        img_path = os.path.join(os.getcwd() + "\\" + self.image_folder, img_name)
        
        # Загрузка изображения
        image = Image.open(img_path).convert("RGB")
        # id изображения для маппинга в аннотации
        image_id = self.annotations["images"][idx]["id"]
        annotations = [ann for ann in self.annotations["annotations"] if ann["image_id"] == image_id]

        for i in range(len(annotations)):
            annotations[i] = {k:v for k, v in annotations[i].items() if k in ["bbox", "category_id"]}

        sample = {"image": image, "annotations": annotations}

        if self.transform:
            sample['image'] = self.transform(sample["image"])

        return sample

In [29]:
from torchvision import transforms
train_dataset = CustomDataset(image_folder="data\\train_images\\train_images", annotation_file="data\\usdc_train.json", transform=transforms.ToTensor())

sample = train_dataset[0]
sample

{'image': tensor([[[0.9373, 0.9333, 0.9255,  ..., 0.9922, 0.9922, 0.9922],
          [0.9765, 0.9725, 0.9647,  ..., 0.9882, 0.9882, 0.9882],
          [0.9765, 0.9765, 0.9725,  ..., 0.9804, 0.9765, 0.9765],
          ...,
          [0.0745, 0.0745, 0.0745,  ..., 0.1255, 0.1255, 0.1255],
          [0.0745, 0.0745, 0.0745,  ..., 0.1255, 0.1255, 0.1255],
          [0.0745, 0.0745, 0.0745,  ..., 0.1255, 0.1255, 0.1255]],
 
         [[0.9725, 0.9686, 0.9608,  ..., 0.9882, 0.9922, 0.9922],
          [1.0000, 1.0000, 1.0000,  ..., 0.9922, 0.9961, 0.9961],
          [1.0000, 1.0000, 1.0000,  ..., 0.9961, 1.0000, 1.0000],
          ...,
          [0.0745, 0.0745, 0.0745,  ..., 0.1294, 0.1294, 0.1294],
          [0.0745, 0.0745, 0.0745,  ..., 0.1294, 0.1294, 0.1294],
          [0.0745, 0.0745, 0.0745,  ..., 0.1294, 0.1294, 0.1294]],
 
         [[0.9608, 0.9569, 0.9490,  ..., 1.0000, 1.0000, 0.9922],
          [1.0000, 0.9961, 0.9882,  ..., 1.0000, 0.9922, 0.9843],
          [1.0000, 1.0000, 0.99

In [63]:
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torch.utils.data import DataLoader
from tqdm import tqdm
from torchvision import transforms
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights
from torchvision.ops import box_convert


def collate_fn(batch):
    images = [item['image'] for item in batch]

    res = [] 
    for item in batch:
        d = {}
        _boxes = torch.Tensor()
        _labels = []
        for ann in item['annotations']:
            _boxes = torch.cat([_boxes, box_convert(torch.Tensor(ann["bbox"]).unsqueeze(0), "xywh", "xyxy")], dim=0)
            _labels.append(ann["category_id"])
        #_boxes = torch.Tensor(_boxes)
        _labels = torch.Tensor(_labels, dtype=torch.long)
        d["boxes"] = _boxes
        d["labels"] = _labels
        res.append(d)
    print({"images": images, "targets": res})
    return {"images": images, "targets": res}


device = torch.device("cpu")#torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=FasterRCNN_ResNet50_FPN_Weights.DEFAULT) # pretrained=False
model.to(device)

train_dataset = CustomDataset(image_folder="data\\train_images\\train_images", annotation_file="data\\usdc_train.json", transform=transforms.ToTensor())

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
criterion = torch.nn.CrossEntropyLoss()


Using device: cpu


In [64]:
next(iter(train_loader))

TypeError: new() received an invalid combination of arguments - got (list, dtype=torch.dtype), but expected one of:
 * (*, torch.device device)
      didn't match because some of the keywords were incorrect: dtype
 * (torch.Storage storage)
 * (Tensor other)
 * (tuple of ints size, *, torch.device device)
 * (object data, *, torch.device device)


In [57]:
from torchvision.ops import box_convert
# Обучение модели
num_epochs = 10
for epoch in range(num_epochs):
    epoch_loss = 0.0
    with tqdm(total=len(train_loader), desc=f'Epoch {epoch + 1}/{num_epochs}', unit='batch') as pbar:
        for batch in train_loader:
            images, targets = batch['images'], batch['targets']
            
            print("Images: ", len(images))
            print("Targets: ", targets)
            # Обработка батча для передачи в модель
            images = [image.to(device) for image in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            
            # print(targets)
            optimizer.zero_grad()
            outputs = model(images, targets)
            loss = sum(loss for loss in outputs.values())
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            pbar.set_postfix({'Loss': loss.item()})
            pbar.update(1)

        lr_scheduler.step()

        
    average_loss = epoch_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss:.4f}')
    
# Сохранение обученной модели
torch.save(model.state_dict(), 'trained_model.pth')


Epoch 1/10:   0%|          | 0/6000 [00:00<?, ?batch/s]


TypeError: new() received an invalid combination of arguments - got (list, dtype=torch.dtype), but expected one of:
 * (*, torch.device device)
      didn't match because some of the keywords were incorrect: dtype
 * (torch.Storage storage)
 * (Tensor other)
 * (tuple of ints size, *, torch.device device)
 * (object data, *, torch.device device)


In [41]:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=FasterRCNN_ResNet50_FPN_Weights.DEFAULT)

images, boxes = torch.rand(4, 3, 600, 1200), torch.rand(4, 11, 4)
boxes[:, :, 2:4] = boxes[:, :, 0:2] + boxes[:, :, 2:4]
labels = torch.randint(1, 91, (4, 11))
images = list(image for image in images)
targets = []
for i in range(len(images)):
    d = {}
    d['boxes'] = boxes[i]
    d['labels'] = labels[i]
    targets.append(d)
    
output = model(images, targets)
targets

[{'boxes': tensor([[0.7492, 0.6462, 1.6656, 0.7081],
          [0.1843, 0.2743, 0.9093, 0.7881],
          [0.3999, 0.7813, 0.7531, 1.4787],
          [0.5045, 0.0686, 0.9878, 0.5690],
          [0.6464, 0.3755, 0.8134, 1.3167],
          [0.1473, 0.9331, 0.4805, 1.6602],
          [0.1301, 0.2448, 0.4951, 0.3847],
          [0.9600, 0.0694, 1.8587, 1.0188],
          [0.4960, 0.5041, 1.1381, 1.3830],
          [0.5330, 0.3508, 0.5611, 1.2173],
          [0.6171, 0.8895, 1.5419, 1.0864]]),
  'labels': tensor([25, 89,  1, 62, 55, 77, 42, 62, 23, 33, 66])},
 {'boxes': tensor([[0.6264, 0.8237, 1.4536, 1.4076],
          [0.1774, 0.8349, 1.0666, 1.2072],
          [0.0416, 0.3344, 0.5200, 0.6437],
          [0.9647, 0.3202, 1.8209, 0.5414],
          [0.8127, 0.1840, 1.7877, 0.4728],
          [0.6258, 0.2144, 0.7492, 1.1665],
          [0.5382, 0.8123, 1.0069, 1.4412],
          [0.8278, 0.7779, 1.1098, 1.4268],
          [0.9727, 0.8746, 1.5330, 1.3195],
          [0.1366, 0.9256, 0.4242