In [None]:
import os
os.environ['HF_HOME'] = "D:/Workforce_Attire/.cache"

In [1]:
import transformers

In [2]:
feature_extractor = transformers.YolosFeatureExtractor.from_pretrained('hustvl/yolos-tiny')
model = transformers.YolosForObjectDetection.from_pretrained('hustvl/yolos-tiny')



In [3]:
feature_extractor

YolosFeatureExtractor {
  "do_normalize": true,
  "do_pad": true,
  "do_rescale": true,
  "do_resize": true,
  "format": "coco_detection",
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "YolosFeatureExtractor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "longest_edge": 1333,
    "shortest_edge": 512
  }
}

In [4]:
import datasets
es = datasets.load_dataset("adam-narozniak/clothing")

In [None]:
def extractor(ex):
    return feature_extractor(ex["image"], return_tensors="pt", size={"height": 800, "width": 800})


def map_to_labels(example):
    objects = example["objects"]
    width = example["width"]
    height = example["height"]
    return {
        "label_ids":
            {
                "class_labels": objects["category"],
                "boxes":  center_bbox_xy(rescale_bboxes_to_img(objects["bbox"], width, height)), # dont know if center_bbox_xy is needed
                #"area":  objects["area"],
                #"iscrowd": torch.Tensor([0]),
                #"orig_size": torch.Tensor([width, height]).int(),
                #"size": torch.Tensor([example['pixel_values'].shape[1:]])[0].int() dont know if needed
            } 
   }
    
small_train_dataset = es["train"].shuffle(seed=42).select(range(100)).map(extractor, batched=True, batch_size=5).with_format(type="pt", columns=['pixel_values'], output_all_columns=True).map(map_to_labels)
small_eval_dataset = es["train"].shuffle(seed=24).select(range(10)).map(extractor, batched=True, batch_size=5).with_format(type="pt", columns=['pixel_values'], output_all_columns=True).map(map_to_labels)

In [5]:
def extractor(ex):
    return feature_extractor(ex["image"], return_tensors="pt", size={"height": 800, "width": 800})


def map_to_labels(example):
    objects = example["objects"]
    return {
        "label_ids":
            {
                "class_labels": objects["category"],
                "boxes":  objects["bbox"]
            } 
   }
    
small_train_dataset = es["train"].shuffle(seed=42).select(range(1000)).map(extractor, batched=True, batch_size=5).map(map_to_labels).with_format(type="pt", columns=['pixel_values'], output_all_columns=True)
small_eval_dataset = es["train"].shuffle(seed=24).select(range(100)).map(extractor, batched=True, batch_size=5).map(map_to_labels).with_format(type="pt", columns=['pixel_values'], output_all_columns=True)

In [6]:
import torch

In [7]:
def YOLODataCollator(inputs):
    return {
        "pixel_values": torch.stack([el["pixel_values"] for el in inputs]),
        "labels": [
            {                
                "class_labels": torch.tensor(el["label_ids"]["class_labels"]),
                "boxes":  torch.tensor(el["label_ids"]["boxes"]).float()
            } 
            for el in inputs
        ]
    }

labels (List[Dict] of len (batch_size,), optional) — Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch respectively). The class labels themselves should be a torch.LongTensor of len (number of bounding boxes in the image,) and the boxes a torch.FloatTensor of shape (number of bounding boxes in the image, 4).

In [8]:
import numpy as np

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="fine-tune",
    evaluation_strategy="epoch",
    label_names=["label_ids"],
    per_device_train_batch_size=2
)

In [9]:
trainer = Trainer(
    data_collator=YOLODataCollator,
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    
)

In [10]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,12461.822,No log
2,12456.442,No log
3,12453.371,No log


TrainOutput(global_step=1500, training_loss=12457.211666666666, metrics={'train_runtime': 388.6326, 'train_samples_per_second': 7.719, 'train_steps_per_second': 3.86, 'total_flos': 2.2425071616e+17, 'train_loss': 12457.211666666666, 'epoch': 3.0})

In [34]:
small_eval_dataset_pre = es["train"].shuffle(seed=24).select(range(100))

In [35]:
example = small_eval_dataset_pre.map(map_to_labels)[0]
example

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1080x1080>,
 'objects': {'bbox_id': [4439],
  'category': [8],
  'bbox': [[11, 50, 1055, 1025]],
  'area': [1017900],
  'genre': [None]},
 'width': 1080,
 'height': 1080,
 'labels': [{'boxes': [11, 50, 1055, 1025], 'class_labels': 8}]}

In [24]:
small_eval_dataset[0]

{'pixel_values': tensor([[[-0.0458,  0.1597,  0.0912,  ...,  1.0159,  1.0331,  1.0331],
          [-0.2856, -0.1314,  0.0227,  ...,  1.0502,  1.0673,  1.0673],
          [-0.3027, -0.2684, -0.3198,  ...,  1.1015,  1.1187,  1.0844],
          ...,
          [ 1.5125,  1.5125,  1.5125,  ...,  0.3309,  0.8447,  1.2385],
          [ 1.5125,  1.5125,  1.5125,  ...,  0.2624,  0.3652,  1.0673],
          [ 1.5125,  1.5125,  1.5125,  ...,  0.4166,  0.3994,  0.9988]],
 
         [[-0.5826, -0.3725, -0.4426,  ...,  1.1506,  1.1856,  1.1856],
          [-0.8277, -0.6702, -0.5126,  ...,  1.1856,  1.2206,  1.2206],
          [-0.8452, -0.8102, -0.8452,  ...,  1.2381,  1.2731,  1.2381],
          ...,
          [ 1.6758,  1.6758,  1.6758,  ...,  0.2752,  0.8004,  1.2031],
          [ 1.6758,  1.6758,  1.6758,  ...,  0.2052,  0.3102,  1.0280],
          [ 1.6758,  1.6758,  1.6758,  ...,  0.3627,  0.3452,  0.9580]],
 
         [[-0.5147, -0.3230, -0.4101,  ...,  1.3328,  1.4025,  1.4374],
          [-

In [16]:
es["train"][1]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1080x1080>,
 'objects': {'bbox_id': [2, 3, 4],
  'category': [13, 0, 4],
  'bbox': [[587, 595, 674, 646], [591, 747, 663, 996], [504, 687, 796, 1099]],
  'area': [4437, 17928, 120304],
  'genre': ['woman', 'woman', 'woman']},
 'width': 1080,
 'height': 1080}

In [18]:
feature_extractor(es["train"][0]["image"], return_tensors="pt", size={"height": 800, "width": 800})

{'pixel_values': tensor([[[[2.2489, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2489],
          [2.2489, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2489],
          [2.2489, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2489],
          ...,
          [2.2489, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2489],
          [2.2489, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2489],
          [2.2489, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2489]],

         [[2.4286, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.4286],
          [2.4286, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.4286],
          [2.4286, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.4286],
          ...,
          [2.4286, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.4286],
          [2.4286, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.4286],
          [2.4286, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.4286]],

         [[2.6400, 2.6400, 2.6400,  ..., 2.6400, 2.6400, 2.6400],
          [2.6400, 2.6400, 2.6400,  ..., 2.6400, 2.6400, 2.6400],
          [2.6400, 2.6400