In [52]:
import torch
from datasets import load_dataset, Dataset, DatasetDict
from PIL import Image
from torchvision.io import read_image
from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize
from torchvision.transforms.functional import InterpolationMode
from transformers import (
    Trainer,
    TrainingArguments,
    AutoProcessor,
    Owlv2ForObjectDetection
)
from pathlib import Path
import jsonlines

# Define the path to the directory
current_directory = Path.cwd()
file_path = current_directory / '..' / '..' / 'novice'
data_dir = file_path.resolve()
print(data_dir, current_directory)

# Define the path to the directory containing your images and annotations
image_dir = data_dir / "images"
annotation_file = data_dir / "vlm.jsonl"

# Initialize the OWLv2 model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "google/owlv2-base-patch16-ensemble"
processor = AutoProcessor.from_pretrained(model_name)
model = Owlv2ForObjectDetection.from_pretrained(model_name)
model.to(device)

/home/jupyter/novice /home/jupyter/til-24-base/vlm


Owlv2ForObjectDetection(
  (owlv2): Owlv2Model(
    (text_model): Owlv2TextTransformer(
      (embeddings): Owlv2TextEmbeddings(
        (token_embedding): Embedding(49408, 512)
        (position_embedding): Embedding(16, 512)
      )
      (encoder): Owlv2Encoder(
        (layers): ModuleList(
          (0-11): 12 x Owlv2EncoderLayer(
            (self_attn): Owlv2Attention(
              (k_proj): Linear(in_features=512, out_features=512, bias=True)
              (v_proj): Linear(in_features=512, out_features=512, bias=True)
              (q_proj): Linear(in_features=512, out_features=512, bias=True)
              (out_proj): Linear(in_features=512, out_features=512, bias=True)
            )
            (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
            (mlp): Owlv2MLP(
              (activation_fn): QuickGELUActivation()
              (fc1): Linear(in_features=512, out_features=2048, bias=True)
              (fc2): Linear(in_features=2048, out_features=

In [68]:
import torch
from torch.utils.data import Dataset
from PIL import Image
import json

class CustomObjectDetectionDataset(Dataset):
    def __init__(self, annotation_file, image_dir, processor):
        self.annotation_file = annotation_file
        self.image_dir = image_dir
        self.processor = processor
        
        # Load annotations
        with open(annotation_file, 'r') as f:
            self.annotations = [json.loads(line) for line in f.readlines()]

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        annotation = self.annotations[idx]
        image_path = self.image_dir / annotation["image"]
        image = Image.open(image_path).convert("RGB")
        
        # Process annotations
        labels = [ann["caption"] for ann in annotation["annotations"]]
        boxes = [ann["bbox"] for ann in annotation["annotations"]]
        encoded_labels = self.processor(labels, padding=True, truncation=True, return_tensors="pt")
        
        return image, {
            "labels": encoded_labels.input_ids,
            "boxes": torch.tensor(boxes, dtype=torch.float)
        }

In [71]:
from torchvision.transforms import functional as F
from torchvision.transforms import Compose, Resize, ToTensor

class ObjectDetectionTransform:
    def __init__(self, target_size=(416, 416), device="cpu"):
        self.target_size = target_size
        self.device = device

    def __call__(self, image, target):
        # Check if image is a tuple (this happens when using DataLoader with collate_fn)
        if isinstance(image, tuple):
            image = image[0]
        
        # Resize image
        image = F.resize(image, self.target_size)
        
        # Convert image to tensor
        image = F.to_tensor(image).to(self.device)
        
        # Check if target is a tuple (this happens when using DataLoader with collate_fn)
        if isinstance(target, tuple):
            target = target[0]
        
        # Move target to device
        target = {key: value.to(self.device) for key, value in target.items()}
        
        return image, target

transform = ObjectDetectionTransform(device=device)

In [72]:
from torch.utils.data import DataLoader, random_split

# Create dataset
dataset = CustomObjectDetectionDataset(annotation_file, image_dir, processor)

# Define train and validation split (you can adjust the ratio as needed)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Define batch size
batch_size = 8

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=lambda batch: list(zip(*batch)))
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=lambda batch: list(zip(*batch)))

In [79]:
def get_preprocessed_image(pixel_values):
    pixel_values = pixel_values.squeeze().numpy()
    unnormalized_image = (pixel_values * np.array(OPENAI_CLIP_STD)[:, None, None]) + np.array(OPENAI_CLIP_MEAN)[:, None, None]
    unnormalized_image = (unnormalized_image * 255).astype(np.uint8)
    unnormalized_image = np.moveaxis(unnormalized_image, 0, -1)
    unnormalized_image = Image.fromarray(unnormalized_image)
    return unnormalized_image

unnormalized_image = get_preprocessed_image(inputs.pixel_values)

target_sizes = torch.Tensor([unnormalized_image.size[::-1]])
# Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
results = processor.post_process_object_detection(
    outputs=outputs, threshold=0.2, target_sizes=target_sizes
)

for i, result in enumerate(results):
    print(f"Result {i}:")
    for key, value in result.items():
        print(f"{key}: {value.size()}")

NameError: name 'inputs' is not defined

In [78]:
from torch.utils.data import DataLoader, random_split
import torch.nn as nn
import torch.optim as optim

# Define loss function and optimizer
criterion = nn.SmoothL1Loss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Define number of epochs
num_epochs = 10

# Move model to device
model.to(device)

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for images, targets in train_loader:
        # Apply transformations and move data to device
        images, targets = transform(images, targets)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(pixel_values=images, attention_mask=None, input_ids=None)
        
        # Compute loss
        loss = sum(loss for loss in outputs.losses.values())
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * images.size(0)
    
    # Print average loss per epoch
    epoch_loss = running_loss / len(train_dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

print('Training finished!')

RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 3 but got size 768 for tensor number 1 in the list.

In [None]:
# Save the model state dict
torch.save(model.state_dict(), "fine_tuned_model.pth")

# Save the entire model including architecture and parameters
torch.save(model, "fine_tuned_model_full.pth")

# Optionally, save optimizer state
torch.save(optimizer.state_dict(), "optimizer_state.pth")

In [49]:
# Split the dataset into training, validation, and test sets
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset = dataset.select(range(train_size))
val_dataset = dataset.select(range(train_size, train_size + val_size))
test_dataset = dataset.select(range(train_size + val_size, train_size + val_size + test_size))
dataset = DatasetDict({'train': train_dataset, 'val': val_dataset, 'test': test_dataset})
dataset

DatasetDict({
    train: Dataset({
        features: ['bbox', 'input_ids', 'attention_mask'],
        num_rows: 19
    })
    val: Dataset({
        features: ['bbox', 'input_ids', 'attention_mask'],
        num_rows: 2
    })
    test: Dataset({
        features: ['bbox', 'input_ids', 'attention_mask'],
        num_rows: 3
    })
})

In [51]:
from sklearn.model_selection import train_test_split

# Extract image paths, bounding boxes, and labels
image_paths = dataset['image']
bboxes = dataset['bbox']
labels = dataset['label']

# Split indices for the original train, val, and test sets
train_indices, val_test_indices = train_test_split(range(len(image_paths)), test_size=0.2, random_state=42)
val_indices, test_indices = train_test_split(val_test_indices, test_size=0.5, random_state=42)

# Define new datasets based on the split indices
train_dataset = Dataset.from_dict({
    'image': [image_paths[i] for i in train_indices],
    'bbox': [bboxes[i] for i in train_indices],
    'label': [labels[i] for i in train_indices]
})
val_dataset = Dataset.from_dict({
    'image': [image_paths[i] for i in val_indices],
    'bbox': [bboxes[i] for i in val_indices],
    'label': [labels[i] for i in val_indices]
})
test_dataset = Dataset.from_dict({
    'image': [image_paths[i] for i in test_indices],
    'bbox': [bboxes[i] for i in test_indices],
    'label': [labels[i] for i in test_indices]
})

# Print the lengths of the datasets
print("Train dataset length:", len(train_dataset))
print("Validation dataset length:", len(val_dataset))
print("Test dataset length:", len(test_dataset))

# Define training arguments
training_args = TrainingArguments(
    output_dir=current_directory / "output",
    eval_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_dir=current_directory / "logs",
    logging_steps=10,
    save_steps=1000,
    save_total_limit=2,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=processor.feature_extractor,
)

# Fine-tune the model
trainer.train()

# Evaluate the model
trainer.evaluate()

# Save the trained model
model.save_pretrained(current_directory / "finetuned_model")

KeyError: 'image'

In [16]:
image_size = 512  # This should be the expected input size for OWLv2 model
image_mean = [0.485, 0.456, 0.406]  # Standard mean for normalization
image_std = [0.229, 0.224, 0.225]   # Standard std for normalization

# Define image transformation class
class Transform(torch.nn.Module):
    def __init__(self, image_size, mean, std):
        super().__init__()
        self.transforms = torch.nn.Sequential(
            Resize([image_size], interpolation=InterpolationMode.BICUBIC),
            CenterCrop(image_size),
            ConvertImageDtype(torch.float),
            Normalize(mean, std),
        )

    def forward(self, x) -> torch.Tensor:
        x = self.transforms(x)
        return x

# Initialize torchvision transforms
image_transformations = Transform(image_size, image_mean, image_std)
image_transformations = torch.jit.script(image_transformations)

# Preprocess dataset
def preprocess_dataset(dataset, split):
    data = dataset[split]

    def transform_images(examples):
        images = [read_image(image_file) for image_file in examples['image']]
        examples['pixel_values'] = [image_transformations(image) for image in images]
        return examples

    data = data.map(transform_images, batched=True, desc=f"Processing {split} dataset")
    return data

In [17]:
train_dataset = preprocess_dataset(dataset, "train")
val_dataset = preprocess_dataset(dataset, "val")



Processing train dataset:   0%|          | 0/24 [00:00<?, ? examples/s]

Processing val dataset:   0%|          | 0/3 [00:00<?, ? examples/s]

In [33]:
print(dataset['train'][0])
print(dataset['val'][0])
print(dataset['test'][0])

{'image': '/home/jupyter/novice/images/image_9.jpg', 'bbox': [1112, 216, 88, 28], 'label': 'white and orange commercial aircraft'}
{'image': '/home/jupyter/novice/images/image_0.jpg', 'bbox': [800, 320, 128, 36], 'label': 'blue and white commercial aircraft'}
{'image': '/home/jupyter/novice/images/image_0.jpg', 'bbox': [688, 400, 56, 36], 'label': 'green light aircraft'}


In [43]:
from torchvision.io import read_image
from transformers import CLIPProcessor
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = processor.tokenizer

# Preprocess dataset function
def preprocess_dataset(dataset, split):
    data = dataset[split]

    def tokenize_captions(examples):
        captions = list(examples["label"])
        text_inputs = tokenizer(captions, padding="max_length", truncation=True)
        examples["input_ids"] = text_inputs.input_ids
        examples["attention_mask"] = text_inputs.attention_mask
        return examples

    def transform_images(examples):
        images = [read_image(image_file, mode=ImageReadMode.RGB) for image_file in examples["image"]]
        examples["pixel_values"] = [image_transformations(image) for image in images]
        return examples

    data = data.map(
        function=tokenize_captions,
        batched=True,
        remove_columns=["image", "label"],
        desc=f"Running tokenizer on {split} dataset",
    )

    data.set_transform(transform_images)
    return data

dataset = preprocess_dataset(dataset, "train")
dataset = preprocess_dataset(dataset, "val")
dataset = preprocess_dataset(dataset, "test")

# Collate function
def collate_fn(examples):
    pixel_values = torch.stack([torch.tensor(example["pixel_values"]) for example in examples])
    bboxes = torch.tensor([example["bbox"] for example in examples], dtype=torch.float)
    labels = torch.tensor([example["label"] for example in examples], dtype=torch.long)
    return {"pixel_values": pixel_values, "labels": {"bbox": bboxes, "label": labels}}

KeyError: "Column train not in the dataset. Current columns in the dataset: ['bbox', 'input_ids', 'attention_mask']"

In [None]:
from transformers import TrainingArguments, Trainer

# Define training arguments
training_args = TrainingArguments(
    learning_rate=5e-5,
    warmup_steps=0,
    weight_decay=0.1,
    per_device_train_batch_size=16,
    logging_steps=5,
    save_steps=5,
    remove_unused_columns=False,
    output_dir="clip-finetune",
    report_to='none',  # disable wandb
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["val"],
    data_collator=collate_fn,
)

# Train the model
train_result = trainer.train()

# Evaluate the model
metrics = trainer.evaluate()
print(metrics)

In [31]:
# Training arguments
training_args = TrainingArguments(
    learning_rate=5e-5,
    warmup_steps=0,
    weight_decay=0.1,
    per_device_train_batch_size=16,
    logging_steps=5,
    save_steps=5,
    output_dir="owlv2-finetune",
    report_to='none',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
)

In [32]:
# Train the model
train_result = trainer.train()

# Evaluate the model
metrics = trainer.evaluate()
print(metrics)

# Save the model
trainer.save_model("owlv2-finetune")
processor.save_pretrained("owlv2-finetune")

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



KeyError: 'bbox'