In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import os
from transformers import CLIPModel, CLIPProcessor
import torch.optim as optim
import torch.nn as nn
from torch.amp import autocast, GradScaler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Define a custom dataset for property images
class PropertyImageDataset(Dataset):
    """Dataset of RGB images read from a single flat folder.

    Args:
        image_folder: Directory whose direct entries are scanned for image
            files (sub-directories are not searched).
        transform: Optional callable applied to each loaded PIL image; whatever
            it returns becomes the first element of the sample.

    Each sample is a tuple ``(image, img_path)``.
    """

    # Compared case-insensitively so that e.g. "IMG_0001.JPG" is not skipped.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    def __init__(self, image_folder, transform=None):
        self.image_folder = image_folder
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # index -> path mapping stable between runs and machines.
        self.image_paths = [
            os.path.join(image_folder, name)
            for name in sorted(os.listdir(image_folder))
            if name.lower().endswith(self.IMAGE_EXTENSIONS)
        ]
        self.transform = transform

    def __len__(self):
        """Return the number of image files found in the folder."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB and return ``(image, img_path)``."""
        img_path = self.image_paths[idx]
        # The context manager releases the file handle even if decoding fails;
        # convert() returns a new, fully loaded image that outlives the file.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        if self.transform is not None:
            image = self.transform(image)
        return image, img_path  # Return image and its path for reference

In [3]:
# NTXentLoss (Image-Image Contrastive Loss)
class NTXentLoss(nn.Module):
    """Symmetric cross-view contrastive loss over a batch of paired embeddings.

    Row ``i`` of ``image_features1`` and row ``i`` of ``image_features2`` form
    the positive pair; every other row of the opposite view acts as a negative.
    The cross-entropy is evaluated in both directions (view 1 -> view 2 and
    view 2 -> view 1) and averaged.

    Args:
        temperature: Strictly positive value the cosine similarities are
            divided by before the softmax.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """

    def __init__(self, temperature=0.05):
        super().__init__()
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")
        self.temperature = temperature
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, image_features1, image_features2):
        """Compute the loss for two 2-D embedding batches of equal length.

        Args:
            image_features1: Embeddings of the first view, one row per sample.
            image_features2: Embeddings of the second view, row-aligned with
                ``image_features1``.

        Returns:
            Scalar tensor holding the mean of the two directional losses.
        """
        # normalize() clamps the norm with a small eps, so an all-zero
        # embedding maps to zeros instead of producing NaN via 0 / 0.
        image_features1 = nn.functional.normalize(image_features1, dim=-1)
        image_features2 = nn.functional.normalize(image_features2, dim=-1)

        # Pairwise cosine similarities, sharpened by the temperature
        logits = image_features1 @ image_features2.T / self.temperature

        # The matching sample sits on the diagonal, so the target class of
        # row i is simply i.
        labels = torch.arange(logits.size(0), device=logits.device)
        forward_loss = self.criterion(logits, labels)
        backward_loss = self.criterion(logits.T, labels)
        return (forward_loss + backward_loss) / 2

# Multi-View Augmentations
class MultiViewTransform:
    """Produce two independently augmented views of the same PIL image.

    Args:
        image_size: Side length passed to ``RandomResizedCrop``.
        mean: Per-channel mean used by ``Normalize``.
        std: Per-channel standard deviation used by ``Normalize``.

    The defaults for ``mean``/``std`` are the statistics the OpenAI CLIP
    checkpoints were pretrained with, so the pixel values fed to the model
    match what its weights expect. Pass the ImageNet statistics
    (``[0.485, 0.456, 0.406]`` / ``[0.229, 0.224, 0.225]``) to reproduce the
    earlier behaviour of this class.
    """

    # Normalization statistics of the OpenAI CLIP image preprocessor
    CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
    CLIP_STD = (0.26862954, 0.26130258, 0.27577711)

    def __init__(self, image_size=224, mean=CLIP_MEAN, std=CLIP_STD):
        self.transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(0.5, 1.0)),
            transforms.RandomHorizontalFlip(),
            # NOTE(review): vertical flips yield upside-down scenes; confirm
            # this is intended for property photos.
            transforms.RandomVerticalFlip(),
            transforms.RandomRotation(15),
            transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3),
            transforms.RandomAffine(degrees=10),  # Random affine transformation
            transforms.ToTensor(),
            transforms.Normalize(mean=list(mean), std=list(std)),
        ])

    def __call__(self, x):
        """Return ``(view1, view2)``: the pipeline is sampled twice for ``x``."""
        first_view = self.transform(x)
        second_view = self.transform(x)
        return first_view, second_view

In [4]:
def fine_tune_clip(model, dataloader, epochs=10, lr=3e-4, max_grad_norm=None):
    """Fine-tune a CLIP model's image embeddings with a two-view contrastive loss.

    Args:
        model: Module exposing ``get_image_features(pixel_values=...)``.
        dataloader: Sized iterable yielding ``((view1, view2), paths)`` batches,
            where both views are tensors accepted by ``get_image_features``.
        epochs: Number of passes over ``dataloader``.
        lr: Initial Adam learning rate; it is multiplied by 0.1 every 3 epochs.
        max_grad_norm: If given, gradients are clipped to this global norm
            before each optimizer step. ``None`` (default) disables clipping.

    Returns:
        The same ``model`` object, updated in place, moved to the selected
        device and left in training mode.

    Raises:
        ValueError: If ``dataloader`` contains no batches.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Move the model first so the optimizer is built over the on-device tensors.
    model.to(device)
    model.train()

    num_batches = len(dataloader)
    if num_batches == 0:
        # Fail early with a clear message instead of a ZeroDivisionError when
        # the epoch average is computed.
        raise ValueError("dataloader is empty; nothing to fine-tune on")

    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
    criterion = NTXentLoss(temperature=0.07)
    # Loss scaling is only meaningful for CUDA float16 autocast; on CPU the
    # scaler is created disabled so it acts as a transparent pass-through.
    scaler = GradScaler(device=device.type, enabled=(device.type == "cuda"))

    for epoch in range(epochs):
        print(f"Starting epoch {epoch + 1}/{epochs}")
        epoch_loss = 0.0

        for step, (images, _paths) in enumerate(dataloader):
            print(f"Processing batch {step + 1}/{num_batches}")
            image1, image2 = images  # Two views of the same image
            image1 = image1.to(device)
            image2 = image2.to(device)

            # Drop the gradients of the previous step; without this every
            # backward() call adds onto the gradients of all earlier batches.
            optimizer.zero_grad()

            # Mixed precision forward pass
            with autocast(device_type=device.type):
                image_features1 = model.get_image_features(pixel_values=image1)
                image_features2 = model.get_image_features(pixel_values=image2)
                loss = criterion(image_features1, image_features2)

            # Backward pass and optimizer step with gradient scaling
            scaler.scale(loss).backward()
            if max_grad_norm is not None:
                # Clipping must see the true gradient magnitudes, so undo the
                # loss scaling first.
                scaler.unscale_(optimizer)
                nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            scaler.step(optimizer)
            scaler.update()

            # Read the scalar once: .item() forces a device synchronisation.
            batch_loss = loss.item()
            epoch_loss += batch_loss

            print(f"Batch {step + 1} processed, Loss: {batch_loss:.4f}")

        scheduler.step()
        print(f"Epoch [{epoch + 1}/{epochs}] completed. Average Loss: {epoch_loss / num_batches:.4f}")

    return model

In [5]:
# Load the pretrained CLIP weights and the processor of the same checkpoint
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Training below only uses image embeddings, so keep the text encoder fixed
for text_param in model.text_model.parameters():
    text_param.requires_grad_(False)

In [6]:
# Set up dataset and dataloader with multi-view augmentations
transform = MultiViewTransform()
# The image directory can be overridden through the PROPERTY_IMAGE_DIR
# environment variable, so the notebook runs on other machines without edits;
# the original local path stays the default.
image_folder = os.environ.get("PROPERTY_IMAGE_DIR", r"C:\images")
dataset = PropertyImageDataset(image_folder=image_folder, transform=transform)
# Diagnostic check for the dataset size
print(f"Total images in dataset: {len(dataset)}")
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0)  # Disable multiprocessing

Total images in dataset: 8158


In [7]:
# Run the contrastive fine-tuning; fine_tune_clip returns the model it was given
fine_tuned_clip_model = fine_tune_clip(model=model, dataloader=dataloader, epochs=10)

Starting epoch 1/10
Processing batch 1/255


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Batch 1 processed, Loss: 2.1096
Processing batch 2/255
Batch 2 processed, Loss: 2.2906
Processing batch 3/255
Batch 3 processed, Loss: 2.2533
Processing batch 4/255
Batch 4 processed, Loss: 2.2915
Processing batch 5/255
Batch 5 processed, Loss: 2.2968
Processing batch 6/255
Batch 6 processed, Loss: 1.8187
Processing batch 7/255
Batch 7 processed, Loss: 2.3380
Processing batch 8/255
Batch 8 processed, Loss: 2.0421
Processing batch 9/255
Batch 9 processed, Loss: 2.1139
Processing batch 10/255
Batch 10 processed, Loss: 2.1644
Processing batch 11/255
Batch 11 processed, Loss: 1.9375
Processing batch 12/255
Batch 12 processed, Loss: 2.0076
Processing batch 13/255
Batch 13 processed, Loss: 2.0700
Processing batch 14/255
Batch 14 processed, Loss: 2.0976
Processing batch 15/255
Batch 15 processed, Loss: 2.4168
Processing batch 16/255
Batch 16 processed, Loss: 2.1070
Processing batch 17/255
Batch 17 processed, Loss: 2.1519
Processing batch 18/255
Batch 18 processed, Loss: 2.1629
Processing batc



Processing batch 1/255
Batch 1 processed, Loss: 1.9737
Processing batch 2/255
Batch 2 processed, Loss: 2.0017
Processing batch 3/255
Batch 3 processed, Loss: 2.4881
Processing batch 4/255
Batch 4 processed, Loss: 2.1524
Processing batch 5/255
Batch 5 processed, Loss: 2.0699
Processing batch 6/255
Batch 6 processed, Loss: 2.1226
Processing batch 7/255
Batch 7 processed, Loss: 1.8880
Processing batch 8/255
Batch 8 processed, Loss: 2.2188
Processing batch 9/255
Batch 9 processed, Loss: 2.2384
Processing batch 10/255
Batch 10 processed, Loss: 2.3859
Processing batch 11/255
Batch 11 processed, Loss: 2.2256
Processing batch 12/255
Batch 12 processed, Loss: 2.1694
Processing batch 13/255
Batch 13 processed, Loss: 2.1139
Processing batch 14/255
Batch 14 processed, Loss: 2.0468
Processing batch 15/255
Batch 15 processed, Loss: 2.0536
Processing batch 16/255
Batch 16 processed, Loss: 2.2552
Processing batch 17/255
Batch 17 processed, Loss: 2.3385
Processing batch 18/255
Batch 18 processed, Loss:

In [9]:
# Persist the learned weights (state_dict only, not the pickled model object)
trained_weights = fine_tuned_clip_model.state_dict()
torch.save(trained_weights, "clip_finetuned1.pth")
print("Fine-tuning complete. Model saved as 'clip_finetuned1.pth'.")

Fine-tuning complete. Model saved as 'clip_finetuned1.pth'.
