In [1]:
import pandas as pd
from tqdm import tqdm
from natsort import natsorted
import os, json, open_clip, torch
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

import os
import json
import torch
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from natsort import natsorted
import pandas as pd
from tqdm import tqdm
import open_clip
from datetime import datetime

In [2]:
# 0. Settings
device = 'cuda:0'
model_name = 'ViT-L-14'
pretrained = 'openai'
root = './'
dataset_name = 'Scene'

# 1. Load CLIP model
# create_model_and_transforms returns (model, train_transform, val_transform);
# only the validation transform is kept because this notebook does inference only.
model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained=pretrained)
# Per the open_clip usage docs the model is created in train mode; switch to
# eval so any dropout / stochastic-depth / BatchNorm layers behave deterministically.
model.eval()
tokenizer = open_clip.get_tokenizer(model_name)

# 2. Load test dataset with the CLIP inference preprocessing (no augmentation).
ds = ImageFolder(os.path.join(root, dataset_name), transform=preprocess)
# Natural sort so that e.g. "2.jpg" comes before "10.jpg"; the submission row
# index (id_idx) is simply the position in this sorted sample list.
ds.samples = natsorted(ds.samples)
dl = DataLoader(ds, shuffle=False, batch_size=32, num_workers=2)

# 3. Load class name list (its order defines the integer label of each class).
with open(os.path.join(root, 'classes.json'), 'r', encoding='utf-8') as f:
    class_names = json.load(f)

# Enhanced text prompts: several short natural-language descriptions per class.
# Keys are class names (looked up via the entries of `class_names`); values are
# the prompt strings fed to the CLIP text encoder for that class.
# NOTE: the lists are NOT all the same length -- "Buildings" has 20 prompts,
# every other class has 19 -- so code that flattens these prompts must not
# assume a fixed number of prompts per class when mapping a prompt index back
# to its class.
class_descriptions = {
    "Buildings": [
        "A photo of buildings.",
        "Buildings in the city.",
        "Modern architecture.",
        "Tall skyscrapers in the downtown area.",
        "Old historic buildings with intricate designs.",
        "Residential apartments.",
        "Office buildings with glass facades.",
        "Industrial buildings with large chimneys.",
        "Shopping malls bustling with people.",
        "Modern skyscrapers reflecting the sunlight.",
        "Government buildings with majestic architectures.",
        "Abandoned buildings with broken windows.",
        "Hotels with luxurious designs.",
        "Universities with large campuses.",
        "Hospitals with emergency entrances.",
        "Libraries filled with books.",
        "Museums showcasing art pieces.",
        "Theaters with grand entrances.",
        "High-rise buildings touching the clouds.",
        "Business complexes with conference rooms."
    ],
    "Forests": [
        "A photo of a forest.",
        "Dense forest area.",
        "Trees and plants.",
        "Lush green forests in the rainy season.",
        "Autumn forests with colorful leaves.",
        "Pine forests in the mountains.",
        "Tropical forests with dense vegetation.",
        "Rainforests with a variety of wildlife.",
        "Coniferous forests with tall trees.",
        "Deciduous forests shedding leaves.",
        "Foggy forests in the early morning.",
        "Forests with a carpet of fallen leaves.",
        "Sunlight filtering through forest trees.",
        "Forests with a variety of flora and fauna.",
        "Snow-covered forests in winter.",
        "Forests along a river.",
        "Dense forests with narrow pathways.",
        "Forests with ancient trees.",
        "Forests with birds chirping."
    ],
    "Glacier": [
        "A photo of a glacier.",
        "Large ice formation.",
        "Snowy glacier.",
        "Melting glacier under the sun.",
        "Blue ice glacier in the polar region.",
        "Glacier with ice caves.",
        "Frozen glacier with cracks.",
        "Glacier with snow-capped peaks.",
        "Glacier in a mountainous region.",
        "Glacier with icy surface.",
        "Glacier flowing into the sea.",
        "Glacier with frozen rivers.",
        "Glacier with visible crevasses.",
        "Glacier surrounded by rocky terrain.",
        "Glacier under the northern lights.",
        "Glacier in a remote location.",
        "Glacier with icebergs.",
        "Glacier with a smooth surface.",
        "Glacier with icy cliffs."
    ],
    "Mountains": [
        "A photo of mountains.",
        "Rocky mountains.",
        "High mountain peaks.",
        "Snow-capped mountains.",
        "Mountains with green valleys.",
        "Mountains during sunrise.",
        "Mountains with steep cliffs.",
        "Mountain range extending into the horizon.",
        "Mountains with winding roads.",
        "Mountains with a lake at the base.",
        "Mountains in a national park.",
        "Mountains with waterfalls.",
        "Mountains with dense forests.",
        "Mountains in the autumn season.",
        "Mountains with clear blue skies.",
        "Mountains with hiking trails.",
        "Mountains during sunset.",
        "Mountains with wildflowers.",
        "Mountains with jagged peaks."
    ],
    "Sea": [
        "A photo of the sea.",
        "Ocean waves.",
        "Coastal sea view.",
        "Calm sea during sunrise.",
        "Stormy sea with high waves.",
        "Sea with a sandy beach.",
        "Sea with rocky cliffs.",
        "Sea with clear blue water.",
        "Sea with a coral reef.",
        "Sea with sailing boats.",
        "Sea with a sunset view.",
        "Sea with an island in the distance.",
        "Sea with a lighthouse.",
        "Sea with seagulls flying.",
        "Sea with a fishing boat.",
        "Sea with surfers riding waves.",
        "Sea with a pier.",
        "Sea with coastal vegetation.",
        "Sea with a horizon line."
    ],
    "Street": [
        "A photo of a street.",
        "Busy city street.",
        "Quiet residential street.",
        "Street with streetlights.",
        "Street with parked cars.",
        "Street with pedestrians walking.",
        "Street in a shopping district.",
        "Street with outdoor cafes.",
        "Street with high-rise buildings.",
        "Street with trees lining the sides.",
        "Street with bike lanes.",
        "Street during rush hour.",
        "Street with a tram line.",
        "Street with historical buildings.",
        "Street in a small town.",
        "Street during the night.",
        "Street with festive decorations.",
        "Street with cobblestones.",
        "Street with colorful houses."
    ]
}

# Generate detailed prompts, recording for every prompt the index (into
# `class_names`) of the class it describes. The per-class prompt lists have
# different lengths, so a flat prompt index cannot be turned into a class index
# with integer division by a fixed count -- an explicit lookup table is required.
prompts = []
prompt_class_idx = []
for class_idx, class_name in enumerate(class_names):
    class_prompts = class_descriptions[class_name]
    prompts.extend(class_prompts)
    prompt_class_idx.extend([class_idx] * len(class_prompts))
# Tensor form of the lookup table: prompt_to_class[p] is the class of prompt p.
prompt_to_class = torch.tensor(prompt_class_idx, dtype=torch.long)

# Perform zero-shot classification
zero_shot_top1 = 0
submission = dict({'id_idx': list(range(len(ds))), 'label': []})

# Encode all prompts once, before the model is moved to the GPU, and keep the
# L2-normalised text embeddings on the CPU in float32.
with torch.no_grad(), torch.cuda.amp.autocast():
    text = tokenizer(prompts)
    text_features = model.encode_text(text).to('cpu').float()
    text_features /= text_features.norm(dim=-1, keepdim=True)

model = model.to(device)

for x, y in tqdm(dl):
    x = x.to(device)
    with torch.no_grad(), torch.cuda.amp.autocast():
        # Cast back to float32 on the CPU so the similarity is computed in full precision.
        image_features = model.encode_image(x).to('cpu').float()
        image_features /= image_features.norm(dim=-1, keepdim=True)
        # Cosine similarity of every image against every prompt, scaled by 100
        # before the softmax over prompts.
        zero_shot_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        # Index of the best-matching prompt for each image in the batch.
        zero_shot_pred = zero_shot_probs.argmax(dim=-1)
        # Map the winning prompt to the class index (0..len(class_names)-1) it belongs to.
        zero_shot_pred_mapped = prompt_to_class[zero_shot_pred].tolist()
        submission['label'] += zero_shot_pred_mapped

# 5. Save predictions as a timestamped submission .csv file.
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
file_name = f'submission_{current_time}.csv'
submission_dir = os.path.join(root, 'submissions')
# Create the output directory if needed so a fresh checkout does not fail
# at the very last step, after the whole inference run has finished.
os.makedirs(submission_dir, exist_ok=True)
pd.DataFrame(submission).to_csv(os.path.join(submission_dir, file_name), index=False)

100%|█████████████████████████████████████████| 254/254 [00:53<00:00,  4.78it/s]
