# Intro
Inference notebook for [Hotel-ID starter - classification - training](https://www.kaggle.com/code/michaln/hotel-id-starter-classification-traning)



# Setup

In [None]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import timm

# Imports

In [None]:
import numpy as np
import pandas as pd
import random
import os
import math

In [None]:
from PIL import Image as pil_image
from tqdm import tqdm

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
# NOTE(review): `dinov2` is never imported in the visible cells and the install is unpinned;
# it presumably needs network access when the cell runs — confirm it is still required.
!pip install dinov2  # if available

In [None]:
# Setup & Imports
import sys
import os
import random
import math
import numpy as np
import pandas as pd
import cv2

import timm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from PIL import Image as pil_image
from tqdm import tqdm

import albumentations as A
import albumentations.pytorch as APT
import torchvision.transforms as T

# For reproducibility
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    every CUDA device), exports ``PYTHONHASHSEED`` and configures cuDNN for
    deterministic execution.

    Parameters:
        - seed: Integer seed shared by all generators
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not only the current one (a no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuned kernel selection can differ between runs, so switch it off too.
    torch.backends.cudnn.benchmark = False

# Seed shared by all RNGs and side length (pixels) of the square model input.
SEED = 42
IMG_SIZE = 256
seed_everything(SEED)



# Global

In [None]:
# Seed shared by all RNGs, and side length (pixels) images are resized to.
SEED, IMG_SIZE = 42, 256

# Competition dataset root and the sub-folder with the images to predict on.
PROJECT_FOLDER = "../input/hotel-id-to-combat-human-trafficking-2022-fgvc9/"
TEST_DATA_FOLDER = PROJECT_FOLDER + "test_images/"

In [None]:
# Show what the competition folder contains (quick check that the input path resolves).
print(os.listdir(PROJECT_FOLDER))

In [None]:
# Base transformation for inference (using Albumentations): no augmentation,
# only ToFloat followed by ToTensorV2.
# NOTE(review): presumably ToFloat rescales uint8 pixels to [0, 1] and ToTensorV2 yields a
# channels-first torch tensor — confirm against the installed albumentations version.
# This definition is repeated later in the notebook; the later assignment overrides this one.
base_transform = A.Compose([
    A.ToFloat(),
    APT.transforms.ToTensorV2(),
])


In [None]:
def pad_image(img):
    """Pad the shorter side of a 3-D image array with zeros towards a square.

    ``np.shape`` yields (rows, cols, channels). Half of the row/column
    difference (rounded down) is added on both sides of the shorter axis, so
    an odd difference leaves the result one pixel short of square.

    Parameters:
        - img: Image array with exactly three dimensions

    Returns:
        - The padded copy produced by ``cv2.copyMakeBorder``
    """
    n_rows, n_cols, _ = np.shape(img)
    half_gap = int(abs(n_rows - n_cols) / 2)
    if n_rows > n_cols:
        # taller than wide: grow the width
        top = bottom = 0
        left = right = half_gap
    else:
        # wider than tall (or already square): grow the height
        top = bottom = half_gap
        left = right = 0
    return cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=0)

def open_and_preprocess_image(image_path):
    """Load an image from disk and bring it to the model's input size.

    Steps: read with OpenCV, convert BGR to RGB, pad with ``pad_image`` and
    resize to ``IMG_SIZE`` x ``IMG_SIZE``.

    Parameters:
        - image_path: Path of the image file to read

    Returns:
        - The resized RGB image as returned by ``cv2.resize``
    """
    bgr_pixels = cv2.imread(image_path)
    rgb_pixels = cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB)
    padded = pad_image(rgb_pixels)
    target_size = (IMG_SIZE, IMG_SIZE)
    return cv2.resize(padded, target_size)


In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    every CUDA device), exports ``PYTHONHASHSEED`` and configures cuDNN for
    deterministic execution.

    Parameters:
        - seed: Integer seed shared by all generators
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not only the current one (a no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuned kernel selection can differ between runs, so switch it off too.
    torch.backends.cudnn.benchmark = False

# Dataset and transformations

In [None]:
import albumentations as A
import albumentations.pytorch as APT
import cv2 

IMG_SIZE = 256

# used for training dataset - augmentations and occlusions
# Geometric augmentations run first, then two dropout stages, then brightness/contrast,
# and finally conversion to a float tensor.
# NOTE(review): not referenced by any inference cell visible here (those use base_transform),
# and a later cell reassigns train_transform — confirm this definition is still needed.
train_transform = A.Compose([
    A.HorizontalFlip(p=0.75),
    A.VerticalFlip(p=0.25),
    A.ShiftScaleRotate(p=0.5, border_mode=cv2.BORDER_CONSTANT),
    A.OpticalDistortion(p=0.25),
    A.Perspective(p=0.25),
    # up to 6 holes, each IMG_SIZE//16 .. IMG_SIZE//4 pixels per side
    A.CoarseDropout(p=0.5, min_holes=1, max_holes=6, 
                    min_height=IMG_SIZE//16, max_height=IMG_SIZE//4,
                    min_width=IMG_SIZE//16,  max_width=IMG_SIZE//4), # normal coarse dropout
    
    # a single large hole (IMG_SIZE//4 .. IMG_SIZE//2 per side) filled with (255, 0, 0)
    A.CoarseDropout(p=0.75, max_holes=1, 
                    min_height=IMG_SIZE//4, max_height=IMG_SIZE//2,
                    min_width=IMG_SIZE//4,  max_width=IMG_SIZE//2, 
                    fill_value=(255,0,0)),# simulating occlusions in test data

    A.RandomBrightnessContrast(p=0.75),
    A.ToFloat(),
    APT.transforms.ToTensorV2(),
])

# used for validation dataset - only occlusions
# One large hole (IMG_SIZE//4 .. IMG_SIZE//2 per side) filled with (255, 0, 0) is applied
# with probability 0.75, then the image is converted to a float tensor.
# NOTE(review): not referenced by any inference cell visible here, and reassigned in a later cell.
val_transform = A.Compose([
    A.CoarseDropout(p=0.75, max_holes=1, 
                    min_height=IMG_SIZE//4, max_height=IMG_SIZE//2,
                    min_width=IMG_SIZE//4,  max_width=IMG_SIZE//2, 
                    fill_value=(255,0,0)),# simulating occlusions
    A.ToFloat(),
    APT.transforms.ToTensorV2(),
])

In [None]:
def pad_image(img):
    """Pad the shorter side of an image with zeros towards a square.

    The first two entries of the array shape are (height, width); any further
    axes (e.g. channels) are ignored, so both H x W x C and plain H x W
    arrays are accepted. Half of the height/width difference (rounded down)
    is added on both sides of the shorter axis, so an odd difference still
    leaves the result one pixel short of square; this matches the previous
    behaviour and is evened out by the later resize.

    Parameters:
        - img: Image array with at least two dimensions

    Returns:
        - The padded copy produced by ``cv2.copyMakeBorder``
    """
    height, width = np.shape(img)[:2]
    pad = abs(height - width) // 2
    if height > width:
        # taller than wide: add columns left and right
        return cv2.copyMakeBorder(img, 0, 0, pad, pad, cv2.BORDER_CONSTANT, value=0)
    # wider than tall (or already square): add rows top and bottom
    return cv2.copyMakeBorder(img, pad, pad, 0, 0, cv2.BORDER_CONSTANT, value=0)


def open_and_preprocess_image(image_path):
    """Load an image from disk and bring it to the model's input size.

    Steps: read with OpenCV, convert BGR to RGB, pad with ``pad_image`` and
    resize to ``IMG_SIZE`` x ``IMG_SIZE``.

    Parameters:
        - image_path: Path of the image file to read

    Returns:
        - The resized RGB image as returned by ``cv2.resize``

    Raises:
        - OSError: if OpenCV could not read or decode the file
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports failure by returning None; fail here with the
        # offending path instead of an opaque error inside cvtColor.
        raise OSError(f"Could not read image: {image_path}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = pad_image(img)
    return cv2.resize(img, (IMG_SIZE, IMG_SIZE))

In [None]:
class HotelImageDataset:
    """Map-style dataset that loads and preprocesses images listed in a DataFrame.

    Parameters:
        - data: DataFrame with an "image_id" column holding file names
        - transform: Optional callable invoked as ``transform(image=...)`` that
          returns a dict with an "image" entry
        - data_folder: Folder the file names are relative to
    """

    def __init__(self, data, transform=None, data_folder="train_images/"):
        self.data, self.data_folder, self.transform = data, data_folder, transform

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{"image": ...}`` for the row at position ``idx``."""
        row = self.data.iloc[idx]
        path = os.path.join(self.data_folder, row["image_id"])
        pixels = np.array(open_and_preprocess_image(path)).astype(np.uint8)
        if not self.transform:
            return {"image": pixels}
        return {"image": self.transform(image=pixels)["image"]}


In [None]:
class HotelImageDataset:
    """Map-style dataset that loads and preprocesses images listed in a DataFrame.

    Parameters:
        - data: DataFrame with an "image_id" column holding file names
        - transform: Optional callable invoked as ``transform(image=...)`` that
          returns a dict with an "image" entry
        - data_folder: Folder the file names are relative to (with or without
          a trailing path separator)
    """

    def __init__(self, data, transform=None, data_folder="train_images/"):
        self.data = data
        self.data_folder = data_folder
        self.transform = transform

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{"image": ...}`` for the row at position ``idx``."""
        record = self.data.iloc[idx]
        # os.path.join instead of string concatenation: a folder passed without
        # a trailing slash no longer yields a mangled path.
        image_path = os.path.join(self.data_folder, record["image_id"])

        image = np.array(open_and_preprocess_image(image_path)).astype(np.uint8)

        if self.transform:
            transformed = self.transform(image=image)
            image = transformed["image"]

        return {
            "image": image,
        }

In [None]:
class HotelIdDINOModel(nn.Module):
    """timm backbone with its own head removed, followed by a linear classifier."""

    def __init__(self, n_classes, backbone_name="dinov2_vitl14", pretrained=True):
        """
        n_classes: Number of output classes (hotel IDs)
        backbone_name: Model name passed to timm.create_model
        pretrained: Whether timm should load pretrained backbone weights.
            Pass False when the weights come from a checkpoint afterwards, so
            no download is attempted.

        NOTE(review): confirm that the default backbone name is registered in
        the timm version found on sys.path.
        """
        super().__init__()
        # Create the backbone (optionally with pretrained weights)
        self.backbone = timm.create_model(backbone_name, pretrained=pretrained)
        # Remove the default classifier (if present)
        self.backbone.reset_classifier(0)
        # Attach a new classifier sized from the backbone's feature width
        self.classifier = nn.Linear(self.backbone.num_features, n_classes)

    def forward(self, x):
        """Run the backbone and classify its features; returns the classifier output."""
        features = self.backbone(x)
        return self.classifier(features)


# Model

In [None]:
def predict_tta(loader, model, n_matches=5, tta_transforms=3):
    """
    Rank classes per image using Test-Time Augmentation (TTA).

    For every batch the sigmoid scores of the untouched images (scaled by 1.5)
    are averaged with the scores of ``tta_transforms`` randomly chosen
    augmented copies; classes are then sorted by descending average.

    Parameters:
        - loader: Iterable of batches, each a dict with an "image" tensor
        - model: Trained PyTorch model
        - n_matches: Number of top predictions to return per image
        - tta_transforms: Number of augmentations per batch

    Returns:
        - Array with the top class indices of every image
    """
    run_on = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(run_on)
    model.eval()

    def _scores(batch):
        # sigmoid scores as a NumPy array on the host
        return torch.sigmoid(model(batch)).cpu().numpy()

    top_indices = []
    with torch.no_grad():
        for batch in loader:
            images = batch["image"].to(run_on)

            # Un-augmented pass first, weighted more than the augmented ones
            collected = [1.5 * _scores(images)]

            # Candidate augmentations, one of which is drawn per TTA round
            augmentations = [
                lambda x: torch.flip(x, dims=[-1]),  # Horizontal Flip
                lambda x: T.ColorJitter(brightness=0.1, contrast=0.1)(x),  # Color Jitter
                lambda x: T.RandomAffine(degrees=5, translate=(0.05, 0.05))(x),  # Affine transform
                lambda x: T.GaussianBlur(kernel_size=3, sigma=(0.1, 1.0))(x)  # Gaussian Blur
            ]

            for _ in range(tta_transforms):
                chosen = np.random.choice(augmentations)
                collected.append(_scores(chosen(images)))

            # Mean over all passes, then class indices by descending score
            ranking = np.argsort(-np.mean(collected, axis=0), axis=1)
            top_indices.extend(ranking[:, :n_matches])

    return np.array(top_indices)


In [None]:
# Create a DataFrame of test images
# One row per entry of TEST_DATA_FOLDER, sorted by file name; "hotel_id" starts empty
# and is filled in by the submission cell.
# NOTE(review): this cell is repeated verbatim in the next cell — one copy is presumably redundant.
test_df = pd.DataFrame({
    "image_id": os.listdir(TEST_DATA_FOLDER),
    "hotel_id": ""
}).sort_values(by="image_id")

# Instantiate the dataset and dataloader
# shuffle=False keeps batches in the same order as the rows of test_df.
test_dataset = HotelImageDataset(test_df, transform=base_transform, data_folder=TEST_DATA_FOLDER)
batch_size = 64
num_workers = 2
test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)


In [None]:
# Test images: one row per file in TEST_DATA_FOLDER, ordered by file name.
# "hotel_id" starts empty and receives the predictions later.
test_df = pd.DataFrame({"image_id": os.listdir(TEST_DATA_FOLDER), "hotel_id": ""})
test_df = test_df.sort_values(by="image_id")

# Loader settings, then dataset and (unshuffled) loader over the test images.
batch_size, num_workers = 64, 2
test_dataset = HotelImageDataset(test_df, transform=base_transform, data_folder=TEST_DATA_FOLDER)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=False,
)


In [None]:
class HotelIdModel(nn.Module):
    """Classifier that wraps a timm model created without pretrained weights.

    Parameters:
        - n_classes: Size of the timm model's classification output
        - backbone_name: Model name passed to ``timm.create_model``
    """

    def __init__(self, n_classes=100, backbone_name="resnet34"):
        super().__init__()
        # Weights are expected to be loaded separately, hence pretrained=False.
        self.backbone = timm.create_model(
            backbone_name,
            pretrained=False,
            num_classes=n_classes,
        )

    def forward(self, x):
        """Return the backbone's output for ``x``."""
        logits = self.backbone(x)
        return logits

# Model helper functions

In [None]:
import torch
import numpy as np
import torchvision.transforms as T

def predict_tta(loader, model, n_matches=5, tta_transforms=3):
    """
    Perform Test-Time Augmentation (TTA) and return predictions.

    For every batch the sigmoid scores of the original images (scaled by 1.5)
    are averaged with the scores of ``tta_transforms`` randomly chosen
    augmented copies. The uniform averaging keeps the extra weight of the
    original pass relative to the augmented ones; the overall scale does not
    matter because only the ranking is returned.

    Parameters:
        - loader: Iterable of batches, each a dict with an 'image' tensor
        - model: Trained PyTorch model
        - n_matches: Number of top predictions to return
        - tta_transforms: Number of augmentations per batch

    Returns:
        - preds: Integer array of shape (n_images, n_matches) with class
          indices sorted by descending averaged score; shape (0, n_matches)
          when the loader yields no batches
    """
    # Automatically detect device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)
    model.eval()

    # Candidate augmentations; built once because they do not depend on the batch.
    transform_list = [
        lambda x: torch.flip(x, dims=[-1]),  # Horizontal Flip
        lambda x: T.ColorJitter(brightness=0.1, contrast=0.1)(x),  # Mild brightness/contrast
        lambda x: T.RandomAffine(degrees=5, translate=(0.05, 0.05))(x),  # Small rotations/translations
        lambda x: T.GaussianBlur(kernel_size=3, sigma=(0.1, 1.0))(x)  # Mild blur
    ]

    batch_scores = []

    with torch.no_grad():
        for sample in loader:
            input_images = sample['image'].to(device)

            # Original image inference (weighted more)
            original_output = model(input_images)
            tta_outputs = [1.5 * torch.sigmoid(original_output).cpu().numpy()]

            for _ in range(tta_transforms):
                aug_fn = np.random.choice(transform_list)  # Randomly choose one augmentation
                outputs = model(aug_fn(input_images))
                tta_outputs.append(torch.sigmoid(outputs).cpu().numpy())

            # Average multiple augmented predictions
            batch_scores.append(np.mean(tta_outputs, axis=0))

    if not batch_scores:
        # An empty loader used to fail in argsort (no second axis to sort along).
        return np.empty((0, n_matches), dtype=np.intp)

    # Get top predictions
    scores = np.concatenate(batch_scores, axis=0)
    return np.argsort(-scores, axis=1)[:, :n_matches]



In [None]:
import albumentations as A
import albumentations.pytorch as APT
import cv2 

# used for training dataset - augmentations and occlusions
# Reassigns train_transform from the earlier cell with a crop and milder geometric settings.
# NOTE(review): RandomCrop produces 64x64 images, while the CoarseDropout hole sizes below are
# derived from IMG_SIZE (up to IMG_SIZE//2) — confirm the holes are meant to exceed the crop,
# or whether the crop size should follow IMG_SIZE.
# NOTE(review): not referenced by any inference cell visible here (those use base_transform).
train_transform = A.Compose([
    A.RandomCrop(width=64, height=64),
    A.HorizontalFlip(p=0.75),
    #A.VerticalFlip(p=0.0),
    A.ShiftScaleRotate(p=0.5, shift_limit=0.0625, scale_limit=0.1, rotate_limit=10, interpolation=cv2.INTER_NEAREST, border_mode=cv2.BORDER_CONSTANT),
    A.OpticalDistortion(p=0.25, distort_limit=0.05, shift_limit=0.01),
    A.Perspective(p=0.25, scale=(0.05, 0.1)),
    A.ColorJitter(p=0.75, brightness=0.2, contrast=0.2, saturation=0.1, hue=0.05),
    # up to 5 holes, each IMG_SIZE//16 .. IMG_SIZE//8 pixels per side
    A.CoarseDropout(p=0.5, min_holes=1, max_holes=5, 
                    min_height=IMG_SIZE//16, max_height=IMG_SIZE//8,
                    min_width=IMG_SIZE//16,  max_width=IMG_SIZE//8), # normal coarse dropout
    
    # a single large hole (IMG_SIZE//4 .. IMG_SIZE//2 per side) filled with (255, 0, 0)
    A.CoarseDropout(p=0.75, max_holes=1, 
                    min_height=IMG_SIZE//4, max_height=IMG_SIZE//2,
                    min_width=IMG_SIZE//4,  max_width=IMG_SIZE//2, 
                    fill_value=(255,0,0)),# simulating occlusions in test data
    #A.RandomBrightnessContrast(p=0.75),
    A.ToFloat(),
    APT.transforms.ToTensorV2(),
])

# used for validation dataset - only occlusions
# One large hole (IMG_SIZE//4 .. IMG_SIZE//2 per side) filled with (255, 0, 0) is applied
# with probability 0.75, then the image is converted to a float tensor.
# NOTE(review): same pipeline as the earlier val_transform cell; not referenced by the
# inference cells visible here.
val_transform = A.Compose([
    A.CoarseDropout(p=0.75, max_holes=1, 
                    min_height=IMG_SIZE//4, max_height=IMG_SIZE//2,
                    min_width=IMG_SIZE//4,  max_width=IMG_SIZE//2, 
                    fill_value=(255,0,0)),# simulating occlusions
    A.ToFloat(),
    APT.transforms.ToTensorV2(),
])

# no augmentations
# Only ToFloat followed by ToTensorV2; this is the transform handed to the test dataset.
# Same pipeline as the base_transform defined near the top of the notebook, which it overrides.
base_transform = A.Compose([
    A.ToFloat(),
    APT.transforms.ToTensorV2(),
])

# Prepare data

In [None]:
# One row per file in TEST_DATA_FOLDER, sorted by file name, with an empty "hotel_id" column.
# NOTE(review): rebuilds the same frame as the earlier test_df cells.
test_df = pd.DataFrame(data={"image_id": os.listdir(TEST_DATA_FOLDER), "hotel_id": ""}).sort_values(by="image_id")

In [None]:
# code hotel_id mapping created in training notebook by encoding hotel_ids
hotel_id_code_df = pd.read_csv('../input/resnet-training/hotel_id_code_mapping.csv')
# lookup: hotel_id_code (class index) -> hotel_id
hotel_id_code_map = hotel_id_code_df.set_index('hotel_id_code')["hotel_id"].to_dict()

# Prepare model

In [None]:
def get_model(model_type, backbone_name, checkpoint_path, args):
    """Build a HotelIdModel and restore its weights from a checkpoint.

    Parameters:
        - model_type: Not used by this function; kept so existing calls still work
        - backbone_name: Backbone name forwarded to HotelIdModel
        - checkpoint_path: File readable by ``torch.load`` that holds a dict
          with the state dict under the "model" key
        - args: Object providing ``n_classes`` and ``device``

    Returns:
        - The model with loaded weights, moved to ``args.device``
    """
    model = HotelIdModel(args.n_classes, backbone_name)

    # Deserialize onto the CPU first so a checkpoint saved from a GPU also
    # loads on a CPU-only machine; the model is moved to the target device below.
    checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint["model"])
    model = model.to(args.device)

    return model

In [None]:
# Runtime settings kept as class attributes; the visible cells read them as args.<name>.
class args:
    batch_size = 64  # images per inference batch
    num_workers = 2  # passed to DataLoader as num_workers
    # number of distinct hotel_id values in the mapping file; used as the model's class count
    n_classes = hotel_id_code_df["hotel_id"].nunique()
    device = ('cuda' if torch.cuda.is_available() else 'cpu')
    
    
# Re-seed right before the data pipeline is built.
seed_everything(seed=SEED)

# Un-augmented test dataset; shuffle=False keeps batches in the row order of test_df.
test_dataset = HotelImageDataset(test_df, base_transform, data_folder=TEST_DATA_FOLDER)
test_loader = DataLoader(test_dataset, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=False)

In [None]:
import torch

def get_model(model_type, backbone_name, checkpoint_path, args):
    """Build a HotelIdModel and restore its weights from a checkpoint.

    Parameters:
        - model_type: Not used by this function
        - backbone_name: Backbone name forwarded to HotelIdModel
        - checkpoint_path: File readable by ``torch.load`` that holds a dict
          with the state dict under the "model" key
        - args: Object providing ``n_classes`` and ``device``

    Returns:
        - The model with loaded weights, moved to ``args.device``
    """
    net = HotelIdModel(args.n_classes, backbone_name)

    # Tensors are deserialized onto the CPU regardless of where they were saved.
    cpu = torch.device('cpu')
    state_dict = torch.load(checkpoint_path, map_location=cpu)["model"]
    net.load_state_dict(state_dict)

    # Finally place the model on the requested device (CPU/GPU).
    return net.to(args.device)

In [None]:
# Build the resnet34 classifier and load the checkpoint from the resnet-training input dataset.
# The first argument ("classification") is accepted by get_model but not used inside it.
model = get_model("classification", "resnet34", 
                  "../input/resnet-training/checkpoint-classification-model-resnet34-256x256.pt", 
                  args)

# Print the module structure of the loaded model.
print(model)


# Submission

In [None]:
%%time

# Top-5 class indices per test image, in the row order of test_df (the loader is not shuffled).
preds = predict_tta(test_loader, model, n_matches=5, tta_transforms=3)
# replace classes with hotel_id using mapping created in training notebook
preds = [[hotel_id_code_map[b] for b in a] for a in preds]
# transform array of hotel_ids into a space-separated string; joining the ids explicitly
# avoids relying on the textual form of a Python list, which breaks for string ids
# (quotes) or NumPy scalar values.
test_df["hotel_id"] = [" ".join(str(hotel_id) for hotel_id in row) for row in preds]

test_df.to_csv("submission.csv", index=False)
test_df.head()