In [7]:
import os
import pandas as pd
import torch
from PIL import Image, UnidentifiedImageError
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import MobileViTForImageClassification, MobileViTImageProcessor
import torch.optim as optim
from sklearn.model_selection import train_test_split


IM_SIZE = 160  # side length, in pixels, of the square images fed to the model

# Steps shared by both pipelines: a fixed-size resize up front, then tensor
# conversion followed by channel-wise mean/std normalisation at the end.
# NOTE(review): the pretrained MobileViT checkpoint ships its own image
# processor (imported above but unused); confirm that this mean/std
# normalisation matches the preprocessing the checkpoint expects.
_resize_to_input = transforms.Resize((IM_SIZE, IM_SIZE))
_to_normalized_tensor = [
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
]

# Evaluation pipeline: deterministic, no augmentation.
test_transform = transforms.Compose([_resize_to_input, *_to_normalized_tensor])

# Training pipeline: the same resize/normalisation with random flip, rotation,
# crop and colour jitter applied in between (order matters and is preserved).
train_transform = transforms.Compose([
    _resize_to_input,
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=15),
    transforms.RandomResizedCrop(IM_SIZE, scale=(0.8, 1.0)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    *_to_normalized_tensor,
])


# Locations of the label CSV and of the image root directory.
# NOTE(review): absolute, machine-specific paths; adjust when running elsewhere.
csv_file = '/Users/johnizzo/Desktop/train.csv'
root_dir = '/Users/johnizzo/Desktop/train_images'

df = pd.read_csv(csv_file)

# Keep only hotels that have at least 20 rows, so every class has enough
# samples for the stratified split below.
hotel_counts = df["hotel_id"].value_counts()
filtered_hotels = hotel_counts[hotel_counts >= 20].index.tolist()
filtered_df = df[df["hotel_id"].isin(filtered_hotels)]

# 90/10 split, stratified by hotel, with a fixed seed for reproducibility.
train_df, test_df = train_test_split(filtered_df, test_size=0.1, stratify=filtered_df['hotel_id'], random_state=42)

# Contiguous class indices (0..num_classes-1) derived from the hotels present
# in the training split, plus the reverse lookup.
unique_hotel_ids = sorted(train_df['hotel_id'].unique())
hotel_id_to_index = {hotel_id: idx for idx, hotel_id in enumerate(unique_hotel_ids)}
index_to_hotel_id = {idx: hotel_id for hotel_id, idx in hotel_id_to_index.items()}

# .assign builds new frames instead of writing a column into frames that were
# derived from `filtered_df`, which avoids pandas' SettingWithCopy hazard.
# `class_index` is appended as the last column in both frames.
train_df = train_df.assign(class_index=train_df['hotel_id'].map(hotel_id_to_index))
test_df = test_df.assign(class_index=test_df['hotel_id'].map(hotel_id_to_index))

# A test hotel missing from the training split would map to NaN and only fail
# much later, when a label is converted to int. Fail here with a clear message.
if test_df['class_index'].isna().any():
    raise ValueError("test split contains hotel_id values that are absent from the training split")


In [8]:
from PIL import Image, UnidentifiedImageError, ImageFile
# Explicitly disable Pillow's tolerance for truncated image files.
# NOTE(review): with this flag False, Pillow is expected to raise on truncated
# files instead of loading them partially (presumably an OSError, which
# HotelRoomDataset.__getitem__ catches and skips). Confirm against the
# installed Pillow version.
ImageFile.LOAD_TRUNCATED_IMAGES = False  

class HotelRoomDataset(Dataset):
    """Dataset yielding ``(image, label)`` pairs described by a DataFrame.

    The frame is read positionally: column 0 holds the image file name,
    column 1 the sub-directory (under ``root_dir``) that contains the file,
    and the last column the integer class index used as the label.
    """

    def __init__(self, data, root_dir, transform=None):
        """Store the frame, the image root and an optional transform.

        Args:
            data: DataFrame laid out as described in the class docstring.
            root_dir: Directory under which the image sub-directories live.
            transform: Optional callable applied to the loaded RGB image.
        """
        # Reset the index so positional and label-based row numbers agree
        # after upstream filtering/splitting.
        self.data = data.reset_index(drop=True)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            ``(image, label)`` where ``label`` is a ``torch.long`` scalar
            tensor, or ``None`` when the image cannot be opened or decoded.
            Callers are expected to filter the ``None`` entries out.
        """
        # Image file name and class index come from the first/last columns.
        img_name = self.data.iloc[idx, 0]
        class_index = int(self.data.iloc[idx, -1])

        # <root_dir>/<column 1 value>/<file name>
        img_path = os.path.join(self.root_dir, str(self.data.iloc[idx, 1]), img_name)

        try:
            # The context manager closes the underlying file handle as soon
            # as the pixel data has been converted; without it the handle
            # stays open until garbage collection.
            with Image.open(img_path) as raw_image:
                image = raw_image.convert("RGB")
            if self.transform:
                image = self.transform(image)
        except (UnidentifiedImageError, FileNotFoundError, OSError) as e:
            print(f"Warning: Skipping image {img_path}. Error: {e}")
            return None  # signal an unreadable sample to the collate function

        label = torch.tensor(class_index, dtype=torch.long)
        return image, label

# Build one dataset per split: the training split gets the augmenting
# pipeline, the test split the deterministic one.
train_dataset = HotelRoomDataset(train_df, root_dir, transform=train_transform)
test_dataset = HotelRoomDataset(test_df, root_dir, transform=test_transform)

In [9]:
# Custom collate function to handle None values
def collate_fn(batch):
    """Collate ``(image, label)`` samples, dropping entries that are ``None``.

    Returns a ``(images, labels)`` pair of stacked tensors, or ``None`` when
    no usable sample remains so the caller can skip the batch.
    """
    usable = list(filter(lambda sample: sample is not None, batch))
    if not usable:
        return None  # nothing left to stack

    image_tensors, label_tensors = zip(*usable)
    return torch.stack(image_tensors), torch.stack(label_tensors)

# Create dataloaders. Both share the batch size, worker count and the
# None-tolerant collate function; only the training loader shuffles.
_loader_options = dict(batch_size=16, num_workers=0, collate_fn=collate_fn)
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_options)
test_dataloader = DataLoader(test_dataset, shuffle=False, **_loader_options)

In [10]:
# Pick the compute device first (GPU when available, otherwise CPU) so the
# model can be placed on it as soon as it is built.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One output class per hotel present in the training split. The pretrained
# checkpoint's classifier head has a different size, so mismatched shapes are
# tolerated and the head is freshly initialised.
num_classes = len(unique_hotel_ids)
model = MobileViTForImageClassification.from_pretrained(
    "apple/mobilevit-xx-small",
    num_labels=num_classes,
    ignore_mismatched_sizes=True,
).to(device)

# Loss and optimiser used by the training cells.
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-4)

Some weights of MobileViTForImageClassification were not initialized from the model checkpoint at apple/mobilevit-xx-small and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 320]) in the checkpoint and torch.Size([1062, 320]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([1062]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# Training loop: fine-tune the model for `num_epochs` passes over the training
# loader, then save the resulting weights.
num_epochs = 2  # Train for only 2 epochs
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    batches_seen = 0  # batches that actually produced a loss this epoch

    for batch in train_dataloader:
        if batch is None:  # every sample in the batch failed to load
            continue

        images, labels = batch
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(pixel_values=images)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        batches_seen += 1

    # Average over the batches that were trained on. len(train_dataloader)
    # would also count skipped batches and under-report the loss.
    epoch_loss = running_loss / batches_seen if batches_seen else float("nan")
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}")

# Save the model; the file name and message follow `num_epochs`, so they stay
# correct if the epoch count is changed.
torch.save(model.state_dict(), f'model_after_{num_epochs}_epochs.pth')
print(f"Model training complete and saved after {num_epochs} epochs!")

Epoch 1/2, Loss: 6.874273698418229
Epoch 2/2, Loss: 6.48531721300549
Model training complete and saved after 2 epochs!


In [6]:
import torch
from transformers import MobileViTForImageClassification

# Resume fine-tuning from a saved checkpoint.
# Relies on `num_classes`, `criterion` and `train_dataloader` defined by
# earlier cells.

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the MobileViT architecture with the same number of output classes;
# the freshly initialised head is overwritten by the checkpoint below.
model = MobileViTForImageClassification.from_pretrained(
    "apple/mobilevit-xx-small",
    num_labels=num_classes,  # Ensure this matches your original setup
    ignore_mismatched_sizes=True
)

# Move the model to the appropriate device (GPU or CPU)
model = model.to(device)

# Load the weights saved after `epochs_completed` epochs. map_location makes
# the load work even when the checkpoint was written on a different device
# (e.g. saved on GPU, resumed on CPU).
epochs_completed = 17
model.load_state_dict(
    torch.load(f'model_after_{epochs_completed}(2)_epochs.pth', map_location=device)
)
print(f"Model loaded successfully from {epochs_completed} epochs.")

# Reinitialize the optimizer. Only model weights are loaded above, so the
# optimizer's internal state starts fresh.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Continue training for 3 additional epochs
num_epochs_additional = 3  # Number of additional epochs
total_epochs = epochs_completed + num_epochs_additional
for epoch in range(epochs_completed + 1, total_epochs + 1):
    model.train()
    running_loss = 0.0
    batches_seen = 0  # batches that actually produced a loss this epoch

    for batch in train_dataloader:
        if batch is None:  # every sample in the batch failed to load
            continue

        images, labels = batch
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(pixel_values=images)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        batches_seen += 1

    # Average over the batches that were trained on, not over every batch the
    # loader could have produced (skipped batches contribute no loss).
    epoch_loss = running_loss / batches_seen if batches_seen else float("nan")
    print(f"Epoch {epoch}/{total_epochs}, Loss: {epoch_loss}")

# Save the model after completing the additional epochs
torch.save(model.state_dict(), f'model_after_{total_epochs}(2)_epochs.pth')
print(f"Model saved after {total_epochs} epochs.")


Some weights of MobileViTForImageClassification were not initialized from the model checkpoint at apple/mobilevit-xx-small and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 320]) in the checkpoint and torch.Size([1062, 320]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([1062]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully from 17 epochs.
Epoch 18/20, Loss: 3.001364555844554
Epoch 19/20, Loss: 2.854254344823184
Epoch 20/20, Loss: 2.7361164776815308
Model saved after 20 epochs.


In [7]:
import torch
from transformers import MobileViTForImageClassification
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the MobileViT architecture (relies on `num_classes` from an earlier
# cell); the freshly initialised head is overwritten by the checkpoint below.
model = MobileViTForImageClassification.from_pretrained(
    "apple/mobilevit-xx-small",
    num_labels=num_classes,  # Ensure this matches your original setup
    ignore_mismatched_sizes=True
)

# Move the model to the appropriate device (GPU or CPU)
model = model.to(device)

# Load the model state saved after 20 epochs. map_location makes the load work
# even when the checkpoint was written on a different device than this one.
model.load_state_dict(torch.load('model_after_20(2)_epochs.pth', map_location=device))

# Evaluate function to calculate test accuracy
def evaluate_test_accuracy(model, dataloader, device):
    """Return the top-1 accuracy of ``model`` over ``dataloader``.

    Batches that are ``None`` are skipped. Predictions are the argmax of the
    model's logits; the score is computed by ``accuracy_score`` over all
    collected labels and predictions.
    """
    model.eval()  # switch to evaluation mode
    true_classes, predicted_classes = [], []

    progress = tqdm(dataloader, desc="Evaluating Test Accuracy")
    with torch.no_grad():  # no gradients needed while scoring
        for batch in progress:
            if batch is None:
                continue  # nothing usable in this batch

            images, labels = batch
            images = images.to(device)
            labels = labels.to(device)

            logits = model(pixel_values=images).logits
            best_class = logits.argmax(dim=1)

            true_classes += labels.cpu().tolist()
            predicted_classes += best_class.cpu().tolist()

    return accuracy_score(true_classes, predicted_classes)

# Calculate and print test accuracy
# (argmax/top-1 accuracy of the reloaded checkpoint over `test_dataloader`,
# reported as a percentage with two decimals).
test_accuracy = evaluate_test_accuracy(model, test_dataloader, device)
print(f"Test Set Accuracy After 20 Epochs: {test_accuracy * 100:.2f}%")


Some weights of MobileViTForImageClassification were not initialized from the model checkpoint at apple/mobilevit-xx-small and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 320]) in the checkpoint and torch.Size([1062, 320]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([1062]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully from 17 epochs.


Evaluating Test Accuracy:  58%|████████▋      | 139/240 [04:52<03:57,  2.35s/it]



Evaluating Test Accuracy:  99%|██████████████▉| 238/240 [08:15<00:03,  1.72s/it]



Evaluating Test Accuracy: 100%|███████████████| 240/240 [08:19<00:00,  2.08s/it]

Test Set Accuracy After 20 Epochs: 42.38%





In [11]:
import torch
import numpy as np
from tqdm import tqdm
from transformers import MobileViTForImageClassification
from sklearn.metrics import accuracy_score

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the MobileViT architecture (relies on `num_classes` from an earlier
# cell); the freshly initialised head is overwritten by the checkpoint below.
model = MobileViTForImageClassification.from_pretrained(
    "apple/mobilevit-xx-small",
    num_labels=num_classes,  # Ensure this matches your original setup
    ignore_mismatched_sizes=True
)

# Move the model to the appropriate device (GPU or CPU)
model = model.to(device)

# Load the model state saved after 20 epochs. map_location makes the load work
# even when the checkpoint was written on a different device than this one.
model.load_state_dict(torch.load('model_after_20(2)_epochs.pth', map_location=device))
print("Model loaded successfully from 20 epochs.")

def evaluate_top5_accuracy(model, dataloader, device, k=5):
    """
    Evaluates the model on the dataloader and computes Top-K Accuracy.

    A sample counts as correct when its true label is among the ``k``
    highest-scoring classes.

    Args:
        model: Callable as ``model(pixel_values=images)``, returning an object
            with a ``logits`` attribute of shape (batch, num_classes).
        dataloader: Iterable of ``(images, labels)`` batches; ``None`` batches
            are skipped.
        device: Device the batch tensors are moved to before the forward pass.
        k: Number of top predictions to consider. Values larger than the
            number of classes are clamped to it.

    Returns:
        Fraction of evaluated samples whose label is in the top-k predictions,
        or ``0.0`` when no sample was evaluated.
    """
    model.eval()  # Set the model to evaluation mode
    total_samples = 0
    correct_predictions = 0

    with torch.no_grad():  # Disable gradient computation for evaluation
        for batch in tqdm(dataloader, desc=f"Evaluating Top-{k} Accuracy"):
            if batch is None:  # Skip empty batches
                continue

            images, labels = batch
            images, labels = images.to(device), labels.to(device)

            logits = model(pixel_values=images).logits

            # Softmax preserves ordering, so ranking the raw logits yields the
            # same top-k classes without the extra computation. Clamp k so
            # topk does not fail when fewer than k classes exist.
            top_k = min(k, logits.size(1))
            topk_predictions = torch.topk(logits, top_k, dim=1).indices

            # Vectorised membership test: compare every top-k index with the
            # sample's label and reduce across the k dimension.
            hits = (topk_predictions == labels.unsqueeze(1)).any(dim=1)
            correct_predictions += int(hits.sum().item())
            total_samples += labels.size(0)

    # Avoid ZeroDivisionError when the loader was empty or every batch skipped.
    if total_samples == 0:
        return 0.0
    return correct_predictions / total_samples

# Calculate and print Top-5 Accuracy
# (share of test samples whose true class is among the model's five
# highest-scoring classes, reported as a percentage with two decimals).
top5_accuracy = evaluate_top5_accuracy(model, test_dataloader, device, k=5)
print(f"Top-5 Accuracy on Test Set After 20 Epochs: {top5_accuracy * 100:.2f}%")


Some weights of MobileViTForImageClassification were not initialized from the model checkpoint at apple/mobilevit-xx-small and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 320]) in the checkpoint and torch.Size([1062, 320]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([1062]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully from 20 epochs.


Evaluating Top-5 Accuracy:  58%|████████      | 139/240 [05:22<03:37,  2.16s/it]



Evaluating Top-5 Accuracy:  99%|█████████████▉| 238/240 [08:58<00:04,  2.17s/it]



Evaluating Top-5 Accuracy: 100%|██████████████| 240/240 [09:04<00:00,  2.27s/it]

Top-5 Accuracy on Test Set After 20 Epochs: 62.11%



