# Animal Shelter - Dog Classification

In [41]:
# Imports
import pandas as pd
import os
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

## Data Preparation

In [42]:
# Location of the CSV that holds the image ids and their breed labels
labels_path = 'data/labels.csv'
# Read the label table into a DataFrame
labels = pd.read_csv(filepath_or_buffer=labels_path)

In [43]:
# Expected path of every training image, built from the ids in the label table
image_dir = 'data/train/'
filenames = [''.join((image_dir, image_id, '.jpg')) for image_id in labels['id']]

# Collect every expected path that is not a regular file on disk
missing_files = list(filter(lambda path: not os.path.isfile(path), filenames))
print(f"Missing files: {len(missing_files)}")

Missing files: 0


In [44]:
# Model inputs are the image file paths; targets are the breed names
X, y = filenames, labels['breed']

In [45]:
# One-hot encode the breed column (dense array output instead of a sparse matrix)
encoder = OneHotEncoder(sparse_output=False).fit(labels[['breed']])
y_encoded = encoder.transform(labels[['breed']])
# Categories learned for the single encoded column (the breed names)
encoding_labels = encoder.categories_[0]

In [46]:
# Split data: hold out 20% of the samples for validation.
# A fixed random_state makes the shuffle deterministic, so the split (and the
# "first 1000" subsets taken from it below) is the same after every
# Restart Kernel -> Run All instead of changing from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Subset for building: keep at most the first 1000 paths of each split so that
# iterating on the pipeline stays fast
X_train_subset = X_train[:1000]
X_val_subset = X_val[:1000]

## Preprocessing Images

In [47]:
# Side length every image is resized to, and the number of samples per batch
IMG_SIZE = 224
BATCH_SIZE = 32

# Number of classes = number of distinct breeds found by the one-hot encoder
num_classes = len(encoding_labels)

# Preprocessing pipeline applied to each image, in order:
#   1. resize to IMG_SIZE x IMG_SIZE
#   2. convert the image to a PyTorch tensor
#   3. normalize each of the three channels with mean 0.5 and std 0.5,
#      which maps values in the 0-1 range to the -1 to 1 range
transform = transforms.Compose(
    [
        transforms.Resize(size=(IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ]
)

In [48]:
# PyTorch Dataset class
class DogBreedDataset(Dataset):
    """Dataset that pairs image file paths with integer class labels.

    Each item is loaded lazily from disk when it is requested.

    Args:
        image_paths: Sequence of image file paths, indexed by position.
        labels: Sequence of integer class ids indexed by position; item ``i``
            is the label of ``image_paths[i]``.
        transform: Optional callable applied to the RGB image before it is
            returned. When omitted, the RGB PIL image is returned as-is.
    """

    def __init__(self, image_paths, labels, transform=None):
        """Store the paths, labels and optional transform; no image is opened here."""
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of samples, i.e. the number of image paths."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load the sample at position ``idx``.

        Returns:
            A tuple ``(image, label)`` where ``image`` is the RGB image after
            the optional transform and ``label`` is a ``torch.long`` tensor.
        """
        # Open the file in a context manager so its handle is closed as soon as
        # the pixels have been read. convert() returns an independent, fully
        # loaded image, so it remains usable after the source file is closed.
        with Image.open(self.image_paths[idx]) as source_image:
            image = source_image.convert('RGB')

        if self.transform:
            image = self.transform(image)

        # Return image and label
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return image, label

In [49]:
# Convert breed names to integer class ids.
# The encoder is fit on every breed in the label table rather than only on the
# 1000-sample training subset. Fitting on the subset alone is fragile: a breed
# that happens to be absent from it would make transform() raise on the
# validation subset, and would leave fewer encoder classes than num_classes.
label_encoder = LabelEncoder()
label_encoder.fit(labels['breed'])

# .iloc takes the first 1000 rows by position, matching the X_train[:1000] /
# X_val[:1000] slices used for the image paths.
y_train_subset_encoded = label_encoder.transform(y_train.iloc[:1000])
y_val_subset_encoded = label_encoder.transform(y_val.iloc[:1000])

print(f"Number of classes: {len(label_encoder.classes_)}")

Number of classes: 120


In [50]:
# Create PyTorch datasets and dataloaders
def create_data_loaders(X_train, y_train, X_val, y_val, batch_size=BATCH_SIZE):
    """Create PyTorch DataLoaders for training and validation.

    Both datasets use the module-level ``transform`` pipeline. The training
    loader shuffles its samples; the validation loader keeps them in order.
    Both loaders use ``num_workers=0``.

    Args:
        X_train: Image paths for the training set.
        y_train: Labels for the training set.
        X_val: Image paths for the validation set.
        y_val: Labels for the validation set.
        batch_size: Number of samples per batch for both loaders.

    Returns:
        Tuple ``(train_loader, val_loader, train_dataset, val_dataset)``.
    """
    # Options shared by both loaders
    shared_options = {"batch_size": batch_size, "num_workers": 0}

    # Wrap the path/label pairs in datasets
    train_dataset = DogBreedDataset(X_train, y_train, transform)
    val_dataset = DogBreedDataset(X_val, y_val, transform)

    # Only the training loader shuffles
    train_loader = DataLoader(train_dataset, shuffle=True, **shared_options)
    val_loader = DataLoader(val_dataset, shuffle=False, **shared_options)

    return train_loader, val_loader, train_dataset, val_dataset

In [None]:
# Create PyTorch DataLoaders

# Build the training/validation loaders, and their datasets, from the subsets
train_loader, val_loader, train_dataset, val_dataset = create_data_loaders(
    X_train=X_train_subset,
    y_train=y_train_subset_encoded,
    X_val=X_val_subset,
    y_val=y_val_subset_encoded,
    batch_size=BATCH_SIZE,
)

# Sanity check: report the dataset sizes and the configuration in use
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Number of classes: {num_classes}")
print(f"Batch size: {BATCH_SIZE}")

Training samples: 1000
Validation samples: 1000
Number of classes: 120
Batch size: 32
