In [8]:
# This is the first cell, where image_path is defined
from pathlib import Path

# Define the path to the data folder
data_path = Path("data/")
image_path = data_path / "fgvc_aircraft"

In [9]:
import os
import sys
import zipfile
import requests

# 1️⃣ Mount Google Drive (if using for storage)
use_gdrive = False  # Set to True if dataset is stored in Google Drive
if use_gdrive:
    from google.colab import drive
    drive.mount('/content/drive')

# 2️⃣ Clone your GitHub repo if it's not already present
repo_url = "https://github.com/jmand626/PyTorchMLEngine-Custom-Dataset-Project.git"
repo_name = "PyTorchMLEngine-Custom-Dataset-Project"

if not os.path.exists(repo_name):
    print(f"Cloning {repo_url}...")
    !git clone {repo_url}
else:
    print(f"Repository {repo_name} already exists.")

# 3️⃣ Change to repo directory ONLY ONCE
os.chdir(repo_name) # This line sets the working directory

# 4️⃣ Add project files to sys.path so imports work
sys.path.append(os.getcwd())
print("Project directory added to sys.path")

# 5️⃣ Ensure necessary dependencies are installed
try:
    import torchinfo
except ImportError:
    print("Installing torchinfo...")
    !pip install -q torchinfo

# 6️⃣ Download FGVC Aircraft dataset if missing
dataset_url = "https://www.robots.ox.ac.uk/~vgg/data/fgvc-aircraft/archives/fgvc-aircraft-2013b.tar.gz"
dataset_tar = data_path / "fgvc-aircraft-2013b.tar.gz"
dataset_folder = data_path / "fgvc-aircraft-2013b"

if dataset_folder.exists():
    print("Dataset already exists.")
else:
    print("Downloading FGVC Aircraft dataset...")
    data_path.mkdir(parents=True, exist_ok=True)
    response = requests.get(dataset_url, stream=True)
    with open(dataset_tar, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    print("Extracting dataset...")
    !tar -xzf {dataset_tar} -C {data_path}
    os.remove(dataset_tar)
    print("Dataset extraction complete.")

Cloning https://github.com/jmand626/PyTorchMLEngine-Custom-Dataset-Project.git...


Cloning into 'PyTorchMLEngine-Custom-Dataset-Project'...


Project directory added to sys.path
Downloading FGVC Aircraft dataset...
Extracting dataset...
Dataset extraction complete.


In [10]:
# For this notebook to run with updated APIs, we need torch 1.12+ and torchvision 0.13+
try:
    import torch
    import torchvision
    assert int(torch.__version__.split(".")[1]) >= 12, "torch version should be 1.12+"
    assert int(torchvision.__version__.split(".")[1]) >= 13, "torchvision version should be 0.13+"
    print(f"torch version: {torch.__version__}")
    print(f"torchvision version: {torchvision.__version__}")
except:
    print(f"[INFO] torch/torchvision versions not as required, installing nightly versions.")
    !pip3 install -U torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
    import torch
    import torchvision
    print(f"torch version: {torch.__version__}")
    print(f"torchvision version: {torchvision.__version__}")

[INFO] torch/torchvision versions not as required, installing nightly versions.
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113
torch version: 2.5.1+cpu
torchvision version: 0.20.1+cpu



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
%cd PyTorchMLEngine-Custom-Dataset-Project
!ls

[WinError 2] The system cannot find the file specified: 'PyTorchMLEngine-Custom-Dataset-Project'
c:\Users\joban\Downloads


'ls' is not recognized as an internal or external command,
operable program or batch file.


In [12]:
# Setup device agnostic code
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

Now hopefully we can continously use the previous setup code whenever we want to use this dataset again.

In [None]:
# CORRECTED PATHS
from pathlib import Path
import os

# Print current directory to verify where we are
print(f"Current working directory: {os.getcwd()}")

# Since we already changed directory with os.chdir(repo_name) in cell 9,
# we need to use relative paths from there, NOT including "PyTorchMLEngine-Custom-Dataset-Project" again
train_dir = Path("data/fgvc-aircraft-2013b/data/images")
test_dir = Path("data/fgvc-aircraft-2013b/data/images")

# Another option is to use the eleven_group_subset dataset that exists
alternative_train_dir = Path("data/eleven_group_subset_90_percent/train")
alternative_test_dir = Path("data/eleven_group_subset_90_percent/test")

# Check if paths exist
print(f"Does fgvc path exist? {os.path.exists(train_dir)}")
print(f"Does eleven_group path exist? {os.path.exists(alternative_train_dir)}")

# List what's actually in the data directory
print("\nContents of data directory:")
for item in os.listdir("data"):
    print(f"  - {item}")

# Use the alternative paths that actually exist
train_dir = alternative_train_dir
test_dir = alternative_test_dir

print("\nFINAL PATHS TO USE:")
print(f"Train directory: {train_dir}")
print(f"Test directory: {test_dir}")

Now we continue on to creating our datasets and dataloaders. An important issue is that we have to ensure that the data that we feed into our pretrained model must be formatted in the same way as the data inputted when training the model (helps performance immeasurably). There is a certain way that all models from torchvision.models require, and we will do that.

In [32]:
import torchvision.transforms as transforms
import setup_dataholders
import importlib
importlib.reload(setup_dataholders)
manual_transforms = transforms.Compose([
    transforms.Resize((224, 224)), # 1. Reshape all images to 224x224 (though some models may require different sizes)
    transforms.ToTensor(), # 2. Turn image values to between 0 & 1
    transforms.Normalize(mean=[0.485, 0.456, 0.406], # 3. A mean of [0.485, 0.456, 0.406] (across each colour channel)
                         std=[0.229, 0.224, 0.225]) # 4. A standard deviation of [0.229, 0.224, 0.225] (across each colour channel),
])

In [33]:
# Create training and testing DataLoaders as well as get a list of class names
train_dataloader, test_dataloader, class_names = setup_dataholders.create_dataloaders(train_directory=train_dir,
                                                                               test_directory=test_dir,
                                                                               data_transforms=manual_transforms, # resize, convert images to between 0 & 1 and normalize them
                                                                               batch_size=32, # set mini-batch size to 32
                                                                               workers=4) # Fixed: removed type hint from the workers argument

train_dataloader, test_dataloader, class_names

FileNotFoundError: Couldn't find any class folder in data\fgvc-aircraft-2013b\data\images.

**ISSUES BELOW**

In [None]:
# Problems with importing files, so I had to manually add files to syspath
import sys
import os
import importlib

# Add the current working directory to sys.path
sys.path.append(os.getcwd())

# List of Python files you want to import
modules = [
    "computer_vision_test_main",
    "create_custom_dataset",
    "firsttry_model",
    "model_backbone",
    "setup_dataholders"
]

# Import each module and reload to avoid caching issues
for module in modules:
    try:
        imported_module = __import__(module)
        importlib.reload(imported_module)  # Reload in case it was modified
        print(f"Successfully imported {module}")
    except Exception as e:
        print(f"Error importing {module}: {e}")


Error importing computer_vision_test_main: No module named 'FGVC_Aircraft'
Creating image split for: train...
Error importing create_custom_dataset: [Errno 2] No such file or directory: '../data/fgvc-aircraft-2013b/data/images/fgvc-aircraft-2013b/data/images_variant_train.txt'
Successfully imported firsttry_model
Successfully imported model_backbone
Successfully imported setup_dataholders


In [None]:
import os

dataset_path = "data/fgvc-aircraft-2013b"
if os.path.exists(dataset_path):
    print("✅ Dataset folder exists.")
    print("🗂 Contents:", os.listdir(dataset_path))
else:
    print("❌ Dataset folder is missing.")


❌ Dataset folder is missing.
