In [None]:
pip install transformers

Step 1 – Imports

In [None]:
# Step 1 - Imports

import os                      # work with folders and file paths
import pickle                  # save and load Python dictionaries
import shutil                  # remove folders such as .ipynb_checkpoints

import torch                   # main deep learning framework (PyTorch)
from PIL import Image          # load and handle images

from transformers import AutoImageProcessor, Dinov2Model  # DINOv2 image processor and model


Step 2 – Load DINOv2 ViT-L/14 model

In [None]:
# Step 2 - Load DINOv2 ViT-L/14 model

# Checkpoint identifier shared by the image processor and the model.
dinov2_model_name = "facebook/dinov2-large"

# The processor prepares images as model inputs (used in Step 6).
image_processor = AutoImageProcessor.from_pretrained(dinov2_model_name)

# Load the pretrained weights and switch straight to evaluation mode
# (inference only, no training).
dinov2_model = Dinov2Model.from_pretrained(dinov2_model_name).eval()

# Leave the model as the cell's last expression so its repr is displayed.
dinov2_model


Step 3 – Show model summary

In [None]:
# Step 3 - Show DINOv2 ViT-L/14 model summary

print(dinov2_model)  # print the architecture and layers of the loaded DINOv2 model (facebook/dinov2-large)

Step 4 – Prepare feature storage and image directory

In [None]:
# Step 4 - Prepare feature storage and image directory

# Folder that contains the input images (a path relative to the working directory).
directory = r"test B- front masked images - side"

# Empty mapping that will hold one feature vector per processed image.
features = dict()


Step 5 – Remove .ipynb_checkpoints folder

In [None]:
# Step 5 - Remove any '.ipynb_checkpoints' folders inside the given folder

# Reuse the image directory configured in Step 4 instead of repeating the path
# literal, so the cleanup always targets the same folder that is scanned later.
folder = directory

# Only the checkpoint folder located directly inside `folder` is considered;
# nested sub-folders are not searched.
checkpoint_dir = os.path.join(folder, ".ipynb_checkpoints")

if os.path.isdir(checkpoint_dir):                          # nothing to do when the folder is absent
    shutil.rmtree(checkpoint_dir)                          # delete the checkpoint folder and its contents
    print("Removed:", checkpoint_dir)                      # report which folder was removed


Step 6 – Extract features with DINOv2 ViT-L/14

In [None]:
# Step 6 - Loop through images, preprocess with DINOv2 ViT-L/14, and extract features

valid_extensions = {".png", ".jpg", ".jpeg", ".bmp", ".gif"}  # allowed image file extensions

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and the insertion order of `features`) reproducible across runs.
file_list = sorted(os.listdir(directory))
print(f"Found {len(file_list)} items in directory: {directory}")  # print how many items were found

# Keep only regular files with an allowed image extension, so sub-folders and
# non-image files are not counted in the progress total printed below.
image_names = [
    name
    for name in file_list
    if os.path.isfile(os.path.join(directory, name))
    and os.path.splitext(name)[1].lower() in valid_extensions
]

# Run inference on whatever device the model currently lives on, so the loop
# keeps working if the model was moved (e.g. to a GPU) after loading.
model_device = next(dinov2_model.parameters()).device

for idx, image_name in enumerate(image_names, start=1):       # index starts at 1 for readable progress
    image_path = os.path.join(directory, image_name)          # build the full path to the current image

    # Open inside a context manager so the file handle is released
    # deterministically; convert() yields an in-memory RGB copy that stays
    # usable after the file is closed.
    with Image.open(image_path) as raw_image:
        img = raw_image.convert("RGB")

    inputs = image_processor(images=img, return_tensors="pt") # preprocess the image and get PyTorch tensors
    inputs = {key: tensor.to(model_device) for key, tensor in inputs.items()}  # match the model's device

    with torch.no_grad():                                     # disable gradient calculation for inference
        outputs = dinov2_model(**inputs)                      # run the image through the DINOv2 model
        cls_embedding = outputs.last_hidden_state[:, 0, :]    # take the token at index 0 ([CLS]) as global feature

    # 2-D array of shape (1, hidden_dim): one row because a single image is processed per step.
    feature_vector = cls_embedding.cpu().numpy()

    features[image_path] = feature_vector                     # store the feature vector using full path as the key

    print(f"Processed {idx}/{len(image_names)}: {image_name}")  # progress relative to the number of images


In [None]:
# Display the shape of `feature_vector`, i.e. the vector of the last image
# processed by the extraction loop in Step 6.
feature_vector.shape

Step 7 – Rename keys to filenames only and save as a pickle file

In [None]:
# Step 7 - Rename dictionary keys to filenames only and save to a pickle file

# Build a copy of the mapping keyed by bare filename (directory part dropped),
# then rebind `features` to that copy.
features_renamed = {
    os.path.basename(stored_path): stored_vec
    for stored_path, stored_vec in features.items()
}
features = features_renamed

# Show at most the first five keys as a quick sanity check.
print("Sample keys after rename:")
for sample_key in list(features)[:5]:
    print(sample_key)

# Serialize the re-keyed dictionary to disk.
pickle_path = "dinov2_large_side_masked_features-testB.pkl"
with open(pickle_path, "wb") as pickle_file:
    pickle.dump(features, pickle_file)

print(f"Saved {len(features)} feature vectors to {pickle_path}")


Step 8 – Compare image filenames and dictionary keys

In [None]:
# Step 8 - Compare image filenames in the directory with keys in the features dictionary

# Filenames of all regular files in the directory that carry an allowed image extension.
image_files_in_dir = [
    entry
    for entry in os.listdir(directory)
    if os.path.isfile(os.path.join(directory, entry))
    and os.path.splitext(entry)[1].lower() in valid_extensions
]

files_set = set(image_files_in_dir)
keys_set = set(features)

# Differences in both directions between what is on disk and what was stored.
files_not_in_dict = files_set.difference(keys_set)     # on disk, but no dictionary entry
keys_not_in_folder = keys_set.difference(files_set)    # dictionary entry, but no file on disk

print(f"Number of image files in directory: {len(files_set)}")
print(f"Number of keys in features dictionary: {len(keys_set)}\n")

if files_not_in_dict or keys_not_in_folder:
    print("⚠ Some mismatches were found:\n")

    if files_not_in_dict:
        print(f"Files in directory but NOT in dictionary ({len(files_not_in_dict)}):")
        for missing_name in sorted(files_not_in_dict)[:10]:   # list at most 10 examples
            print("  -", missing_name)
        print()                                               # blank line between the two listings

    if keys_not_in_folder:
        print(f"Keys in dictionary but NO corresponding file in directory ({len(keys_not_in_folder)}):")
        for extra_key in sorted(keys_not_in_folder)[:10]:     # list at most 10 examples
            print("  -", extra_key)
else:
    print("✅ All image filenames and dictionary keys MATCH exactly.")
