This tutorial shows how to use MMPretrain for a multi-label classification task. <br>
This approach is suitable for scenarios where an image may belong to two or more categories. <br>
However, if your dataset includes bounding box annotations, it is recommended to use object detection algorithms instead. <br>

In [1]:
cd mmpretrain

/home/z890/Downloads/ml_sample/classification/mm_Resnet/mmpretrain


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


# Load a model

In [2]:
from mmpretrain.apis import list_models
from mmpretrain.apis import get_model
from mmpretrain.apis import inference_model

In [3]:
# List the model names MMPretrain offers for the multi-label classification task.
list_models(task='Multi-Label Classification')

['resnet101-csra_1xb16_voc07-448px']

In [4]:
# Only one multi-label model is listed, so we load it by name.
# pretrained=True loads the released checkpoint; device='cuda' requests a GPU.
model_name = 'resnet101-csra_1xb16_voc07-448px'
model = get_model(model_name, pretrained=True, device='cuda')

Loads checkpoint by http backend from path: https://download.openmmlab.com/mmclassification/v0/csra/resnet101-csra_1xb16_voc07-448px_20220722-29efb40a.pth


In [5]:
from mmengine.fileio import get
import torch
from PIL import Image
import numpy as np

# Set model to evaluation mode
model.eval()

# Feed the input on whichever device the model's weights live on, instead of
# hard-coding 'cuda', so the tensor and the model can never end up on different devices.
device = next(model.parameters()).device

# Load the image
image_path = 'demo/cat-dog.png'  # replace with your image
with Image.open(image_path) as image_file:
    img = image_file.convert('RGB')

# Preprocessing (resize to 448px as indicated in the model name)
from torchvision import transforms
transform = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.ToTensor(),
])
img_tensor = transform(img).unsqueeze(0).to(device)  # Add batch dimension and move to the model's device

# Run inference without tracking gradients
with torch.no_grad():
    result = model(img_tensor, mode='predict')

# Per-class scores for the single image in the batch, as a NumPy array
scores = result[0].pred_score.cpu().numpy()

# Every class whose score exceeds the threshold counts as a positive prediction
threshold = 0.5
predicted_labels = np.where(scores > threshold)[0]  # Indices of classes above threshold

# Class names in index order (the 20 PASCAL VOC categories)
class_names = [
    "aeroplane", "bicycle", "bird", "boat", "bottle",
    "bus", "car", "cat", "chair", "cow",
    "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
]
predicted_class_names = [class_names[idx] for idx in predicted_labels]
# Keep `scores` as the full per-class array; the filtered values get their own
# name so re-running or inspecting the cell state afterwards is not misleading.
predicted_scores = [scores[idx] for idx in predicted_labels]
print("Predicted classes:", predicted_class_names)
print("Prediction scores:", predicted_scores)

Predicted classes: ['cat', 'dog']
Prediction scores: [np.float32(0.89869225), np.float32(0.62849027)]


# Download the VOC2007 dataset

In [6]:
import os
import requests
import tarfile
from tqdm import tqdm

# Create data directory if it doesn't exist
os.makedirs('data', exist_ok=True)

# URLs for VOC2007 dataset
voc2007_urls = {
    'trainval': 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar',
    'test': 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar'
}

# Download and extract datasets
for split, url in voc2007_urls.items():
    filename = os.path.basename(url)
    filepath = os.path.join('data', filename)
    # Marker file written only after an archive has been fully extracted, so a
    # later "Run All" can skip the long download instead of repeating it.
    done_marker = os.path.join('data', f'.{filename}.extracted')

    if os.path.exists(done_marker):
        print(f"{filename} already downloaded and extracted, skipping.")
        continue

    # Download file
    print(f"Downloading {filename}...")
    # The timeout bounds connection setup and each wait for more bytes (not the
    # whole transfer); the context manager closes the connection when done.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        # Fail here on an HTTP error rather than saving an error page as a .tar
        response.raise_for_status()
        # A missing Content-Length header becomes None so tqdm shows a counter
        # instead of a bar with a bogus total of 0.
        total_size = int(response.headers.get('content-length', 0)) or None

        with open(filepath, 'wb') as f, tqdm(
            total=total_size, unit='B', unit_scale=True, desc=filename
        ) as bar:
            # 1 MiB chunks: far fewer Python-level iterations than 1 KiB chunks
            for data in response.iter_content(chunk_size=1024 * 1024):
                bar.update(f.write(data))

    # Extract file
    print(f"Extracting {filename}...")
    with tarfile.open(filepath) as tar:
        if hasattr(tarfile, 'data_filter'):
            # Use the 'data' extraction filter where this Python provides it, so
            # archive members cannot be written outside the target directory.
            tar.extractall('data', filter='data')
        else:
            tar.extractall('data')

    # Clean up tar file to save space
    os.remove(filepath)

    # Record success last, so an interrupted run is retried from scratch
    with open(done_marker, 'w'):
        pass

print("Download and extraction complete!")

Downloading VOCtrainval_06-Nov-2007.tar...


VOCtrainval_06-Nov-2007.tar: 100%|████████████| 460M/460M [43:35<00:00, 176kB/s]


Extracting VOCtrainval_06-Nov-2007.tar...
Downloading VOCtest_06-Nov-2007.tar...


VOCtest_06-Nov-2007.tar: 100%|████████████████| 451M/451M [15:09<00:00, 496kB/s]


Extracting VOCtest_06-Nov-2007.tar...
Download and extraction complete!


In [7]:
# Create the directory layout of the dataset used for training.
# Only part of the dataset is needed for this task, so we keep just the necessary parts, which makes the tutorial easier to follow.
import os
import shutil

# Make sure the destination skeleton exists before anything is placed in it
for new_dir in ('data/VOC2007', 'data/VOC2007/ImageSets'):
    os.makedirs(new_dir, exist_ok=True)

# Where the archives were extracted, and where the trimmed copy goes
voc_source = 'data/VOCdevkit/VOC2007'
voc_target = 'data/VOC2007'

if not os.path.exists(voc_source):
    print(f"Error: Source directory {voc_source} not found. Check your download.")
else:
    # JPEGImages: try a symlink first, fall back to a full copy on OSError
    src_jpeg = os.path.join(voc_source, 'JPEGImages')
    dst_jpeg = os.path.join(voc_target, 'JPEGImages')
    jpeg_needed = os.path.exists(src_jpeg) and not os.path.exists(dst_jpeg)

    if jpeg_needed:
        try:
            os.symlink(os.path.abspath(src_jpeg), dst_jpeg)
            print(f"Created symlink from {src_jpeg} to {dst_jpeg}")
        except OSError:
            print(f"Copying {src_jpeg} to {dst_jpeg}")
            shutil.copytree(src_jpeg, dst_jpeg)

    # ImageSets/Main: only the two split lists are carried over
    src_main = os.path.join(voc_source, 'ImageSets/Main')
    dst_main = os.path.join(voc_target, 'ImageSets/Main')
    os.makedirs(dst_main, exist_ok=True)

    for file in ('test.txt', 'trainval.txt'):
        src_file = os.path.join(src_main, file)
        dst_file = os.path.join(dst_main, file)

        # Nothing to do when the source is missing or the copy already exists
        if not os.path.exists(src_file) or os.path.exists(dst_file):
            continue

        print(f"Copying {src_file} to {dst_file}")
        shutil.copy2(src_file, dst_file)

    # Annotations: copied (not linked) the first time only
    src_annot = os.path.join(voc_source, 'Annotations')
    dst_annot = os.path.join(voc_target, 'Annotations')
    annot_needed = os.path.exists(src_annot) and not os.path.exists(dst_annot)

    if annot_needed:
        print(f"Copying {src_annot} to {dst_annot}")
        shutil.copytree(src_annot, dst_annot)

    print("Dataset structure prepared successfully!")

Created symlink from data/VOCdevkit/VOC2007/JPEGImages to data/VOC2007/JPEGImages
Copying data/VOCdevkit/VOC2007/ImageSets/Main/test.txt to data/VOC2007/ImageSets/Main/test.txt
Copying data/VOCdevkit/VOC2007/ImageSets/Main/trainval.txt to data/VOC2007/ImageSets/Main/trainval.txt
Copying data/VOCdevkit/VOC2007/Annotations to data/VOC2007/Annotations
Dataset structure prepared successfully!


In [8]:
# Only part of the information in each XML file is needed, so we simplify the files.
# Afterwards, you can create your own dataset by mimicking this format.
import os
import xml.etree.ElementTree as ET
from glob import glob
from tqdm import tqdm

def clean_xml(input_path, output_path):
    """Strip an annotation XML file down to the fields kept by this tutorial.

    At the root level only ``filename``, ``size`` and ``object`` elements
    survive; inside every ``object`` only ``name`` and ``difficult`` survive.

    Args:
        input_path: Path of the XML file to read.
        output_path: Path the reduced XML is written to (may equal input_path).
    """
    root_keep = {'filename', 'size', 'object'}
    object_keep = {'name', 'difficult'}

    tree = ET.parse(input_path)
    root = tree.getroot()

    # Collect the doomed children first: removing while iterating the live
    # element would skip siblings.
    for unwanted in [node for node in root if node.tag not in root_keep]:
        root.remove(unwanted)

    # Same pruning one level down, for each annotated object
    for obj in root.findall('object'):
        for unwanted in [node for node in obj if node.tag not in object_keep]:
            obj.remove(unwanted)

    # Write cleaned XML to output file
    tree.write(output_path)

def main():
    """Clean every annotation XML under ./data/VOC2007/Annotations in place.

    Each file is passed through clean_xml and written back under the same
    name in the same directory; afterwards the first processed file is
    printed as a sample.
    """
    # Input and output share one directory, so files are overwritten
    _dir = './data/VOC2007/Annotations'
    os.makedirs(_dir, exist_ok=True)

    xml_files = glob(os.path.join(_dir, '*.xml'))
    file_count = len(xml_files)
    print(f"Found {file_count} XML files to process")

    for xml_file in tqdm(xml_files, desc="Cleaning XML files"):
        # Same file name, re-rooted in the output directory
        target = os.path.join(_dir, os.path.basename(xml_file))
        clean_xml(xml_file, target)

    print(f"Processed {file_count} XML files")
    print(f"Cleaned annotations saved to {_dir}")

    # Nothing to preview when the directory held no XML files
    if not xml_files:
        return

    sample_file = os.path.join(_dir, os.path.basename(xml_files[0]))
    print("\nSample of cleaned XML:")
    with open(sample_file, 'r') as f:
        print(f.read())


if __name__ == "__main__":
    main()

Found 9963 XML files to process


Cleaning XML files: 100%|████████████████| 9963/9963 [00:00<00:00, 13677.03it/s]

Processed 9963 XML files
Cleaned annotations saved to ./data/VOC2007/Annotations

Sample of cleaned XML:
<annotation>
	<filename>002074.jpg</filename>
	<size>
		<width>500</width>
		<height>375</height>
		<depth>3</depth>
	</size>
	<object>
		<name>tvmonitor</name>
		<difficult>0</difficult>
		</object>
	<object>
		<name>person</name>
		<difficult>0</difficult>
		</object>
	<object>
		<name>pottedplant</name>
		<difficult>0</difficult>
		</object>
	<object>
		<name>person</name>
		<difficult>0</difficult>
		</object>
</annotation>





In [9]:
from mmpretrain.datasets import VOC

# Build both splits from the prepared folder and report how many images each holds
voc_root = 'data/VOC2007'

train_dataset = VOC(data_root=voc_root, split='trainval')
print("Training dataset: {} images".format(len(train_dataset)))

test_dataset = VOC(data_root=voc_root, split='test')
print("Test dataset: {} images".format(len(test_dataset)))

Training dataset: 5011 images
Test dataset: 4952 images


# Train our model

In [None]:
# To customize training, make a copy of configs/csra/resnet101-csra_1xb16_voc07-448px.py and edit the copy.
# Remember to change num_classes if your dataset does not have 20 classes.

In [10]:
# Run MMPretrain's training script from the notebook with the stock CSRA VOC2007 config.
%run tools/train.py configs/csra/resnet101-csra_1xb16_voc07-448px.py

06/02 15:50:32 - mmengine - [4m[97mINFO[0m - 
------------------------------------------------------------
System environment:
    sys.platform: linux
    Python: 3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]
    CUDA available: True
    MUSA available: False
    numpy_random_seed: 17565857
    GPU 0: NVIDIA GeForce RTX 5090
    CUDA_HOME: /usr/local/cuda-12.8
    NVCC: Cuda compilation tools, release 12.8, V12.8.93
    GCC: gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
    PyTorch: 2.7.0+cu128
    PyTorch compiling details: PyTorch built with:
  - GCC 11.2
  - C++ Version: 201703
  - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX512
  - CUDA Runtime 12.8
  - NVCC architecture flags: -gencode;arch=co

In [11]:
import torch
# After training/testing, release the cached GPU memory PyTorch is holding.
# Guarded so the cell is a harmless no-op on machines without CUDA, where
# torch.cuda.synchronize() would otherwise raise.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.synchronize()