In [None]:
import os
import shutil
import numpy as np
from PIL import Image
from torchvision import models, transforms
import torch
from torch.utils.data import DataLoader, Dataset
from imblearn.under_sampling import NearMiss
from sklearn.preprocessing import LabelEncoder
from collections import Counter

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Pretrained ResNet-50 used purely as a feature extractor: the final fully
# connected layer is swapped for an identity, so a forward pass returns the
# activations that would normally feed the classifier.
resnet = models.resnet50(weights="IMAGENET1K_V1")
resnet.fc = torch.nn.Identity()
resnet = resnet.to(device).eval()  # eval() returns the module itself

# Preprocessing applied to every image before it reaches the network.
# NOTE(review): mean/std are presumably the ImageNet channel statistics that
# match the pretrained weights - confirm if the weights are ever changed.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Custom dataset for loading images
class ImageDataset(Dataset):
    """Dataset that loads images from disk and labels them by parent folder.

    Args:
        image_paths: Indexable collection of image file paths. The name of the
            directory containing each file is used as that image's class label.
        transform: Callable applied to the RGB PIL image; its return value is
            what ``__getitem__`` yields as the image.
    """

    def __init__(self, image_paths, transform):
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        """Return the number of image paths held by the dataset."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load, convert and transform the image at position ``idx``.

        Returns:
            A tuple ``(image, label, image_path)`` where ``image`` is the
            output of ``transform``, ``label`` is the parent directory name of
            the file and ``image_path`` is the original path.
        """
        image_path = self.image_paths[idx]
        # Image.open is lazy and may keep the underlying file open; the
        # context manager releases the handle deterministically. convert()
        # yields a separate in-memory image that stays valid after closing.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')
        image = self.transform(image)
        label = os.path.basename(os.path.dirname(image_path))  # Class name
        return image, label, image_path

def extract_features_in_batches(image_paths, batch_size=32):
    """Run every image through the module-level ``resnet`` in fixed-size batches.

    Args:
        image_paths: Paths of the images to process; wrapped in an
            ``ImageDataset`` together with the module-level ``transform``.
        batch_size: Number of images sent through the network per step.

    Returns:
        A tuple ``(features, labels, paths)``: a NumPy array built from one
        feature row per image, the list of class labels and the list of image
        paths, all in the (unshuffled) order of ``image_paths``.
    """
    loader = DataLoader(
        ImageDataset(image_paths, transform),
        batch_size=batch_size,
        shuffle=False,  # keep outputs aligned with the input order
    )

    feature_rows = []
    label_list = []
    path_list = []

    for batch_images, batch_labels, batch_paths in loader:
        batch_images = batch_images.to(device)
        # Inference only: skip gradient tracking for the forward pass.
        with torch.no_grad():
            embeddings = resnet(batch_images)
        feature_rows += list(embeddings.cpu().numpy())
        label_list += list(batch_labels)
        path_list += list(batch_paths)

    return np.array(feature_rows), label_list, path_list

# DATASET
# Expected layout: <dataset_path>/<class_name>/<image file>
image_paths = []
dataset_path = '../../ORIGINAL DATASET'

# os.listdir returns entries in arbitrary order; sorting makes the sample
# order (and everything derived from it) reproducible across runs and machines.
for class_name in sorted(os.listdir(dataset_path)):
    class_folder = os.path.join(dataset_path, class_name)
    if not os.path.isdir(class_folder):
        continue
    for image_name in sorted(os.listdir(class_folder)):
        candidate_path = os.path.join(class_folder, image_name)
        # Nested directories cannot be opened as images, so keep files only.
        if os.path.isfile(candidate_path):
            image_paths.append(candidate_path)

# Fail early with a clear message instead of an obscure downstream error.
if not image_paths:
    raise FileNotFoundError(f"No image files found under {dataset_path!r}")

# Extract features
features, labels, image_paths = extract_features_in_batches(image_paths)




In [None]:
# Encode the string class labels as integers before resampling
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Balance the classes by undersampling with NearMiss (version 1),
# using the extracted features and the encoded labels
nearmiss = NearMiss(version=1)
features_resampled, y_resampled = nearmiss.fit_resample(features, y)

# Map the indices of the retained samples back to their image files
selected_indices = nearmiss.sample_indices_
selected_image_paths = []
for kept_index in selected_indices:
    selected_image_paths.append(image_paths[kept_index])

print("Class distribution after Near Miss:", Counter(y_resampled))

Class distribution after Near Miss: Counter({np.int64(0): 16, np.int64(1): 16, np.int64(2): 16, np.int64(3): 16, np.int64(4): 16, np.int64(5): 16, np.int64(6): 16, np.int64(7): 16, np.int64(8): 16, np.int64(9): 16, np.int64(10): 16, np.int64(11): 16, np.int64(12): 16, np.int64(13): 16, np.int64(14): 16, np.int64(15): 16, np.int64(16): 16, np.int64(17): 16, np.int64(18): 16, np.int64(19): 16, np.int64(20): 16, np.int64(21): 16, np.int64(22): 16})


In [10]:
# SAVE THE BALANCED DATASET
# Each selected image is copied into <output_folder>/<class_name>/, where
# class_name is the name of the directory the source image lives in.
output_folder = '../../balanced_dataset'
os.makedirs(output_folder, exist_ok=True)

for image_path in selected_image_paths:
    parent_dir, _ = os.path.split(image_path)
    class_name = os.path.basename(parent_dir)
    class_folder = os.path.join(output_folder, class_name)
    # Create the per-class folder on first use; a no-op once it exists.
    os.makedirs(class_folder, exist_ok=True)
    shutil.copy(image_path, class_folder)