In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torchvision

In [2]:
import pandas as pd
import re
import os
import shutil
from difflib import get_close_matches

def preprocess_name(name):
    """
    Standardize product names for better matching:
      - Convert to lowercase
      - Replace abbreviations (before removing punctuation)
      - Remove special characters
    """
    # Convert to lowercase for consistency
    name = name.lower()

    # Abbreviation mappings
    replacements = {
        'c.gaz': 'cannete_gazifiée',
        'c.jus': 'cannete_jus',
        'or': 'orange',
        'pe': 'peche',
        'abr': 'abricot',
        'fr': 'fraise',
        'pom': 'pomme',
        'ban': 'banane',
        'mang': 'mangue',
        'gren': 'grenadine',
        'cit': 'citron',
        'c.malt': 'cannete_maltée',
        'w.f': 'water_fruits'
    }

    # Replace abbreviations first (keeping punctuation so that patterns match)
    for abbr, full in replacements.items():
        name = re.sub(rf'\b{re.escape(abbr)}\b', full, name)

    # Now remove special characters (including dots, spaces, underscores, etc.)
    name = re.sub(r'[^a-z0-9]', '', name)

    return name

def get_simplified_famille(filename):
    """
    Classify products into three main categories: PET, Cannete, and Autres

    Args:
        filename: The image filename
    Returns:
        str: One of 'PET', 'Cannete', or 'Autres'
    """
    filename_lower = filename.lower()
    processed_name = preprocess_name(filename_lower)

    # Check for PET products
    if 'pet' in processed_name or any(term in processed_name for term in ['energy', 'milk', 'lben']):
        return 'PET'

    # Check for Cannete products
    if 'cannete' in processed_name or any(term in processed_name for term in ['c.gaz', 'c.jus', 'c.malt']):
        return 'Cannete'

    # Everything else goes to Autres
    return 'Pack'

def group_images_by_simplified_famille(source_folder, destination_folder):
    """
    Group images into three main folders: PET, Cannete, and Autres

    Args:
        source_folder: Folder containing images
        destination_folder: Where to create classification folders
    """
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)

    # Track results
    results = {'PET': [], 'Cannete': [], 'Pack': []}

    # Create the three main folders
    for famille in ['PET', 'Cannete', 'Pack']:
        famille_folder = os.path.join(destination_folder, famille)
        if not os.path.exists(famille_folder):
            os.makedirs(famille_folder)

    # Process each image
    for filename in os.listdir(source_folder):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            famille = get_simplified_famille(filename)

            # Move the file to appropriate folder
            source_path = os.path.join(source_folder, filename)
            dest_path = os.path.join(destination_folder, famille, filename)
            shutil.move(source_path, dest_path)
            results[famille].append(filename)

    # Print summary
    print("\nClassification Results:")
    for famille, files in results.items():
        print(f"\n{famille} ({len(files)} files):")
        for filename in files:
            print(f"- {filename}")

    return results

In [3]:
# Example usage
source_folder = "/content"
destination_folder = "/content/classes"

results = group_images_by_simplified_famille(source_folder, destination_folder)


Classification Results:

PET (54 files):
- PET_Frutty_Orange_Peche_2L.png
- ENERGY RED BUF MENTH 33CL..png
- ramy LBEN.png
- ENERGY BUF 33CL.png
- PET_Malt_Miel_33cl.png
- PET_WaterFruits_Agrumes_33cl.png
- PET_Malt_Ananas_33cl.png
- PET_Ramy_Orange_Ananas_1,25L.png
- MILKY PET 1L.png
- PET_Ramy_Cocktail_Mangue_1,25L.png
- PET_Energie_Classique_33cl.png
- PET_Ramy_Pomme_Banane_1,25L.png
- PET_Ramy_Mandarine_1,25L.png
- PET_Ramy_Cocktail_Mure_1,25L.png
- PET_Ramy_Grenadine_1,25L.png
- PET_Extra_Orange_2L.png
- PET_Malt_MFruits_33cl.png
- PET_Ramy_Trio_Fraise_Pomme_Banane_1,25L.png
- PET_Malt_Citron_33cl(1).png
- PET_WaterFruits_Mojito_33cl.png
- PET_Energie_Classique_33cl(1).png
- PET_Ramy_Fraise_1,25L.png
- PET_Ramy_Citron_1,25L.png
- ramy LBEN BIF.png
- PET_Extra_Orange__Peche_Fraise_2L.png
- PET_Malt_Ananas_33cl(1).png
- PET_Ramy_Orange_Peche_Fraise_1,25L.png
- PET_Energie_Power_Fruits_33cl.png
- ENERGY DRINK 33CL.png
- PET_Ramy_Cerise_1,25L.png
- PET_Energie_Miel_33cl.png
- PET_Mal

In [4]:
!pip install -U albumentations



In [15]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, random_split
from PIL import Image

# Import Albumentations and its PyTorch converter
import albumentations as A
from albumentations.pytorch import ToTensorV2

# --- 1. Create a Custom Dataset that uses Albumentations ---
class AlbumentationsDataset(ImageFolder):
    def __init__(self, root, transform=None):
        # Filter out hidden directories
        self.root = root
        valid_classes = [d for d in os.listdir(root)
                        if os.path.isdir(os.path.join(root, d)) and not d.startswith('.')]

        self.classes = sorted(valid_classes)
        self.class_to_idx = {cls_name: i for i, cls_name in enumerate(self.classes)}

        # Get all valid image files
        self.samples = []
        self.targets = []

        valid_extensions = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp')

        for class_name in valid_classes:
            class_idx = self.class_to_idx[class_name]
            class_dir = os.path.join(root, class_name)

            for fname in os.listdir(class_dir):
                if not fname.startswith('.') and fname.lower().endswith(valid_extensions):
                    path = os.path.join(class_dir, fname)
                    self.samples.append((path, class_idx))
                    self.targets.append(class_idx)

        self.targets = np.array(self.targets)
        self.albumentations_transform = transform

    def __getitem__(self, index):
        path, target = self.samples[index]
        # Open image and convert to RGB (as numpy array)
        image = np.array(Image.open(path).convert("RGB"))
        if self.albumentations_transform:
            augmented = self.albumentations_transform(image=image)
            image = augmented["image"]
        return image, target

# --- 2. Define Advanced Augmentations using Albumentations ---
train_transform = A.Compose([
    A.Resize(height=256, width=256),
    A.CenterCrop(height=224, width=224),
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.2),
    A.Rotate(limit=30, p=0.5),
    A.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        max_pixel_value=255.0
    ),
    ToTensorV2()
])

val_transform = A.Compose([
    A.Resize(height=256, width=256),
    A.CenterCrop(height=224, width=224),
    A.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        max_pixel_value=255.0
    ),
    ToTensorV2()
])

def mixup_collate_fn(batch, alpha=0.4):
    """
    Applies MixUp augmentation on a batch.
    """
    images, labels = zip(*batch)
    images = torch.stack(images)
    labels = torch.tensor(labels)

    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0
    batch_size = images.size(0)
    index = torch.randperm(batch_size)

    mixed_images = lam * images + (1 - lam) * images[index, :]
    labels_a, labels_b = labels, labels[index]
    return mixed_images, labels_a, labels_b, lam

def train_model(data_dir, num_epochs=25, batch_size=32, learning_rate=0.001):
    # Load and split dataset
    full_dataset = AlbumentationsDataset(data_dir, transform=train_transform)

    # Calculate split sizes
    train_size = int(0.8 * len(full_dataset))
    val_size = len(full_dataset) - train_size

    # Create train and validation splits
    train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

    # Override the transform for validation dataset
    val_dataset.dataset.albumentations_transform = val_transform

    # Create data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
        collate_fn=lambda batch: mixup_collate_fn(batch, alpha=0.4)
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=4
    )

    # Setup device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Setup model
    model = models.resnet50(pretrained=True)
    num_classes = len(full_dataset.classes)
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    model = model.to(device)

    # Print class information
    print(f"Classes found: {full_dataset.classes}")
    print(f"Class to idx mapping: {full_dataset.class_to_idx}")

    # Setup training
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

    best_val_acc = 0.0

    # Training loop
    for epoch in range(num_epochs):
        print(f'\nEpoch {epoch+1}/{num_epochs}')
        print('-' * 10)

        # Training phase
        model.train()
        running_loss = 0.0
        running_corrects = 0

        for inputs, labels_a, labels_b, lam in train_loader:
            inputs = inputs.to(device)
            labels_a = labels_a.to(device)
            labels_b = labels_b.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)

            # MixUp loss
            loss = lam * criterion(outputs, labels_a) + (1 - lam) * criterion(outputs, labels_b)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)
            _, preds = torch.max(outputs, 1)
            running_corrects += torch.sum(preds == labels_a.data)

        epoch_loss = running_loss / train_size
        epoch_acc = running_corrects.double() / train_size

        print(f'Training Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

        # Validation phase
        model.eval()
        val_loss = 0.0
        val_corrects = 0

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                val_loss += loss.item() * inputs.size(0)
                _, preds = torch.max(outputs, 1)
                val_corrects += torch.sum(preds == labels.data)

        epoch_val_loss = val_loss / val_size
        epoch_val_acc = val_corrects.double() / val_size

        print(f'Validation Loss: {epoch_val_loss:.4f} Acc: {epoch_val_acc:.4f}')

        # Save best model
        if epoch_val_acc > best_val_acc:
            best_val_acc = epoch_val_acc
            torch.save(model.state_dict(), 'best_model.pth')

        scheduler.step()

    print('Training completed!')
    return model

# Usage
if __name__ == "__main__":
    data_dir = '/content/classes'  # Update this path to your dataset directory
    model = train_model(data_dir)

Using device: cuda:0


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 182MB/s]


Classes found: ['Cannete', 'PET', 'Pack']
Class to idx mapping: {'Cannete': 0, 'PET': 1, 'Pack': 2}

Epoch 1/25
----------
Training Loss: 1.0256 Acc: 0.4318
Validation Loss: 0.9073 Acc: 0.6522

Epoch 2/25
----------
Training Loss: 0.9090 Acc: 0.5795
Validation Loss: 0.7208 Acc: 0.7391

Epoch 3/25
----------
Training Loss: 0.8040 Acc: 0.5795
Validation Loss: 0.5685 Acc: 0.8696

Epoch 4/25
----------
Training Loss: 0.7389 Acc: 0.5682
Validation Loss: 0.4612 Acc: 0.8696

Epoch 5/25
----------
Training Loss: 0.6308 Acc: 0.5227
Validation Loss: 0.3776 Acc: 0.9130

Epoch 6/25
----------
Training Loss: 0.5232 Acc: 0.5795
Validation Loss: 0.3213 Acc: 0.9565

Epoch 7/25
----------
Training Loss: 0.6688 Acc: 0.7273
Validation Loss: 0.2868 Acc: 1.0000

Epoch 8/25
----------
Training Loss: 0.3576 Acc: 0.8295
Validation Loss: 0.2827 Acc: 1.0000

Epoch 9/25
----------
Training Loss: 0.3913 Acc: 0.7500
Validation Loss: 0.2791 Acc: 1.0000

Epoch 10/25
----------
Training Loss: 0.3353 Acc: 0.5227
Valid

In [16]:
import pickle

# Assume 'model' is your trained model object
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

print("Model saved successfully.")

Model saved successfully.
