# Preprocessing Parallélisé

In [1]:
# Mount Google Drive at /content/drive so the Drive-based paths used by the
# rest of the notebook resolve (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:

from datetime import datetime
from joblib import Parallel, delayed
import multiprocessing
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
from collections import defaultdict
import torch
import pandas as pd
import json
import os
import time

# --- Preprocessing settings -------------------------------------------------
IMG_SIZE = 160    # side length (pixels) of the square images fed to the transforms
N_JOBS = -1       # forwarded to joblib.Parallel as n_jobs
BATCH_SIZE = 32

# --- Locations on the mounted Drive ------------------------------------------
DATA_DIR = "/content/drive/MyDrive/Deep_Learning_project/data"
CSV_PATH = DATA_DIR + "/train_final_essaie.csv"                    # input metadata table
SAVE_PATH = DATA_DIR + "/train_total_160x160.pt"                   # serialized tensor dataset
LABELS_JSON = DATA_DIR + "/country_labels_total_essaie.json"       # label mapping




In [15]:
# Load the metadata table and work out which countries count as "rare".
RARE_THRESHOLD = 1000  # a country with fewer rows than this is treated as rare
df = pd.read_csv(CSV_PATH)
COUNTER = df["country"].value_counts()  # rows per country
CLASSES_RARES = {
    country
    for country, n_rows in COUNTER.items()
    if n_rows < RARE_THRESHOLD
}

In [4]:
# Display the first rows of the metadata table (notebook cell output).
df.head()

Unnamed: 0,id,url,landmark_id,category_name,name,lat,lon,city,state,country,image_path
0,202cd79556f30760,http://upload.wikimedia.org/wikipedia/commons/...,104169,Category:Stirling_Castle,Stirling Castle,56.123889,-3.947778,Stirling,Scotland,United Kingdom,/content/drive/My Drive/Deep_Learning_project/...
1,4072182eddd0100e,https://upload.wikimedia.org/wikipedia/commons...,2474,Category:River_Severn,Aylburton,51.685278,-2.543611,Forest of Dean,England,United Kingdom,/content/drive/My Drive/Deep_Learning_project/...
2,16d8aa057cdd01b9,http://upload.wikimedia.org/wikipedia/commons/...,25719,Category:Duomo_(Monza),Monza Cathedral,45.58359,9.27567,Monza,Lombardy,Italy,/content/drive/My Drive/Deep_Learning_project/...
3,0851a257e5e872ef,https://upload.wikimedia.org/wikipedia/commons...,189446,Category:Castle_of_Peñíscola,Castillo de Peñiscola,40.3588,0.407926,Peníscola / Peñíscola,Valencian Community,Spain,/content/drive/My Drive/Deep_Learning_project/...
4,053d1409647dfba2,https://upload.wikimedia.org/wikipedia/commons...,74821,Category:Eusebius_Church_(Arnhem),Sint-Eusebiuskerk,51.978889,5.91,Arnhem,Gelderland,Netherlands,/content/drive/My Drive/Deep_Learning_project/...


In [5]:
from torchvision import transforms
from PIL import Image
from collections import defaultdict
from joblib import Parallel, delayed
import torch
import time
import multiprocessing
from tqdm import tqdm

class Preprocessing:
    """Convert image files into normalized tensors, oversampling rare classes.

    The class relies on two notebook-level names defined outside of it:
    ``IMG_SIZE`` (side length of the square output images) and
    ``CLASSES_RARES`` (the set of labels that receive random augmentation).
    """

    def __init__(self):
        # Final steps shared by both pipelines. The mean/std values are
        # presumably the ImageNet channel statistics - confirm against the
        # pretrained weights used for training.
        to_normalized_tensor = [
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ]

        # Deterministic pipeline used for non-rare classes.
        self.standard_transform = transforms.Compose([
            transforms.Resize((IMG_SIZE, IMG_SIZE)),
            *to_normalized_tensor,
        ])

        # Randomized pipeline used for rare classes (crop, flip, rotation,
        # colour jitter) so repeated calls yield different samples.
        self.rare_transform = transforms.Compose([
            transforms.RandomResizedCrop(IMG_SIZE, scale=(0.8, 1.0)),
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(15),
            transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
            *to_normalized_tensor,
        ])

    @staticmethod
    def _load_rgb(image_path):
        """Open ``image_path``, return an RGB copy and close the file handle."""
        with Image.open(image_path) as raw:
            # convert() returns a new image, so it stays valid after closing.
            return raw.convert("RGB")

    def image_to_tensor(self, image_path, label=None):
        """Load one image and transform it into a ``(1, C, H, W)`` tensor.

        Args:
            image_path: Path of the image file to read.
            label: Class label of the image; labels found in ``CLASSES_RARES``
                go through the randomized augmentation pipeline.

        Returns:
            ``(label, tensor, rare_path)`` where ``rare_path`` is
            ``image_path`` for rare labels and ``None`` otherwise, or ``None``
            when the image could not be processed (the error is printed).
        """
        try:
            is_rare = label in CLASSES_RARES
            transform = self.rare_transform if is_rare else self.standard_transform
            tensor = transform(self._load_rgb(image_path)).unsqueeze(0)
            return label, tensor, (image_path if is_rare else None)
        except Exception as e:
            # Best effort: one unreadable image must not abort the whole run.
            print(f"Erreur lors du traitement de {image_path}: {e}")
            return None

    def create_label_mapping(self, labels):
        """Build the two-way mapping between labels and contiguous indices.

        Args:
            labels: Iterable of hashable labels (duplicates allowed).

        Returns:
            ``(label_to_idx, idx_to_label)``; indices follow the sorted order
            of the distinct labels.
        """
        distinct = set(labels)
        try:
            unique_labels = sorted(distinct)
        except TypeError:
            # Labels of mixed types (e.g. str and float) cannot be compared
            # directly; fall back to a deterministic type-name/string order.
            unique_labels = sorted(
                distinct, key=lambda value: (type(value).__name__, str(value))
            )
        label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
        idx_to_label = {idx: label for label, idx in label_to_idx.items()}
        return label_to_idx, idx_to_label

    def stack_tensors(self, tensors_by_label, rare_images_by_label=None, multiplier=5):
        """Concatenate per-label tensors into one dataset, augmenting rare classes.

        For a label in ``CLASSES_RARES`` that has recorded image paths, each
        image is re-read and augmented ``multiplier`` times, and the
        precomputed tensors of that label are not used. Every other label
        (including a rare label without recorded paths) contributes its
        precomputed tensors as-is.

        Args:
            tensors_by_label: Mapping label -> list of ``(1, C, H, W)`` tensors.
            rare_images_by_label: Optional mapping label -> list of image paths.
            multiplier: Number of augmented samples generated per rare image.

        Returns:
            ``(images_tensor, labels_tensor, idx_to_label)``, or
            ``(None, None, None)`` when the tensors cannot be concatenated
            (for instance when there is nothing to stack).
        """
        label_to_idx, idx_to_label = self.create_label_mapping(tensors_by_label.keys())
        all_tensors, all_labels = [], []

        for label, tensors in tensors_by_label.items():
            label_idx = label_to_idx[label]
            # .get() avoids a KeyError on plain dicts and avoids inserting
            # empty entries into a defaultdict.
            rare_paths = rare_images_by_label.get(label) if rare_images_by_label else None

            if rare_paths and label in CLASSES_RARES:
                for path in rare_paths:
                    try:
                        image = self._load_rgb(path)
                        augmented = [
                            self.rare_transform(image).unsqueeze(0)
                            for _ in range(multiplier)
                        ]
                    except Exception as e:
                        print(f"Erreur lors de l'augmentation de {path}: {e}")
                        continue
                    all_tensors.extend(augmented)
                    all_labels.extend([label_idx] * len(augmented))
            else:
                all_tensors.extend(tensors)
                all_labels.extend([label_idx] * len(tensors))

        try:
            images_tensor = torch.cat(all_tensors)
            labels_tensor = torch.tensor(all_labels, dtype=torch.long)
            return images_tensor, labels_tensor, idx_to_label
        except Exception as e:
            print(f"Erreur lors de l'empilement des tenseurs: {e}")
            return None, None, None

    def store_image_tensors(self, df, image_path_column='image_path', label_column='country',
                            n_jobs=-1, multiplier=5):
        """Process every image referenced by ``df`` in parallel and stack the result.

        Args:
            df: DataFrame holding one row per image.
            image_path_column: Name of the column containing the file paths.
            label_column: Name of the column containing the class labels.
            n_jobs: Forwarded to ``joblib.Parallel``.
            multiplier: Number of augmented samples generated per rare image.

        Returns:
            ``(images_tensor, labels_tensor, idx_to_label)`` as returned by
            ``stack_tensors``.
        """
        # Rows without a path cannot be loaded, and a missing label (NaN)
        # cannot be ordered against string labels when the mapping is built.
        usable = df.dropna(subset=[image_path_column, label_column])
        skipped = len(df) - len(usable)
        if skipped:
            print(f"{skipped} lignes ignorées (chemin ou label manquant)")

        image_paths = usable[image_path_column].tolist()
        labels = usable[label_column].tolist()

        start_time = time.time()

        results = Parallel(n_jobs=n_jobs)(
            delayed(self.image_to_tensor)(path, label)
            for path, label in tqdm(zip(image_paths, labels), total=len(labels))
        )

        tensors_by_label = defaultdict(list)
        rare_images_by_label = defaultdict(list)

        for res in results:
            if res is None:  # image failed to load; already reported
                continue
            label, tensor, rare_path = res
            tensors_by_label[label].append(tensor)
            if rare_path:
                rare_images_by_label[label].append(rare_path)

        images_tensor, labels_tensor, idx_to_label = self.stack_tensors(
            tensors_by_label, rare_images_by_label, multiplier=multiplier
        )

        end_time = time.time()
        print(f"Temps total de traitement : {end_time - start_time:.2f} secondes")

        return images_tensor, labels_tensor, idx_to_label


In [16]:
# Keep only rows with a usable path: non-null and not containing the
# "Image introuvable" marker.
has_path = df['image_path'].notna()
df = df.loc[has_path]
is_placeholder = df['image_path'].str.contains("Image introuvable")
df = df.loc[~is_placeholder]

# Shuffle reproducibly (fixed seed), then cap the working set at 100000 rows.
df = df.sample(frac=1, random_state=42).iloc[:100000]

In [17]:
# Display (rows, columns) of the filtered and sampled table (notebook cell output).
df.shape

(100000, 11)

In [1]:
import torch
import torchvision
import os

# Report library versions and pick the compute device.
print("PyTorch version:", torch.__version__)
print("Torchvision version:", torchvision.__version__)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("GPU disponible :", torch.cuda.get_device_name(0))
    total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print("Mémoire GPU totale :", round(total_gib, 2), "Go")
else:
    print("GPU non détecté, utilisation du CPU")

# Limit PyTorch's intra-op CPU parallelism to two threads.
torch.set_num_threads(2)

PyTorch version: 2.6.0+cu124
Torchvision version: 0.21.0+cu124
GPU disponible : NVIDIA A100-SXM4-40GB
Mémoire GPU totale : 39.56 Go


In [None]:
# Run the full preprocessing over `df` with N_JOBS joblib workers.
# store_image_tensors returns (images_tensor, labels_tensor, idx_to_label),
# or (None, None, None) when stacking fails.
prepro = Preprocessing()
tensors, labels, label_mapping = prepro.store_image_tensors(df, n_jobs=N_JOBS)

📦 Traitement de 100000 images avec 12 CPU...


Le flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.
 70%|███████   | 70008/100000 [32:07<12:02, 41.54it/s][A[A

 70%|███████   | 70020/100000 [32:07<11:55, 41.89it/s][A[A

 70%|███████   | 70032/100000 [32:07<11:16, 44.28it/s][A[A

 70%|███████   | 70044/100000 [32:07<11:15, 44.33it/s][A[A

 70%|███████   | 70056/100000 [32:08<11:08, 44.82it/s][A[A

 70%|███████   | 70068/100000 [32:08<11:08, 44.79it/s][A[A

 70%|███████   | 70080/100000 [32:08<12:00, 41.51it/s][A[A

 70%|███████   | 70092/100000 [32:09<13:33, 36.79it/s][A[A

 70%|███████   | 70104/100000 [32:09<14:41, 33.91it/s][A[A

 70%|███████   | 70116/100000 [32:09<15:58, 31.17it/s][A[A

 70%|███████   | 70128/100000 [32:10<14:33, 34.21it/s][A[A

 70%|███████   | 70140/100000 [32:10<13:33, 36.69it/s][A[A

 70%|███████   | 70152/100000 [32:10<12:44, 39.05it/s][A[A

 70%|███████   | 70164/100000 [32:11<12:11, 40.77it/s][A[A

 70%|███████   | 70176/100000 [32:11<12:

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import random_split, DataLoader
from torchvision.models import efficientnet_b0
from sklearn.metrics import accuracy_score, top_k_accuracy_score
import datetime
import json

# Training configuration: checkpoint path is stamped with the launch time so
# successive runs do not overwrite each other.
_run_stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
MODEL_SAVE_PATH = (
    "/content/drive/MyDrive/Deep_Learning_project/models/efficientnet_"
    + _run_stamp
    + ".pt"
)

# Optimisation hyper-parameters.
BATCH_SIZE = 32
EPOCHS = 10
LR = 1e-4
VAL_RATIO = 0.1  # fraction of the dataset held out for validation

# Prefer the GPU when one is visible to PyTorch.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [2]:
import os
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torchvision.models import efficientnet_b0, EfficientNet_B0_Weights
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, top_k_accuracy_score

# Training configuration for this cell.
MODEL_SAVE_PATH = "best_model.pth"  # checkpoint of the best validation loss
VAL_RATIO = 0.2                     # fraction of the dataset held out for validation
BATCH_SIZE = 32
EPOCHS = 50                         # upper bound; early stopping may end sooner
LR = 1e-4
patience = 5                        # epochs without improvement before stopping

# Prefer the GPU when one is visible to PyTorch.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Class-index targets must be int64 for nn.CrossEntropyLoss.
labels = labels.long()
dataset = torch.utils.data.TensorDataset(tensors, labels)

# The number of output classes is taken from the label mapping stored on disk.
with open(LABELS_JSON, "r", encoding="utf-8") as f:
    label_mapping = json.load(f)
NUM_CLASSES = len(label_mapping)

# Hold out VAL_RATIO of the samples for validation. The split is seeded so the
# validation set - and therefore the reported metrics - is reproducible across
# runs, in line with the seeded sampling of the dataframe.
val_size = int(len(dataset) * VAL_RATIO)
train_size = len(dataset) - val_size
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Pretrained EfficientNet-B0 whose classification head is replaced by a linear
# layer with one output per class.
weights = EfficientNet_B0_Weights.DEFAULT
model = efficientnet_b0(weights=weights)
model.classifier[1] = nn.Linear(model.classifier[1].in_features, NUM_CLASSES)
model = model.to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

# Early-stopping state and loss history for plotting.
best_val_loss = float("inf")
patience_counter = 0
train_losses, val_losses = [], []

# Validate the targets once, up front, instead of asserting on every batch:
# asserts are stripped under `python -O`, and per-batch min()/max() on device
# tensors force a synchronisation at each step.
all_targets = dataset.tensors[1]
if all_targets.dtype != torch.long:
    raise TypeError(f"Les labels doivent être de type torch.long, reçu {all_targets.dtype}")
if all_targets.numel() > 0:
    lowest, highest = int(all_targets.min()), int(all_targets.max())
    if lowest < 0 or highest >= NUM_CLASSES:
        raise ValueError(
            f"Labels hors intervalle [0, {NUM_CLASSES - 1}] : min={lowest}, max={highest}"
        )

for epoch in range(EPOCHS):
    # ---- training pass ----
    model.train()
    running_loss = 0.0
    # Batch variables get their own names so the notebook-level `labels`
    # tensor (reused later to save the dataset) is not overwritten.
    for batch_images, batch_labels in train_loader:
        batch_images = batch_images.to(DEVICE)
        batch_labels = batch_labels.to(DEVICE)

        optimizer.zero_grad()
        outputs = model(batch_images)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch average is per-sample.
        running_loss += loss.item() * batch_images.size(0)

    avg_train_loss = running_loss / len(train_loader.dataset)

    # ---- validation pass ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch_images, batch_labels in val_loader:
            batch_images = batch_images.to(DEVICE)
            batch_labels = batch_labels.to(DEVICE)
            outputs = model(batch_images)
            loss = criterion(outputs, batch_labels)
            val_loss += loss.item() * batch_images.size(0)
    avg_val_loss = val_loss / len(val_loader.dataset)

    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)

    print(f"[{epoch+1}/{EPOCHS}] Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

    # ---- checkpointing / early stopping on validation loss ----
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print(f"Modèle sauvegardé: {MODEL_SAVE_PATH}")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping déclenché à l'epoch {epoch+1}")
            break

# Plot the per-epoch training and validation losses on a single axis.
fig, ax = plt.subplots(figsize=(8, 5))
for history, legend_name in ((train_losses, "Train Loss"), (val_losses, "Val Loss")):
    ax.plot(history, label=legend_name)
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Courbes de perte")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

# Evaluate the model as it stands at the end of training (last-epoch weights,
# which are not necessarily those of the best checkpoint) on the validation set.
model.eval()
y_true, y_pred, y_proba = [], [], []
with torch.no_grad():
    # Distinct batch names keep the notebook-level `labels` tensor intact.
    for batch_images, batch_labels in val_loader:
        batch_images = batch_images.to(DEVICE)
        outputs = model(batch_images)
        probs = torch.softmax(outputs, dim=1)
        preds = torch.argmax(probs, dim=1)

        y_true.extend(batch_labels.cpu().numpy())
        y_pred.extend(preds.cpu().numpy())
        y_proba.extend(probs.cpu().numpy())

acc = accuracy_score(y_true, y_pred)
# Pass the full class list explicitly: without it scikit-learn infers the
# classes from y_true and raises when the validation split does not contain
# every class (likely here, given the rare classes).
top5 = top_k_accuracy_score(y_true, y_proba, k=5, labels=list(range(NUM_CLASSES)))
print(f"Accuracy: {acc:.4f} | Top-5 Accuracy: {top5:.4f}")

In [None]:
# Reload the best checkpoint (lowest validation loss) and evaluate it.
# map_location makes the load work even if the checkpoint was written from a
# different device than the one currently in use.
model.load_state_dict(torch.load(MODEL_SAVE_PATH, map_location=DEVICE))
model.eval()

y_true, y_pred, y_proba = [], [], []

with torch.no_grad():
    # Distinct batch names keep the notebook-level `labels` tensor intact.
    for batch_images, batch_labels in val_loader:
        batch_images = batch_images.to(DEVICE)
        outputs = model(batch_images)
        probs = torch.softmax(outputs, dim=1)
        preds = torch.argmax(probs, dim=1)

        y_true.extend(batch_labels.cpu().numpy())
        y_pred.extend(preds.cpu().numpy())
        y_proba.extend(probs.cpu().numpy())

acc = accuracy_score(y_true, y_pred)
# Pass the full class list explicitly: without it scikit-learn infers the
# classes from y_true and raises when the validation split does not contain
# every class.
top5 = top_k_accuracy_score(y_true, y_proba, k=5, labels=list(range(NUM_CLASSES)))

print(f"Accuracy: {acc:.4f} | Top-5 Accuracy: {top5:.4f}")

In [None]:
# Persist the preprocessed tensors as a TensorDataset.
# NOTE(review): `labels` must be the full label tensor (same length as
# `tensors`), not a mini-batch - TensorDataset requires matching first dimensions.
dataset = torch.utils.data.TensorDataset(tensors, labels)
torch.save(dataset, SAVE_PATH)
print(f"Dataset sauvegardé sous : {SAVE_PATH}")

# Persist the label mapping next to it as JSON.
serialized_mapping = json.dumps(label_mapping)
with open(LABELS_JSON, "w") as f:
    f.write(serialized_mapping)
print(f"Mapping sauvegardé sous : {LABELS_JSON}")
print(f"Mapping sauvegardé sous : {LABELS_JSON}")