In [1]:
from google.colab import files
files.upload()  # Upload kaggle.json here


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"comptelol","key":"e63d845fa8d1890f86a51fa2a6547bfd"}'}

In [2]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [3]:
!kaggle datasets download -d marcozuppelli/stegoimagesdataset


Dataset URL: https://www.kaggle.com/datasets/marcozuppelli/stegoimagesdataset
License(s): DbCL-1.0
Downloading stegoimagesdataset.zip to /content
100% 1.51G/1.51G [01:09<00:00, 23.8MB/s]
100% 1.51G/1.51G [01:09<00:00, 23.2MB/s]


In [4]:
!unzip stegoimagesdataset.zip -d stegoimagesdataset


[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
  inflating: stegoimagesdataset/val/val/stego/image_06334_ps_0.png  
  inflating: stegoimagesdataset/val/val/stego/image_06334_ps_1.png  
  inflating: stegoimagesdataset/val/val/stego/image_06335_eth_0.png  
  inflating: stegoimagesdataset/val/val/stego/image_06335_html_0.png  
  inflating: stegoimagesdataset/val/val/stego/image_06335_url_0.png  
  inflating: stegoimagesdataset/val/val/stego/image_06336_html_0.png  
  inflating: stegoimagesdataset/val/val/stego/image_06336_html_1.png  
  inflating: stegoimagesdataset/val/val/stego/image_06336_url_0.png  
  inflating: stegoimagesdataset/val/val/stego/image_06337_html_0.png  
  inflating: stegoimagesdataset/val/val/stego/image_06337_js_0.png  
  inflating: stegoimagesdataset/val/val/stego/image_06337_url_0.png  
  inflating: stegoimagesdataset/val/val/stego/image_06338_eth_0.png  
  inflating: stegoimagesdataset/val/val/stego/image_06338_js_0.png 

In [5]:
import shutil
import os

# Chemin de base
base_path = 'stegoimagesdataset/test/test/'

# Dossiers à supprimer
folders_to_delete = ['stego_b64', 'stego_zip']

# Suppression des dossiers
for folder in folders_to_delete:
    folder_path = os.path.join(base_path, folder)
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
        print(f"Deleted folder : {folder_path}")
    else:
        print(f"Can't find the folder : {folder_path}")

Deleted folder : stegoimagesdataset/test/test/stego_b64
Deleted folder : stegoimagesdataset/test/test/stego_zip


In [6]:
import os
from PIL import Image

def convert_clean_rgba_to_rgb(root_dirs):
    """
    Parcourt uniquement les sous-dossiers 'clean' de chacun des répertoires racines
    et convertit les images PNG RGBA en RGB.
    """
    for root in root_dirs:
        clean_dir = os.path.join(root, 'clean')
        if not os.path.isdir(clean_dir):
            continue

        for fname in os.listdir(clean_dir):
            if not fname.lower().endswith('.png'):
                continue

            path = os.path.join(clean_dir, fname)
            try:
                with Image.open(path) as img:
                    if img.mode == 'RGBA':
                        rgb_img = img.convert('RGB')
                        rgb_img.save(path)
                        print(f"Converted RGBA → RGB: {path}")
            except Exception as e:
                print(f"Error processing {path}: {e}")

# Exemple d'utilisation :
train_dir = 'stegoimagesdataset/train/train'
val_dir   = 'stegoimagesdataset/val/val'
test_dir  = 'stegoimagesdataset/test/test'

convert_clean_rgba_to_rgb([train_dir, val_dir, test_dir])

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Converted RGBA → RGB: stegoimagesdataset/train/train/clean/00345.png
Converted RGBA → RGB: stegoimagesdataset/train/train/clean/01239.png
Converted RGBA → RGB: stegoimagesdataset/train/train/clean/01364.png
Converted RGBA → RGB: stegoimagesdataset/train/train/clean/03310.png
Converted RGBA → RGB: stegoimagesdataset/train/train/clean/00358.png
Converted RGBA → RGB: stegoimagesdataset/train/train/clean/01329.png
Converted RGBA → RGB: stegoimagesdataset/train/train/clean/01610.png
Converted RGBA → RGB: stegoimagesdataset/train/train/clean/03138.png
Converted RGBA → RGB: stegoimagesdataset/train/train/clean/02468.png
Converted RGBA → RGB: stegoimagesdataset/train/train/clean/03176.png
Converted RGBA → RGB: stegoimagesdataset/train/train/clean/00183.png
Converted RGBA → RGB: stegoimagesdataset/train/train/clean/03999.png
Converted RGBA → RGB: stegoimagesdataset/train/train/clean/02955.png
Converted R

In [3]:
import os
import torch
from torch.utils.data import Dataset
from PIL import Image
import random
import re
import cv2
import numpy as np

class BalancedStegoDatasetGPU(Dataset):
    def __init__(self, clean_dir, stego_dir, transform=None, device='cuda', apply_laplacian=False):
        self.device = device
        self.transform = transform
        self.apply_laplacian = apply_laplacian

        self.images = []
        self.labels = []

        clean_imgs = sorted([f for f in os.listdir(clean_dir) if f.endswith('.png')])

        for clean_img in clean_imgs:
            clean_path = os.path.join(clean_dir, clean_img)
            base_id = os.path.splitext(clean_img)[0]

            img_clean = Image.open(clean_path).convert("RGB")
            img_clean = self.preprocess(img_clean)
            self.images.append(img_clean.to(self.device))
            self.labels.append(torch.tensor(0, dtype=torch.long, device=self.device))

            pattern = f"image_{base_id}_.+\\.png"
            stego_matches = [f for f in os.listdir(stego_dir) if re.match(pattern, f)]

            if stego_matches:
                #stego_img = random.choice(stego_matches)
                for stego_img in stego_matches:
                  stego_path = os.path.join(stego_dir, stego_img)
                  img_stego = Image.open(stego_path).convert("RGB")
                  img_stego = self.preprocess(img_stego)
                  self.images.append(img_stego.to(self.device))
                  self.labels.append(torch.tensor(1, dtype=torch.long, device=self.device))

        self.images = torch.stack(self.images)
        self.labels = torch.stack(self.labels)

    def preprocess(self, img_pil):

        img_np = np.array(img_pil)

        if self.apply_laplacian:
            laplacian_channels = []
            for c in range(3):
                channel = img_np[:, :, c]
                laplacian = cv2.Laplacian(channel, cv2.CV_32F)
                laplacian = cv2.convertScaleAbs(laplacian)
                laplacian_channels.append(laplacian)

            img_np = np.stack(laplacian_channels, axis=-1)

        img_pil = Image.fromarray(img_np)

        if self.transform:
            img_tensor = self.transform(img_pil)
        else:
            img_tensor = transforms.ToTensor()(img_pil)

        return img_tensor

    def __getitem__(self, index):
        return self.images[index], self.labels[index]

    def __len__(self):
        return len(self.labels)



- RandomAffine	Simule de petites translations ➔ le modèle doit apprendre des patterns locaux, pas des positions
- ColorJitter	Simule des changements de conditions lumineuses ➔ force le modèle à se concentrer sur structure et non couleur
- GaussianBlur	Ajoute du micro bruit pour forcer le modèle à être plus sensible aux détails "fins"

In [2]:
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, datasets
from PIL import Image
import os

train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=5),
    transforms.RandomApply([
        transforms.GaussianBlur(kernel_size=(3, 3), sigma=(0.1, 0.5))
    ], p=0.2),
    transforms.RandomApply([
        transforms.ColorJitter(contrast=0.1, brightness=0.1)
    ], p=0.3),
    transforms.RandomAffine(degrees=0, translate=(0.02, 0.02)),
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])


transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_dir = 'stegoimagesdataset/train/train/'
val_dir = 'stegoimagesdataset/val/val/'
test_dir = 'stegoimagesdataset/test/test/'

train_dataset = BalancedStegoDatasetGPU(
    clean_dir='stegoimagesdataset/train/train/clean',
    stego_dir='stegoimagesdataset/train/train/stego',
    transform=train_transform,
    device='cuda',
    apply_laplacian=True
)

val_dataset = BalancedStegoDatasetGPU(
    clean_dir='stegoimagesdataset/val/val/clean',
    stego_dir='stegoimagesdataset/val/val/stego',
    transform=transform,
    device='cuda',
    apply_laplacian=True
)

"""
test_dataset = BalancedStegoDatasetGPU(
    clean_dir='stegoimagesdataset/test/test/clean',
    stego_dir='stegoimagesdataset/test/test/stego',
    transform=transform,
    device='cuda',
    apply_laplacian=True
)
"""

batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
#test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [4]:

test_dataset = BalancedStegoDatasetGPU(
    clean_dir='stegoimagesdataset/test/test/clean',
    stego_dir='stegoimagesdataset/test/test/stego',
    transform=transform,
    device='cuda',
    apply_laplacian=True
)

test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [5]:
from collections import Counter

labels_train = [int(label) for _, label in train_dataset]
labels_val = [int(label) for _, label in val_dataset]
labels_test = [int(label) for _, label in test_dataset]
print(Counter(labels_train))
print(Counter(labels_val))
print(Counter(labels_test))

for input, labels in train_loader:
  print(input.size())
  print(labels.size())
  break

Counter({0: 4000, 1: 4000})
Counter({0: 2000, 1: 2000})
Counter({1: 6000, 0: 2000})
torch.Size([32, 3, 256, 256])
torch.Size([32])


In [6]:
import torch
import torch.nn as nn
import torchvision.models as models

# Use the new weights parameter instead of pretrained
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)


# Replace the last layer to match the number of classes in the dataset
#model.fc = nn.Linear(model.fc.in_features, 2)  # 2 classes : clean and stego
model.fc = nn.Sequential(
    nn.Dropout(p=0.5),
    nn.Linear(model.fc.in_features, 2)
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = model.to(device)

#print(model)

Using device: cuda


In [7]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch
import time

def train(
    model,
    train_loader,
    val_loader,
    criterion,
    optimizer,
    num_epochs=100,
    patience=10,
    save_path="best_model.pth"
):
    best_val_loss = float('inf')
    epochs_no_improve = 0

    scheduler = ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=0.5,
        patience=5,
        verbose=True
    )

    print(f"Starting training for model: {model.__class__.__name__} at {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())} for {num_epochs} epochs")


    for epoch in range(1, num_epochs+1):
        # ---- TRAIN ----
        model.train()
        running_loss = 0.0
        correct = total = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

        train_loss = running_loss / total
        train_acc  = 100 * correct / total

        model.eval()
        val_loss = 0.0
        val_correct = val_total = 0

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss    = criterion(outputs, labels)

                val_loss += loss.item() * inputs.size(0)
                preds     = outputs.argmax(dim=1)
                val_correct += (preds == labels).sum().item()
                val_total   += labels.size(0)

        val_loss = val_loss / val_total
        val_acc  = 100 * val_correct / val_total

        scheduler.step(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            torch.save(model.state_dict(), save_path)
            improve_str = "New Best Model found!"
        else:
            epochs_no_improve += 1
            improve_str = f"No improvement ({epochs_no_improve}/{patience})"

        print(
            f"Epoch {epoch:>3}/{num_epochs}  "
            f"| Train Loss: {train_loss:.4f}, Train Acc: {train_acc:5.2f}%  "
            f"| Val Loss: {val_loss:.4f}, Val Acc: {val_acc:5.2f}%  "
            f"{improve_str}"
        )

        if epochs_no_improve >= patience:
            print(f"Early stopping triggered after {epoch} epochs (val_loss didn't drop).")
            break

    print(f"Training complete. Best Validation Loss: {best_val_loss:.4f}")
    return model



In [8]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss() # Cross entropy loss for multi-class classification
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-4) # Adam optimizer

trained_model = train(model, train_loader, val_loader, criterion, optimizer, 100, 25)



Starting training for model: ResNet at 2025-04-28 13:01:26 for 100 epochs
Epoch   1/100  | Train Loss: 0.6553, Train Acc: 62.91%  | Val Loss: 0.4122, Val Acc: 89.53%  New Best Model found!
Epoch   2/100  | Train Loss: 0.4023, Train Acc: 82.24%  | Val Loss: 0.6487, Val Acc: 76.45%  No improvement (1/25)
Epoch   3/100  | Train Loss: 0.3501, Train Acc: 85.21%  | Val Loss: 0.1562, Val Acc: 99.47%  New Best Model found!
Epoch   4/100  | Train Loss: 0.3493, Train Acc: 85.00%  | Val Loss: 5.7867, Val Acc: 50.00%  No improvement (1/25)
Epoch   5/100  | Train Loss: 0.3267, Train Acc: 86.10%  | Val Loss: 0.0976, Val Acc: 97.58%  New Best Model found!
Epoch   6/100  | Train Loss: 0.3074, Train Acc: 86.97%  | Val Loss: 0.6964, Val Acc: 76.58%  No improvement (1/25)
Epoch   7/100  | Train Loss: 0.3186, Train Acc: 85.54%  | Val Loss: 0.6707, Val Acc: 76.92%  No improvement (2/25)
Epoch   8/100  | Train Loss: 0.2715, Train Acc: 88.11%  | Val Loss: 1.0827, Val Acc: 76.58%  No improvement (3/25)
Epoch 

In [9]:
import torch
import matplotlib.pyplot as plt
import torchvision.transforms.functional as F
import numpy as np
from sklearn.metrics import classification_report

def test_model(model, loader, device):
    model.eval()
    model.to(device)

    all_preds = []
    all_labels = []
    all_images = []

    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, preds = torch.max(outputs, 1)

            all_preds.append(preds.cpu())
            all_labels.append(labels.cpu())
            all_images.append(images.cpu())

    all_preds = torch.cat(all_preds)
    all_labels = torch.cat(all_labels)
    all_images = torch.cat(all_images)

    return all_images, all_preds, all_labels

def show_misclassified_images(images, preds, labels, classes, max_images=10):
    incorrect_indices = (preds != labels).nonzero(as_tuple=True)[0]
    print(f"Nombre d'erreurs : {len(incorrect_indices)}")

    if len(incorrect_indices) == 0:
        print("Aucun échantillon mal classifié.")
        return

    plt.figure(figsize=(15, 5))

    for idx, wrong_idx in enumerate(incorrect_indices[:max_images]):
        img = images[wrong_idx]
        img = F.normalize(img, mean=[-0.485/0.229, -0.456/0.224, -0.406/0.225], std=[1/0.229, 1/0.224, 1/0.225])  # Unnormalize
        img = torch.clamp(img, 0, 1)
        img = img.permute(1, 2, 0).numpy()

        true_label = classes[labels[wrong_idx]]
        pred_label = classes[preds[wrong_idx]]

        plt.subplot(2, 5, idx+1)
        plt.imshow(img)
        plt.title(f"T: {true_label} | P: {pred_label}")
        plt.axis('off')

    plt.tight_layout()
    plt.show()

def generate_classification_report(labels, preds, classes):
    report = classification_report(labels, preds, target_names=classes)
    print("\n=== 📊 Rapport de classification ===\n")
    print(report)


In [10]:
model.load_state_dict(torch.load('best_model.pth'))

images, preds, labels = test_model(model, test_loader, device)

classes = ['clean', 'stego']
#show_misclassified_images(images, preds, labels, classes, max_images=10)

generate_classification_report(labels, preds, classes)


  model.load_state_dict(torch.load('best_model.pth'))



=== 📊 Rapport de classification ===

              precision    recall  f1-score   support

       clean       0.99      0.98      0.99      2000
       stego       0.99      1.00      1.00      6000

    accuracy                           0.99      8000
   macro avg       0.99      0.99      0.99      8000
weighted avg       0.99      0.99      0.99      8000



le modèle a appris à reconnaitre facilement les images clean, par contre il a du mal à reconnaitres les images stégo:

modifications LSB subtiles ?
pas de diversité dans les stégo-images ?
en tout cas le modèle ne peut plus se reposer sur le fait qu'il y a beaucoup plus de stego que de clean.


à tester :
tester une résolution plus grande ???
augmentation, googd
fined tuning plus lent ?
features laplacian ou dct pour capter les changements fins

une loss pondéréee pour, pénaliser plus fort si mauvaise prédiction stégo ?

appliquer laplacian sur les images avant de les passer au CNN, utiliser à la fois l'image et son laplacien en input ?


Le modèle généralise extrêmement bien ➔ il ne s'est pas écroulé malgré Dropout + L2.

Pas de sur-apprentissage ➔ même après 67 epochs, val_loss est restée super stable.

Impact du Laplacien est majeur ➔ car sans Laplacien, le modèle peinait à atteindre des scores >80%-90%.

Dropout + L2 ont rendu le modèle plus robuste sans "casser" sa capacité à apprendre.

