The task is to detect the presence of AAR cracks on the surface of concrete walls. These cracks are quite thin, web-shaped, and subtle to detect precisely.
We have recently extracted a dataset of small image patches (128×128 px), which is suitable for binary classification. The data is already split into train/val/test folds.
Note that the patches were extracted from large wall images (which you can deduce from the filenames), and each train/val/test split contains patches from different walls to ensure proper evaluation. Also note that the dataset is unbalanced (about 13% positive samples); this has to be taken into account for training and evaluation.
I see the following tasks:
1. Train at least two different neural network classifiers, one being a CNN (e.g., ResNet, VGG, AlexNet...), and the other being a vision transformer (ViT).
2. Optimize their performance by adjusting the model size, learning parameters, use of pretrained weights, data augmentations, sampling strategies to tackle the imbalance, etc.
3. Visualize and analyze the predictions, in particular the mistakes of the model. For ViT, visualize the attention maps.
As a deliverable, along with the 4-page report and working code, also include 3 text files with the final predictions of each model over the three data splits.

### Imports

In [22]:
# imports
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

import os
from PIL import Image
import random

### Data Loading

In [12]:
# Image preprocessing pipeline
transform = transforms.Compose([
    #transforms.Resize((224, 224)),  # Resize the images (disabled)
    transforms.ToTensor(),  # Convert to a tensor
    #transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])  # Normalization (disabled)
])

# Dataset locations on disk
train_dir = "D:\\EPFL\\MA1\\Machine Learning\\Projet 2\\train"
val_dir = "D:\\EPFL\\MA1\\Machine Learning\\Projet 2\\validation"

# One ImageFolder dataset per split, both using the same preprocessing
train_dataset = datasets.ImageFolder(train_dir, transform=transform)
val_dataset = datasets.ImageFolder(val_dir, transform=transform)

# Wrap the datasets in DataLoaders for batching; only the training split is shuffled
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Peek at one batch of images and labels from each loader (train first, then validation)
for loader in (train_loader, val_loader):
    images, labels = next(iter(loader))
    print(images.shape, labels)



torch.Size([32, 3, 128, 128]) tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0])
torch.Size([32, 3, 128, 128]) tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])


### Data augmentation

In [20]:
# Augmentation pipeline used to generate extra training samples.
# It is applied to PIL images and returns PIL images, so the result can be
# saved directly. The previous ToTensor -> ToPILImage round trip at the end of
# the pipeline was redundant and has been removed.
augmentation_transforms = transforms.Compose([
    transforms.RandomRotation(degrees=45),  # Random rotation between -45 and +45 degrees
    transforms.RandomAffine(degrees=0, translate=(0.2, 0.2)),  # Random translation (up to 20% of the image)
])

In [28]:
def augment_class_images(source_dir, output_dir, class_name, num_augmentations=5,
                         transform=None):
    """
    Apply random augmentations to the images of one class and save the results.

    For every file found in ``<source_dir>/<class_name>``, ``num_augmentations``
    augmented copies are written to ``<output_dir>/<class_name>`` under the name
    ``<original stem>_aug_<i>.jpg``. The output folder is created if needed.
    A file that cannot be processed is reported on stdout and skipped, so one
    bad file does not abort the whole run.

    :param source_dir: Path to the folder containing the original data.
    :param output_dir: Path to the folder where augmented images are saved.
    :param class_name: Name of the sub-folder of the target class.
    :param num_augmentations: Number of augmented copies generated per image.
    :param transform: Callable taking a PIL image and returning a PIL image.
        Defaults to the module-level ``augmentation_transforms`` pipeline.
    """
    if transform is None:
        transform = augmentation_transforms

    input_class_dir = os.path.join(source_dir, class_name)
    output_class_dir = os.path.join(output_dir, class_name)
    os.makedirs(output_class_dir, exist_ok=True)

    # Sorted so that the processing order is the same on every run
    # (os.listdir returns entries in arbitrary order).
    for img_name in sorted(os.listdir(input_class_dir)):
        img_path = os.path.join(input_class_dir, img_name)
        if not os.path.isfile(img_path):
            continue  # sub-folders and other non-file entries are not images

        stem = os.path.splitext(img_name)[0]
        try:
            # The context manager closes the file handle; convert() returns a
            # separate, fully loaded image that stays usable afterwards.
            with Image.open(img_path) as raw_image:
                image = raw_image.convert("RGB")

            for i in range(num_augmentations):
                augmented_image = transform(image)
                new_img_name = f"{stem}_aug_{i}.jpg"
                augmented_image.save(os.path.join(output_class_dir, new_img_name))

        except Exception as e:
            # Deliberately broad: this is a best-effort batch job, so report
            # the failing file and carry on with the next one.
            print(f"Erreur lors de l'augmentation de {img_name}: {e}")

# Example usage: write 2 augmented copies of every image of the "Cracked" class
source_directory = "D:\\EPFL\\MA1\\Machine Learning\\Projet 2\\train"
output_directory = "Project 2\\augmented_train"
augment_class_images(
    source_dir=source_directory,
    output_dir=output_directory,
    class_name="Cracked",
    num_augmentations=2,
)


In [None]:
# Display a few of the augmented images
augmented_dir = "Project 2\\augmented_train\\Cracked"
# Sort the listing so the same five files are shown on every run
# (os.listdir returns entries in arbitrary order).
augmented_images = sorted(os.listdir(augmented_dir))[:5]

for img_name in augmented_images:
    img_path = os.path.join(augmented_dir, img_name)
    # Keep the file open only while the figure is being drawn, then close it
    with Image.open(img_path) as img:
        plt.imshow(img)
        plt.title(img_name)
        plt.axis('off')
        plt.show()


FileNotFoundError: [WinError 3] Le chemin d’accès spécifié est introuvable: 'D:\\EPFL\\MA1\\Machine Learning\\Projet 2\\augmented_train\\cracks'

### Models

In [15]:
class CrackDetectionCNN(nn.Module):
    """
    Small convolutional network for binary crack classification.

    Architecture: two (3x3 convolution -> ReLU -> 2x2 max-pool) stages with 32
    and 64 channels, followed by a 128-unit fully connected layer with dropout
    and a single sigmoid output. The forward pass therefore returns a value in
    [0, 1] per image, with shape ``(batch, 1)``.

    :param in_channels: Number of channels of the input images (default 3).
    :param input_size: Spatial size of the input images, either a single int
        for square images or a ``(height, width)`` pair (default 128).
    :param dropout: Dropout probability before the output layer (default 0.5).
    :raises ValueError: If ``input_size`` is too small to survive the two
        pooling stages.
    """

    def __init__(self, in_channels=3, input_size=128, dropout=0.5):
        super().__init__()

        if isinstance(input_size, int):
            height, width = input_size, input_size
        else:
            height, width = input_size

        # Each 2x2 max-pool halves the spatial size (rounding down), and the
        # padded 3x3 convolutions keep it unchanged, so the feature map that
        # reaches the classifier is (height // 4) x (width // 4).
        feat_h = height // 2 // 2
        feat_w = width // 2 // 2
        if feat_h < 1 or feat_w < 1:
            raise ValueError(
                f"input_size {input_size!r} is too small for two 2x2 pooling stages"
            )

        self.conv_layers = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=3, stride=1, padding=1),  # Conv1
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # Pool1
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),  # Conv2
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)  # Pool2
        )
        self.fc_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * feat_h * feat_w, 128),  # FC1 (64 * 32 * 32 inputs for 128x128 images)
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),  # FC2 (binary output)
            nn.Sigmoid()  # Output between 0 and 1
        )

    def forward(self, x):
        """Return the sigmoid output, shape ``(batch, 1)``, for a batch of images ``x``."""
        x = self.conv_layers(x)
        x = self.fc_layers(x)
        return x

In [16]:
# Run on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Instantiate the network and move it to the selected device
model = CrackDetectionCNN()
model = model.to(device)

# Binary cross-entropy loss and an Adam optimizer over all model parameters
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [17]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    """
    Train ``model`` and evaluate it on the validation data after every epoch.

    Each epoch runs one optimization pass over ``train_loader``, then a
    gradient-free pass over ``val_loader``. Predictions are obtained by
    thresholding the model output at 0.5. A one-line summary (average train
    loss, average validation loss, validation accuracy) is printed per epoch.

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on any global ``device`` variable.

    :param model: Network returning one value per sample (e.g. shape ``(batch, 1)``).
    :param train_loader: Iterable of ``(images, labels)`` training batches supporting ``len()``.
    :param val_loader: Iterable of ``(images, labels)`` validation batches supporting ``len()``.
    :param criterion: Loss called as ``criterion(outputs, labels)`` with float labels.
    :param optimizer: Optimizer updating the parameters of ``model``.
    :param num_epochs: Number of passes over the training data.
    :return: List with one dict per epoch holding ``"train_loss"``,
        ``"val_loss"`` and ``"val_accuracy"`` (in percent). Averages over an
        empty loader are reported as NaN.
    """
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    history = []
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for images, labels in train_loader:
            images = images.to(run_device)
            labels = labels.to(run_device).float()  # float labels for BCELoss

            optimizer.zero_grad()
            # reshape(-1) gives shape (batch,) even for a batch of one sample,
            # where squeeze() would return a 0-d tensor that no longer matches
            # the labels.
            outputs = model(images).reshape(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for images, labels in val_loader:
                images = images.to(run_device)
                labels = labels.to(run_device).float()
                outputs = model(images).reshape(-1)
                val_loss += criterion(outputs, labels).item()

                # Predictions: positive class when the output exceeds 0.5
                predicted = (outputs > 0.5).float()
                correct += (predicted == labels).sum().item()
                total += labels.size(0)

        nan = float('nan')
        avg_train_loss = train_loss / len(train_loader) if len(train_loader) else nan
        avg_val_loss = val_loss / len(val_loader) if len(val_loader) else nan
        val_accuracy = 100 * correct / total if total else nan

        print(f"Epoch [{epoch+1}/{num_epochs}], "
              f"Train Loss: {avg_train_loss:.4f}, "
              f"Val Loss: {avg_val_loss:.4f}, "
              f"Val Accuracy: {val_accuracy:.2f}%")

        history.append({
            "train_loss": avg_train_loss,
            "val_loss": avg_val_loss,
            "val_accuracy": val_accuracy,
        })

    return history
        

In [None]:
# Takes far too long to run
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=5,
)

KeyboardInterrupt: 

In [None]:
# Load, preprocess and classify a single image
def predict_image(image_path, model, threshold=0.5):
    """
    Classify one image file as cracked or not cracked.

    The image is opened, converted to RGB, turned into a tensor (no resizing
    or normalization), and passed through ``model`` as a batch of one on the
    device the model's parameters live on.

    :param image_path: Path of the image file to classify.
    :param model: Network returning a single value for a one-image batch.
    :param threshold: Model outputs strictly above this value are reported as
        a crack (default 0.5).
    :return: ``"Crack Detected"`` or ``"No Crack Detected"``.
    """
    model.eval()
    preprocess = transforms.Compose([
        #transforms.Resize((128, 128)),
        transforms.ToTensor(),
        #transforms.Normalize(mean=[0.5], std=[0.5])
    ])

    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    # The context manager closes the file handle; convert() returns a
    # separate, fully loaded image that stays usable afterwards.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')
    batch = preprocess(image).unsqueeze(0).to(run_device)  # Add a batch dimension

    # Inference only: no need to record operations for autograd
    with torch.no_grad():
        output = model(batch).item()
    return "Crack Detected" if output > threshold else "No Crack Detected"

# Classify one example image from the test folder and print the verdict
image_path = "D:\\EPFL\\MA1\\Machine Learning\\Projet 2\\test\\example.jpg"
result = predict_image(image_path=image_path, model=model)
print(result)

In [None]:
def visualize_predictions(data_loader, model, num_images=5):
    """
    Show the first ``num_images`` images of ``data_loader`` with their labels
    and the model's predictions.

    Each image is displayed in its own figure titled
    ``"Label: <label>, Predicted: <prediction>"``, where the prediction is the
    model output thresholded at 0.5. Batches are moved to the device the
    model's parameters live on.

    :param data_loader: Iterable of ``(images, labels)`` batches; images are
        expected channel-first, i.e. ``(batch, channels, height, width)``.
    :param model: Network returning one value per sample.
    :param num_images: Maximum number of images to display.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    images_shown = 0
    with torch.no_grad():
        for images, labels in data_loader:
            # reshape(-1) keeps one entry per sample even for a batch of one,
            # where squeeze() would return a 0-d tensor that cannot be indexed.
            outputs = model(images.to(run_device)).reshape(-1)
            predicted = (outputs > 0.5).float().cpu()

            for image, label, prediction in zip(images, labels, predicted):
                if images_shown >= num_images:
                    return
                # imshow needs a CPU array in channel-last (H, W, C) layout,
                # while the loader yields channel-first (C, H, W) tensors.
                picture = image.detach().cpu().permute(1, 2, 0)
                if picture.size(2) == 1:
                    picture = picture.squeeze(2)  # single-channel image -> (H, W)
                plt.imshow(picture.numpy())
                plt.title(f"Label: {int(label.item())}, Predicted: {int(prediction.item())}")
                plt.show()
                images_shown += 1

# Show the first validation images together with the model's predictions
visualize_predictions(data_loader=val_loader, model=model)