# FixMatch Project

In [None]:
# !pip install torchview torchsummary torchvision kornia torchmetrics matplotlib tqdm path graphviz opencv-python scikit-learn optuna

In [1]:
# system
import sys
sys.path.append('..')

# data
import numpy as np

# deep learning
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# torchvision
import torchvision

# plotting
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

# utils and model
from utils.utils import (
    seedEverything,
    compute_mean_std,
    data_transform,
    normalize,
    plot_images,
)

from utils.model import ConvNN

# FIXMATCH
from fixmatch_DA import fixmatch_DA_train

# os
import os
import path

# sklearn
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.cluster import KMeans

%load_ext autoreload
%autoreload 2

In [2]:
# Set device: prefer Apple MPS, then CUDA, then fall back to CPU.
# `torch.backends.mps` is missing on older PyTorch builds, so probe for it with
# getattr instead of parsing `torch.__version__`. `is_available()` checks that
# MPS can actually be used on this machine, whereas the deprecated
# `torch.has_mps` flag only reported whether the build was compiled with MPS.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

In [3]:
# presumably (channels, height, width) of one input image - TODO confirm
IMG_SHAPE = (3, 32, 32)

# Hyper-parameters, see Table 4
TAU = 0.9              # forwarded as `tau` to fixmatch_DA_train
LAMBDA_U = 3           # forwarded as `lambda_u` to fixmatch_DA_train
MU = 4                 # unlabeled batches hold MU * BATCH_SIZE samples
BATCH_SIZE = 64        # batch size of the train / test / labeled loaders
LR = 3e-2              # SGD learning rate
BETA = 0.9             # SGD momentum
WEIGHT_DECAY = 5e-4    # SGD weight decay

cuda


In [4]:
# Download both CIFAR-10 splits (train first, then test) into ../data
trainset, testset = (
    torchvision.datasets.CIFAR10(
        root='../data',
        train=is_train_split,
        download=True,
        transform=data_transform(),
    )
    for is_train_split in (True, False)
)

# Wrap the datasets in loaders: only the training loader is shuffled
trainloader = DataLoader(
    trainset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)
testloader = DataLoader(
    testset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

In [None]:
# compute mean and std
if not os.path.exists('../data/mean.pt'):
    mean, std = compute_mean_std(trainloader)
    torch.save(mean, '../data/mean.pt')
    torch.save(std, '../data/std.pt')
else:
    mean, std = torch.load('../data/mean.pt'), torch.load('../data/std.pt')

# to numpy
mean, std = mean.numpy(), std.numpy()

print(f"mean: {mean}, std: {std}")

In [None]:
# define classes
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')


In [None]:
# Directory holding the model checkpoints; created on first run.
# exist_ok=True makes the call idempotent and avoids the check-then-create race.
torch_models = './results/models/'
os.makedirs(torch_models, exist_ok=True)

In [8]:
# load full model
model_full = ConvNN(return_features=True).to(device)
model_full.load_state_dict(torch.load(f'../SUP/results/models/model_100.pth'))

<All keys matched successfully>

## IV. Semi-Supervised Learning: Fixmatch with KMeans Clustering on feature space

### IV.0 KMeans clustering

In [None]:
def _select_furthest_per_cluster(
        distances: np.ndarray,
        assignments: np.ndarray,
        budget: float
        ) -> np.ndarray:
    """Pick, inside every cluster, the samples lying furthest from its center.

    Args:
        distances: array of shape (n_samples, n_clusters), distance of every
            sample to every cluster center.
        assignments: array of shape (n_samples,), cluster index of every sample.
        budget: fraction of the samples to select.

    Returns:
        Sorted 1-D int array of row indices into `distances`. The total quota
        `round(budget * n_samples)` is split evenly across clusters (the first
        clusters absorb the remainder); a cluster with fewer members than its
        quota contributes all of its members, so fewer rows than the total
        quota may be returned.
    """
    n_samples, n_clusters = distances.shape
    n_total = min(n_samples, int(round(budget * n_samples)))
    base_quota, remainder = divmod(n_total, n_clusters)

    selected = []
    for cluster_i in range(n_clusters):
        quota = base_quota + (1 if cluster_i < remainder else 0)
        members = np.flatnonzero(assignments == cluster_i)
        if quota == 0 or members.size == 0:
            continue
        # order the members of this cluster from furthest to closest to its
        # center, keeping *global* row indices (not positions inside the cluster)
        furthest_first = members[np.argsort(distances[members, cluster_i])[::-1]]
        selected.append(furthest_first[:quota])

    if not selected:
        return np.empty(0, dtype=np.int64)
    return np.sort(np.concatenate(selected))


def perform_KMeans_selection(
        model_full: ConvNN,
        trainloader: torch.utils.data.DataLoader,
        budget: float = 0.01
        ) -> np.ndarray:
    """Select the samples to label by clustering the model outputs with KMeans.

    The outputs of `model_full` are computed for every sample of
    `trainloader.dataset`, clustered into 10 groups, and the distance of each
    sample to its assigned cluster center is plotted (globally and per cluster,
    colored by true label). Finally a fraction `budget` of the dataset is
    selected: inside each cluster, the samples furthest from the center are
    kept, with the quota split evenly between clusters.

    Relies on the notebook-level names `mean`, `std`, `device` and `classes`.

    Args:
        model_full: network used as feature extractor. It is switched to eval
            mode for the duration of the call and restored afterwards.
        trainloader: loader whose `dataset` is clustered. The dataset is
            re-read in its natural order (the loader itself may shuffle), so
            that row i of every array below is dataset index i.
        budget: fraction of the dataset to select, in (0, 1].

    Returns:
        Sorted 1-D int array of dataset indices, usable with
        `torch.utils.data.Subset`.

    Raises:
        ValueError: if `budget` is not in (0, 1].
    """
    if not 0.0 < budget <= 1.0:
        raise ValueError(f"budget must be in (0, 1], got {budget}")

    n_clusters = 10

    # A shuffled loader yields a different order at every pass, which would
    # misalign features, predictions and labels; iterate in dataset order.
    ordered_loader = torch.utils.data.DataLoader(
        trainloader.dataset,
        batch_size=trainloader.batch_size or 1,
        shuffle=False,
        num_workers=trainloader.num_workers,
    )

    features_list = []
    y_true_list = []

    # single inference pass: no gradients, eval mode, original mode restored after
    was_training = model_full.training
    model_full.eval()
    try:
        with torch.no_grad():
            for images, labels in tqdm(ordered_loader, total=len(ordered_loader)):
                # NOTE(review): `images` is already a batch, so unsqueeze(0) adds
                # an extra leading axis, and `normalize` is called with a
                # different signature in the evaluation cells - confirm against
                # utils.utils.normalize.
                images = normalize(data=images.unsqueeze(0), mean=mean, std=std)
                images = images.to(device)

                # Forward pass
                outputs = model_full(images).detach().cpu().numpy()

                # one flat feature row per sample, whatever the output rank is
                features_list.append(outputs.reshape(len(labels), -1))
                y_true_list.append(np.asarray(labels))
    finally:
        model_full.train(was_training)

    # stack outputs and labels (same row order: dataset order)
    features = np.vstack(features_list)
    y_true_array = np.concatenate(y_true_list)

    # fit KMeans, then assign every sample to a cluster
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init='auto')
    kmeans.fit(features)
    y_pred_array = kmeans.predict(features)

    # distance between each data point and every cluster center
    distances = kmeans.transform(features)

    # distance to the assigned cluster center only
    distances_filtered = distances[np.arange(len(y_pred_array)), y_pred_array]

    # show histogram and vertical lines where X% of the data is
    plt.hist(distances_filtered, bins=100)
    plt.axvline(np.percentile(distances_filtered, 90), color='red', label='90%')
    plt.axvline(np.percentile(distances_filtered, 95), color='blue', label='95%')
    plt.axvline(np.percentile(distances_filtered, 99), color='green', label='99%')
    plt.xlabel('Distance to assigned cluster center')
    plt.ylabel('Number of data points')
    plt.title('Histogram of distances to assigned cluster center')
    plt.legend()
    plt.show()

    # for each cluster, plot the distribution of distances within the cluster,
    # split by true label
    fig, axes = plt.subplots(2, 5, figsize=(20, 10))
    for cluster_i in range(n_clusters):
        ax = axes[cluster_i // 5, cluster_i % 5]
        ax.set_title(f'Cluster: {cluster_i}')
        ax.set_xlabel('Distance to assigned cluster center')
        ax.set_ylabel('Number of data points')

        members = np.flatnonzero(y_pred_array == cluster_i)
        if members.size == 0:
            # nothing to summarise for an empty cluster
            continue

        distances_cluster_i = distances[members, cluster_i]
        cluster_labels = y_true_array[members]

        # one histogram per true label, legend entries follow `classes`
        for class_i in range(len(classes)):
            ax.hist(
                distances_cluster_i[cluster_labels == class_i],
                bins=100,
                color=f'C{class_i}',
                alpha=0.5,
            )
        ax.legend(classes)

        # mean and median vertical
        ax.axvline(np.mean(distances_cluster_i), color='red', label='mean')
        ax.axvline(np.median(distances_cluster_i), color='blue', label='median')

        # show X% of the data
        ax.axvline(np.percentile(distances_cluster_i, 90), color='red', linestyle='--', label='90%')
        ax.axvline(np.percentile(distances_cluster_i, 95), color='blue', linestyle='--', label='95%')
        ax.axvline(np.percentile(distances_cluster_i, 99), color='green', linestyle='--', label='99%')

    plt.tight_layout()
    plt.show()

    # rows are in dataset order, so the selected rows are dataset indices
    return _select_furthest_per_cluster(distances, y_pred_array, budget)

### IV.1 Fixmatch on 10% train data with KMeans Clustering 

In [26]:
# fix the seed
seedEverything()

SUBSET_PROP = 0.10

# labeled subset: indices returned by the KMeans-based selection
indices_10 = perform_KMeans_selection(model_full, trainloader, budget=SUBSET_PROP)
trainset_sup = torch.utils.data.Subset(trainset, indices_10)

# unlabeled pool: first part of a [1, 0] split of the train set
# (presumably meant to keep every training sample - TODO confirm)
trainset_unsup, _ = torch.utils.data.random_split(trainset, [1, 0])

labeled_dataloader = DataLoader(
    trainset_sup, batch_size=BATCH_SIZE, shuffle=True, num_workers=0
)

# the unlabeled batches are MU times larger than the labeled ones
unlabeled_dataloader = DataLoader(
    trainset_unsup, batch_size=MU*BATCH_SIZE, shuffle=True, num_workers=0
)



In [None]:
EPOCHS = 300

# fresh network for this run
model = ConvNN().to(device)

# criteria: two separate per-sample (reduction='none') cross-entropy losses
labeled_criterion, unlabeled_criterion = (
    nn.CrossEntropyLoss(reduction='none') for _ in range(2)
)

# optimizer: Nesterov SGD; scheduler: cosine annealing over EPOCHS steps
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=LR,
    momentum=BETA,
    weight_decay=WEIGHT_DECAY,
    nesterov=True,
)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

In [None]:
# Train on the 10% KMeans split; the results are presumably stored under the
# '10_KMEANS' run name read back by the following cell.
fixmatch_DA_train(
    # run identifier
    name='10_KMEANS',
    # model and optimisation objects
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    labeled_criterion=labeled_criterion,
    unlabeled_criterion=unlabeled_criterion,
    # data
    labeled_dataloader=labeled_dataloader,
    unlabeled_dataloader=unlabeled_dataloader,
    testloader=testloader,
    mean=mean,
    std=std,
    # hyper-parameters
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    mu=MU,
    tau=TAU,
    lambda_u=LAMBDA_U,
)

In [None]:
# open results
model_10 = torch.load('./results/models/model_DA_10_KMEANS.pth')
train_losses_10 = torch.load('./results/metrics/train_losses_DA_10_KMEANS.pth')
train_accuracies_10 = torch.load('./results/metrics/train_accuracies_DA_10_KMEANS.pth')
test_losses_10 = torch.load('./results/metrics/test_losses_DA_10_KMEANS.pth')
test_accuracies_10 = torch.load('./results/metrics/test_accuracies_DA_10_KMEANS.pth')

In [None]:
# plot losses and accuracies
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

ax1.plot(train_losses_10, label="train")
ax1.plot(test_losses_10, label="test")
ax1.set_title("Loss")

ax2.plot(train_accuracies_10, label="train")
ax2.plot(test_accuracies_10, label="test")
ax2.set_title("Accuracy")

plt.legend()
plt.show()

# save plot
fig.savefig(f"/results/figures/losses_accuracies_DA_10_KMEANS.png")

In [None]:
# plot confusion matrix
model.eval()  # Set the model to evaluation mode
test_correct = 0
test_total = 0
y_true = []
y_pred = []

with torch.no_grad():
    for data in testloader:
        images, labels = data[0].to(device), data[1].to(device)
        # normalize
        images = normalize(mean, std)(images)
        
        outputs = model_10(images)
        _, predicted = outputs.max(1)
        test_total += labels.size(0)
        test_correct += predicted.eq(labels).sum().item()

        y_true.append(labels.cpu().numpy())
        y_pred.append(predicted.cpu().numpy())
    
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)

    fig, ax = plt.subplots(figsize=(10, 10))
    cm = confusion_matrix(y_true, y_pred, normalize='true')
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
    disp.plot(ax=ax)
    plt.tight_layout()
    plt.show()

    # save plot
    fig.savefig(f"figures/confusion_matrix_DA_10_KMEANS.png")

In [None]:
# Evaluation on the test set
test_accuracy = 100.0 * test_correct / test_total
print(f'Test Accuracy: {test_accuracy}%')

test_image, test_labels = testloader.__iter__().__next__()
test_image = test_image.to(device)
test_image = normalize(mean, std)(test_image)
outputs_test = model_10(test_image)
label_pred_test = outputs_test.argmax(dim=1)

fig1 = plot_images(test_image, test_labels, label_pred_test, classes, figure_name=f"Test score with Fixmatch - {int(SUBSET_PROP*100)}% - {test_accuracy:.2f}%")
fig1.savefig(f"./figures/test_score_DA_10_KMEANS.png")

### IV.2 Fixmatch on 5% train data with KMeans Clustering

In [None]:
# fix the seed
seedEverything()

SUBSET_PROP = 0.05

# labeled subset: indices returned by the KMeans-based selection
indices_05 = perform_KMeans_selection(model_full, trainloader, budget=SUBSET_PROP)
trainset_sup = torch.utils.data.Subset(trainset, indices_05)

# unlabeled pool: first part of a [1, 0] split of the train set
# (presumably meant to keep every training sample - TODO confirm)
trainset_unsup, _ = torch.utils.data.random_split(trainset, [1, 0])

labeled_dataloader = DataLoader(
    trainset_sup, batch_size=BATCH_SIZE, shuffle=True, num_workers=0
)

# the unlabeled batches are MU times larger than the labeled ones
unlabeled_dataloader = DataLoader(
    trainset_unsup, batch_size=MU*BATCH_SIZE, shuffle=True, num_workers=0
)

In [None]:
EPOCHS = 300

# fresh network for this run
model = ConvNN().to(device)

# criteria: two separate per-sample (reduction='none') cross-entropy losses
labeled_criterion, unlabeled_criterion = (
    nn.CrossEntropyLoss(reduction='none') for _ in range(2)
)

# optimizer: Nesterov SGD; scheduler: cosine annealing over EPOCHS steps
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=LR,
    momentum=BETA,
    weight_decay=WEIGHT_DECAY,
    nesterov=True,
)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

In [None]:
# Train on the 5% KMeans split; the results are presumably stored under the
# '05_KMEANS' run name read back by the following cell.
fixmatch_DA_train(
    # run identifier
    name='05_KMEANS',
    # model and optimisation objects
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    labeled_criterion=labeled_criterion,
    unlabeled_criterion=unlabeled_criterion,
    # data
    labeled_dataloader=labeled_dataloader,
    unlabeled_dataloader=unlabeled_dataloader,
    testloader=testloader,
    mean=mean,
    std=std,
    # hyper-parameters
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    mu=MU,
    tau=TAU,
    lambda_u=LAMBDA_U,
)

In [None]:
# open results
model_05 = torch.load('./results/models/model_DA_05_KMEANS.pth')
train_losses_05 = torch.load('./results/metrics/train_losses_DA_05_KMEANS.pth')
train_accuracies_05 = torch.load('./results/metrics/train_accuracies_DA_05_KMEANS.pth')
test_losses_05 = torch.load('./results/metrics/test_losses_DA_05_KMEANS.pth')
test_accuracies_05 = torch.load('./results/metrics/test_accuracies_DA_05_KMEANS.pth')

In [None]:
# plot losses and accuracies
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

ax1.plot(train_losses_05, label="train")
ax1.plot(test_losses_05, label="test")
ax1.set_title("Loss")

ax2.plot(train_accuracies_05, label="train")
ax2.plot(test_accuracies_05, label="test")
ax2.set_title("Accuracy")

plt.legend()
plt.show()

# save plot
fig.savefig(f"/results/figures/losses_accuracies_DA_05_KMEANS.png")

In [None]:
# plot confusion matrix
model.eval()  # Set the model to evaluation mode
test_correct = 0
test_total = 0
y_true = []
y_pred = []

with torch.no_grad():
    for data in testloader:
        images, labels = data[0].to(device), data[1].to(device)
        # normalize
        images = normalize(mean, std)(images)
        
        outputs = model_05(images)
        _, predicted = outputs.max(1)
        test_total += labels.size(0)
        test_correct += predicted.eq(labels).sum().item()

        y_true.append(labels.cpu().numpy())
        y_pred.append(predicted.cpu().numpy())
    
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)

    fig, ax = plt.subplots(figsize=(10, 10))
    cm = confusion_matrix(y_true, y_pred, normalize='true')
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
    disp.plot(ax=ax)
    plt.tight_layout()
    plt.show()

    # save plot
    fig.savefig(f"figures/confusion_matrix_DA_05_KMEANS.png")

In [None]:
# Evaluation on the test set
test_accuracy = 100.0 * test_correct / test_total
print(f'Test Accuracy: {test_accuracy}%')

test_image, test_labels = testloader.__iter__().__next__()
test_image = test_image.to(device)
test_image = normalize(mean, std)(test_image)
outputs_test = model_05(test_image)
label_pred_test = outputs_test.argmax(dim=1)

fig1 = plot_images(test_image, test_labels, label_pred_test, classes, figure_name=f"Test score with Fixmatch - {int(SUBSET_PROP*100)}% - {test_accuracy:.2f}%")
fig1.savefig(f"./figures/test_score_DA_05_KMEANS.png")

### IV.3 Fixmatch on 1% train data with KMeans Clustering

In [None]:
# fix the seed
seedEverything()

SUBSET_PROP = 0.01

# labeled subset: indices returned by the KMeans-based selection
indices_01 = perform_KMeans_selection(model_full, trainloader, budget=SUBSET_PROP)
trainset_sup = torch.utils.data.Subset(trainset, indices_01)

# unlabeled pool: first part of a [1, 0] split of the train set
# (presumably meant to keep every training sample - TODO confirm)
trainset_unsup, _ = torch.utils.data.random_split(trainset, [1, 0])

labeled_dataloader = DataLoader(
    trainset_sup, batch_size=BATCH_SIZE, shuffle=True, num_workers=0
)

# the unlabeled batches are MU times larger than the labeled ones
unlabeled_dataloader = DataLoader(
    trainset_unsup, batch_size=MU*BATCH_SIZE, shuffle=True, num_workers=0
)

In [None]:
EPOCHS = 300

# fresh network for this run
model = ConvNN().to(device)

# criteria: two separate per-sample (reduction='none') cross-entropy losses
labeled_criterion, unlabeled_criterion = (
    nn.CrossEntropyLoss(reduction='none') for _ in range(2)
)

# optimizer: Nesterov SGD; scheduler: cosine annealing over EPOCHS steps
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=LR,
    momentum=BETA,
    weight_decay=WEIGHT_DECAY,
    nesterov=True,
)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

In [None]:
# Train on the 1% KMeans split; the results are presumably stored under the
# '01_KMEANS' run name read back by the following cell.
fixmatch_DA_train(
    # run identifier
    name='01_KMEANS',
    # model and optimisation objects
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    labeled_criterion=labeled_criterion,
    unlabeled_criterion=unlabeled_criterion,
    # data
    labeled_dataloader=labeled_dataloader,
    unlabeled_dataloader=unlabeled_dataloader,
    testloader=testloader,
    mean=mean,
    std=std,
    # hyper-parameters
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    mu=MU,
    tau=TAU,
    lambda_u=LAMBDA_U,
)

In [None]:
# open results
model_01 = torch.load('./results/models/model_DA_01_KMEANS.pth')
train_losses_01 = torch.load('./results/metrics/train_losses_DA_01_KMEANS.pth')
train_accuracies_01 = torch.load('./results/metrics/train_accuracies_DA_01_KMEANS.pth')
test_losses_01 = torch.load('./results/metrics/test_losses_DA_01_KMEANS.pth')
test_accuracies_01 = torch.load('./results/metrics/test_accuracies_DA_01_KMEANS.pth')

In [None]:
# plot losses and accuracies
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

ax1.plot(train_losses_01, label="train")
ax1.plot(test_losses_01, label="test")
ax1.set_title("Loss")

ax2.plot(train_accuracies_01, label="train")
ax2.plot(test_accuracies_01, label="test")
ax2.set_title("Accuracy")

plt.legend()
plt.show()

# save plot
fig.savefig(f"/results/figures/losses_accuracies_DA_01_KMEANS.png")

In [None]:
# plot confusion matrix
model.eval()  # Set the model to evaluation mode
test_correct = 0
test_total = 0
y_true = []
y_pred = []

with torch.no_grad():
    for data in testloader:
        images, labels = data[0].to(device), data[1].to(device)
        # normalize
        images = normalize(mean, std)(images)
        
        outputs = model_01(images)
        _, predicted = outputs.max(1)
        test_total += labels.size(0)
        test_correct += predicted.eq(labels).sum().item()

        y_true.append(labels.cpu().numpy())
        y_pred.append(predicted.cpu().numpy())
    
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)

    fig, ax = plt.subplots(figsize=(10, 10))
    cm = confusion_matrix(y_true, y_pred, normalize='true')
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
    disp.plot(ax=ax)
    plt.tight_layout()
    plt.show()

    # save plot
    fig.savefig(f"figures/confusion_matrix_DA_01_KMEANS.png")

In [None]:
# Evaluation on the test set
test_accuracy = 100.0 * test_correct / test_total
print(f'Test Accuracy: {test_accuracy}%')

test_image, test_labels = testloader.__iter__().__next__()
test_image = test_image.to(device)
test_image = normalize(mean, std)(test_image)
outputs_test = model_01(test_image)
label_pred_test = outputs_test.argmax(dim=1)

fig1 = plot_images(test_image, test_labels, label_pred_test, classes, figure_name=f"Test score with Fixmatch - {int(SUBSET_PROP*100)}% - {test_accuracy:.2f}%")
fig1.savefig(f"./figures/test_score_DA_01_KMEANS.png")