### General Settings

Change the settings below to match your environment before running the notebook.

Pass `limit_train_batches`, `limit_val_batches` and `limit_test_batches` to `train_and_test` as required to restrict how many batches each run uses.

In [1]:
# Root of the project checkout; run results are written beneath it.
project_dir = '/Users/rajjain/PycharmProjects/ADRL-Course-Work/'
# Directory handed to the CIFAR10 dataset class as its data root.
cifar_data_dir = '/Users/rajjain/Desktop/CourseWork/Cifar/'
# When True, the Trainer is created with accelerator="gpu".
use_gpu = False
# Number of DataLoader workers used by the classifier data loaders.
num_cpus = 2

## Imports

In [2]:
from torch.nn import Linear, Sequential, Flatten, Module, init, CrossEntropyLoss, ReLU, BCEWithLogitsLoss
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau
from pytorch_lightning.utilities.seed import seed_everything
from torchmetrics.functional.classification import accuracy
from torch.utils.data.sampler import SubsetRandomSampler
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning import LightningModule
from torch.nn.functional import normalize
from torchvision.datasets import CIFAR10
from torchvision.models import resnet50
from torch.utils.data import DataLoader
from pytorch_lightning import Trainer
from typing import Callable, Optional
from torchvision import transforms
from torch.optim import SGD, Adam
from torchinfo import summary
from datetime import datetime
import torch
import numpy
import gc
import os

## Helpers

In [3]:
# Label remappings for the coarser tasks: position i of each tuple is the new
# label assigned to original CIFAR-10 class index i.
five_class_mapper = dict(enumerate((0, 1, 2, 3, 4, 3, 4, 2, 0, 1)))

two_class_mapper = dict(enumerate((0, 0, 1, 1, 1, 1, 1, 0, 0, 0)))

# Models

## Base Models

In [4]:
class FeatureExtractor(Module):
    """Pretrained ResNet-50 with its last child module dropped, used as a CIFAR-10 feature extractor."""

    def __init__(self):
        super().__init__()
        # Keep every child of the backbone except the final one, then flatten
        # the result into one vector per image.
        *trunk, _last = resnet50(pretrained=True).children()
        self.model = Sequential(*trunk, Flatten())

    def initialise(self):
        """Do nothing; the backbone keeps the weights it was constructed with."""
        return None

    def forward(self, x):
        embedding = self.model(x)  # 2048 dim vectors
        return embedding


class OutputLayer(Module):
    """Linear head mapping 2048-dim features to class outputs.

    For more than two classes the head has one output per class; otherwise it
    has a single output.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.num_classes = num_classes
        if num_classes > 2:
            out_features = num_classes
        else:
            out_features = 1
        self.model = Sequential(Linear(in_features=2048, out_features=out_features))

    def initialise(self):
        """Re-initialise the linear layer's weight with Xavier-uniform values."""
        linear = self.model[0]
        init.xavier_uniform_(linear.weight)

    def forward(self, x):
        outputs = self.model(x)
        return outputs


class Classifier(LightningModule):
    """CIFAR-10 classifier made of a FeatureExtractor followed by an OutputLayer.

    The output layer is always trained. The feature extractor is trained only
    when no pre-trained extractor is supplied; otherwise the supplied weights
    are copied into a fresh extractor and frozen.

    Args:
        feature_extractor: Extractor whose weights are copied and frozen, or
            ``None`` to train a new extractor together with the output layer.
        num_classes: Number of target classes. With two classes the model
            emits a single logit and is trained with ``BCEWithLogitsLoss``;
            with more it is trained with ``CrossEntropyLoss``.
        target_transform: Optional callable applied to every CIFAR-10 label
            (forwarded to the dataset as ``target_transform``).
        bs: Batch size used by all data loaders.
    """

    def __init__(self, feature_extractor: Optional[FeatureExtractor], num_classes: int,
                 target_transform: Optional[Callable], bs: int):
        super(Classifier, self).__init__()
        self.save_hyperparameters()
        self.target_transform = target_transform
        self.bs = bs
        self.learning_rate = 0.01

        seed_everything(0)

        self.output_layer = OutputLayer(num_classes)  # always train the output layer
        self.output_layer.initialise()

        # Explicit None check: an nn.Module's truthiness is not a reliable
        # "was an extractor supplied" signal.
        if feature_extractor is not None:  # if provided, don't train
            self.train_features = False
            self.feature_extractor = FeatureExtractor()
            self.feature_extractor.load_state_dict(feature_extractor.state_dict())
            self.feature_extractor.requires_grad_(False)
        else:
            self.train_features = True
            self.feature_extractor = FeatureExtractor()
            self.feature_extractor.initialise()  # by default requires grad is true here

        self.float()

    def forward(self, x):
        """Return the output-layer logits for a batch of images."""
        features = self.feature_extractor(x)
        return self.output_layer(features)

    def _common_step(self, batch, btype):
        """Compute and log loss and macro accuracy for one batch.

        Args:
            batch: ``(images, labels)`` pair from a data loader.
            btype: Stage name used as the logging prefix
                (``'train'``, ``'val'`` or ``'test'``).

        Returns:
            The loss tensor for the batch.
        """
        not_training = btype != 'train'
        x, y = batch
        y_hat = self(x)
        num_classes = self.output_layer.num_classes
        if num_classes > 2:
            loss = CrossEntropyLoss()(y_hat, y)
            preds = y_hat
        else:
            binary_logits = y_hat[:, 0]
            loss = BCEWithLogitsLoss()(binary_logits, y.float())
            # The binary head emits one logit, so it cannot be passed to
            # `accuracy` as per-class scores. Threshold it at 0 (sigmoid 0.5)
            # to obtain hard 0/1 predictions instead.
            preds = (binary_logits > 0).long()
        acc = accuracy(preds, y, average='macro', num_classes=num_classes, multiclass=True)
        self.log(f'{btype}/loss', loss, on_step=False, on_epoch=True, sync_dist=not_training)
        self.log(f'{btype}/acc', acc, on_step=False, on_epoch=True, sync_dist=not_training)
        return loss

    def training_step(self, batch, batch_idx):
        self.feature_extractor.train(self.train_features)  # Train feature extractor only if to be trained
        return self._common_step(batch, 'train')

    def validation_step(self, batch, batch_idx):
        self._common_step(batch, 'val')

    def test_step(self, batch, batch_idx):
        self._common_step(batch, 'test')

    def train_dataloader(self):
        """Return a shuffled loader over the CIFAR-10 training split with random horizontal flips."""
        ds = CIFAR10(cifar_data_dir, train=True,
                     transform=transforms.Compose([
                         transforms.RandomHorizontalFlip(),
                         transforms.ToTensor(),  # Gives a scaled version i.e., 0 to 1
                         transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
                     ]),
                     target_transform=self.target_transform)
        dl = DataLoader(ds, self.bs, shuffle=True, num_workers=num_cpus)
        return dl

    def eval_dataloader(self):
        """Return an unshuffled, unaugmented loader over the CIFAR-10 ``train=False`` split."""
        ds = CIFAR10(cifar_data_dir, train=False,
                     transform=transforms.Compose([
                         transforms.ToTensor(),  # Gives a scaled version i.e., 0 to 1
                         transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
                     ]),
                     target_transform=self.target_transform)
        dl = DataLoader(ds, self.bs, shuffle=False, num_workers=num_cpus)
        return dl

    def val_dataloader(self):
        # Validation and test both use the same held-out split.
        return self.eval_dataloader()

    def test_dataloader(self):
        return self.eval_dataloader()

    def configure_optimizers(self):
        """Return Adam plus a ReduceLROnPlateau scheduler driven by ``val/loss``."""
        optimizer = Adam(self.parameters(), lr=self.learning_rate)
        # CyclicLR(optimizer, base_lr=0.0001, max_lr=0.01, step_size_up=10, mode='triangular2',
        #          cycle_momentum=False)
        # CosineAnnealingLR(optimizer, T_max=self.trainer.max_epochs)
        # ExponentialLR(optimizer, gamma=0.9)
        lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, threshold=1e-4,
                                         threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-8)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': lr_scheduler,
                'interval': 'epoch',
                'monitor': 'val/loss',
                'name': 'learning_rate',
            }
        }

    def summary(self) -> str:
        """Return a torchinfo summary of the model for a dummy batch of ten 3x32x32 images."""
        summary_kwargs = dict(dtypes=[torch.float], depth=3, col_names=['input_size', 'output_size', 'num_params'],
                              row_settings=['depth', 'var_names'], verbose=0, device=torch.device('cpu'))
        imgs = torch.randn((10, 3, 32, 32), dtype=torch.float)
        summary_string = str(summary(model=self, input_data=imgs, **summary_kwargs))
        return summary_string


In [5]:
# Sanity check: build a 10-class classifier with a trainable feature extractor
# and print its layer summary.
print(
    Classifier(feature_extractor=None, num_classes=10, target_transform=None, bs=2).summary()
)

Global seed set to 0


Layer (type (var_name):depth-idx)                            Input Shape               Output Shape              Param #
Classifier (Classifier)                                      [10, 3, 32, 32]           [10, 10]                  --
├─FeatureExtractor (feature_extractor): 1-1                  [10, 3, 32, 32]           [10, 2048]                --
│    └─Sequential (model): 2-1                               [10, 3, 32, 32]           [10, 2048]                --
│    │    └─Conv2d (0): 3-1                                  [10, 3, 32, 32]           [10, 64, 16, 16]          9,408
│    │    └─BatchNorm2d (1): 3-2                             [10, 64, 16, 16]          [10, 64, 16, 16]          128
│    │    └─ReLU (2): 3-3                                    [10, 64, 16, 16]          [10, 64, 16, 16]          --
│    │    └─MaxPool2d (3): 3-4                               [10, 64, 16, 16]          [10, 64, 8, 8]            --
│    │    └─Sequential (4): 3-5                              [1

## MoCo Models

In [6]:
class TrainingTransforms:
    """Turn one image into a (query, key) pair of independently augmented tensors."""

    def __init__(self):
        augmentations = [
            transforms.RandomResizedCrop(32, scale=(0.2, 1.0)),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.1),
            transforms.RandomApply([transforms.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5))], p=0.8),
            transforms.RandomRotation(degrees=15),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        ]
        self.transform = transforms.Compose(augmentations)

    def __call__(self, inp):
        # The pipeline is applied twice (query first, then key), so each view
        # gets its own random augmentation draw.
        return self.transform(inp), self.transform(inp)


class TestTransforms:
    """Turn one image into a (query, key) pair using only tensor conversion and normalisation."""

    def __init__(self):
        steps = [
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        ]
        self.transform = transforms.Compose(steps)

    def __call__(self, inp):
        # No random augmentation here; the pipeline is still run once per view
        # so the output has the same (query, key) structure as at training time.
        return self.transform(inp), self.transform(inp)


class MoCo(LightningModule):
    """Momentum Contrast pre-training, inspired by the original Facebook implementation.

    Multiple GPUs are not supported. A query encoder is trained by gradient
    descent while a key encoder follows it via a momentum update; keys from
    previous batches are kept in fixed-size queues (one for training, one for
    validation) and act as negatives.

    Args:
        dict_size: Number of keys held in each queue. Must be a multiple of
            the batch size.
        bs: Batch size used by the data loaders.
    """

    m = 0.999  # moco momentum
    tau = 0.07  # softmax temperature

    def __init__(self, dict_size: int, bs: int):
        super().__init__()
        self.save_hyperparameters()

        self.dict_size = dict_size
        self.bs = bs

        # FeatureExtractor will work as encoder. This gives a 2048-dim vector as output
        self.encoder_q = FeatureExtractor()
        self.encoder_k = FeatureExtractor()

        seed_everything(0)

        # The key encoder starts as a frozen copy of the query encoder.
        self.encoder_q.initialise()
        self.encoder_k.load_state_dict(self.encoder_q.state_dict())
        self.encoder_k.requires_grad_(False)

        # Queues start as random unit-norm columns, one 2048-dim key per column.
        self.register_buffer("train_queue", torch.randn(2048, dict_size))
        self.train_queue = normalize(self.train_queue, dim=0)
        self.register_buffer("train_queue_ptr", torch.zeros(1, dtype=torch.long))

        self.register_buffer("val_queue", torch.randn(2048, dict_size))
        self.val_queue = normalize(self.val_queue, dim=0)
        self.register_buffer("val_queue_ptr", torch.zeros(1, dtype=torch.long))

    @torch.no_grad()
    def _momentum_update_key_encoder(self):
        """Momentum update of the key encoder"""
        for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()):
            param_k.data = param_k.data * self.m + param_q.data * (1. - self.m)

    @torch.no_grad()
    def _dequeue_and_enqueue(self, keys, queue, queue_ptr):
        """Overwrite the oldest queue columns with ``keys`` and advance the pointer.

        Args:
            keys: Key features, one row per sample.
            queue: Queue buffer that is updated in place.
            queue_ptr: One-element tensor holding the next write position;
                updated in place.
        """
        batch_size = keys.shape[0]
        ptr = int(queue_ptr)
        # for simplicity: with this constraint a write never wraps around the queue end
        assert self.dict_size % batch_size == 0, 'dict_size must be a multiple of the batch size'
        queue[:, ptr:ptr + batch_size] = keys.T  # replace the keys at ptr (dequeue and enqueue)
        ptr = (ptr + batch_size) % self.dict_size  # move pointer
        queue_ptr[0] = ptr

    def forward(self, query_imgs, key_imgs, queue):
        """Compute contrastive logits for a batch of (query, key) views.

        Args:
            query_imgs: Batch of query views.
            key_imgs: Batch of key views.
            queue: Queue of negative keys; a detached copy is used.

        Returns:
            ``(logits, labels, k)`` where ``logits`` is Nx(1+K) with the
            positive logit in column 0, ``labels`` is an all-zero long tensor
            marking that column as the target, and ``k`` holds the normalised
            key features.
        """
        # compute query features
        q = self.encoder_q(query_imgs)  # queries: NxC
        q = normalize(q, dim=1)

        # compute key features
        with torch.no_grad():  # no gradient to keys
            k = self.encoder_k(key_imgs)  # keys: NxC
            k = normalize(k, dim=1)

        # compute logits - Einstein sum is more intuitive
        l_pos = torch.einsum('nc,nc->n', [q, k]).unsqueeze(-1)  # positive logits: Nx1
        l_neg = torch.einsum('nc,ck->nk', [q, queue.clone().detach()])  # negative logits: NxK
        logits = torch.cat([l_pos, l_neg], dim=1)  # logits: Nx(1+K)
        logits /= self.tau  # apply temperature
        labels = torch.zeros(logits.shape[0], dtype=torch.long, device=self.device)  # labels: positive key indicators

        return logits, labels, k

    def _shared_step(self, batch, queue, queue_ptr, stage: str, sync_dist: bool):
        """Run one contrastive step against ``queue`` and log loss and accuracy under ``stage``."""
        (query_imgs, key_imgs), _ = batch
        output, target, keys = self(query_imgs, key_imgs, queue)
        self._dequeue_and_enqueue(keys, queue, queue_ptr)

        loss = CrossEntropyLoss()(output, target)
        # Keep the accuracy as a tensor on the model's device; going through
        # NumPy would force a host copy on every step.
        acc = (output.argmax(dim=1) == target).float().mean()
        self.log(f'{stage}/loss', loss, on_step=False, on_epoch=True, sync_dist=sync_dist)
        self.log(f'{stage}/acc', acc, on_step=False, on_epoch=True, sync_dist=sync_dist)
        return loss

    def training_step(self, batch, batch_idx):
        self._momentum_update_key_encoder()  # update the key encoder only during training
        return self._shared_step(batch, self.train_queue, self.train_queue_ptr, 'train', sync_dist=False)

    def validation_step(self, batch, batch_idx):
        return self._shared_step(batch, self.val_queue, self.val_queue_ptr, 'val', sync_dist=True)

    def configure_optimizers(self):
        """Return SGD with cosine annealing over the trainer's ``max_epochs``."""
        optimizer = SGD(self.parameters(), lr=0.03, momentum=0.9, weight_decay=1e-4)
        scheduler = CosineAnnealingLR(optimizer, self.trainer.max_epochs)
        return [optimizer], [scheduler]

    def train_dataloader(self):
        ds = CIFAR10(cifar_data_dir, train=True, transform=TrainingTransforms())
        # drop_last keeps every batch at full size, as the queue update requires.
        dl = DataLoader(ds, self.bs, shuffle=True, num_workers=0, drop_last=True)
        return dl

    def val_dataloader(self):
        ds = CIFAR10(cifar_data_dir, train=False, transform=TestTransforms())
        dl = DataLoader(ds, self.bs, shuffle=True, num_workers=0, drop_last=True)
        return dl

    def summary(self) -> str:
        """Return an empty string; no layer summary is produced for this model."""
        return ''


class LinearClassifier(LightningModule):
    """Trainable 10-way output layer on top of a frozen copy of a feature extractor.

    Args:
        feature_extractor: Extractor whose weights are copied and frozen.
        train_fraction: Fraction of the CIFAR-10 training split used for training.
        bs: Batch size used by the data loaders.
    """

    def __init__(self, feature_extractor: FeatureExtractor, train_fraction: float, bs: int):
        super().__init__()
        self.save_hyperparameters()
        self.learning_rate = 0.01
        self.bs = bs
        self.train_fraction = train_fraction

        seed_everything(0)

        head = OutputLayer(10)
        head.initialise()
        self.output_layer = head

        # Copy the given weights into a fresh extractor and freeze them.
        frozen = FeatureExtractor()
        frozen.load_state_dict(feature_extractor.state_dict())
        frozen.requires_grad_(False)
        self.feature_extractor = frozen

    def forward(self, x):
        return self.output_layer(self.feature_extractor(x))

    def _common_step(self, batch, btype):
        """Compute cross-entropy loss and macro accuracy for a batch and log both under ``btype``."""
        inputs, targets = batch
        logits = self(inputs)
        loss = CrossEntropyLoss()(logits, targets)
        acc = accuracy(logits, targets, average='macro', num_classes=self.output_layer.num_classes, multiclass=True)
        log_kwargs = dict(on_step=False, on_epoch=True, sync_dist=(btype != 'train'))
        self.log(f'{btype}/loss', loss, **log_kwargs)
        self.log(f'{btype}/acc', acc, **log_kwargs)
        return loss

    def training_step(self, batch, batch_idx):
        # The extractor is frozen, so keep it in eval mode while the head trains.
        self.feature_extractor.eval()
        return self._common_step(batch, 'train')

    def validation_step(self, batch, batch_idx):
        self._common_step(batch, 'val')

    def train_dataloader(self):
        """Return a loader over a fixed random subset of the CIFAR-10 training split."""
        augment = transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        ])
        ds = CIFAR10(cifar_data_dir, train=True, transform=augment)
        n_total = len(ds)
        n_keep = int(n_total * self.train_fraction)
        # Seeded generator: the same subset of indices is chosen on every call.
        subset_idxs = numpy.random.default_rng(0).permutation(n_total)[:n_keep]
        return DataLoader(ds, self.bs, num_workers=num_cpus, sampler=SubsetRandomSampler(subset_idxs))

    def val_dataloader(self):
        """Return an unshuffled loader over the CIFAR-10 ``train=False`` split."""
        plain = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        ])
        ds = CIFAR10(cifar_data_dir, train=False, transform=plain)
        return DataLoader(ds, self.bs, shuffle=False, num_workers=num_cpus)

    def configure_optimizers(self):
        """Return Adam plus a ReduceLROnPlateau scheduler driven by ``val/loss``."""
        optimizer = Adam(self.parameters(), lr=self.learning_rate)
        plateau = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, threshold=1e-4,
                                    threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-8)
        scheduler_config = {
            'scheduler': plateau,
            'interval': 'epoch',
            'monitor': 'val/loss',
            'name': 'learning_rate',
        }
        return {'optimizer': optimizer, 'lr_scheduler': scheduler_config}

    def summary(self) -> str:
        """Return a torchinfo summary of the model for a dummy batch of ten 3x32x32 images."""
        dummy_imgs = torch.randn((10, 3, 32, 32), dtype=torch.float)
        report = summary(
            model=self,
            input_data=dummy_imgs,
            dtypes=[torch.float],
            depth=3,
            col_names=['input_size', 'output_size', 'num_params'],
            row_settings=['depth', 'var_names'],
            verbose=0,
            device=torch.device('cpu'),
        )
        return str(report)


# Train and Test

In [7]:
def train_and_test(max_epochs: int, tags: list[str], gpu_num: list[int],
                   model_class, model_kwargs: dict, model_desc: str,
                   limit_train_batches=1.0, limit_val_batches=1.0, limit_test_batches=1.0):
    """Fit a model in a fresh timestamped results folder and return the folder name.

    The best (lowest ``val/loss``) and last checkpoints, the TensorBoard logs
    and a ``model_desc.md`` file are written to
    ``<project_dir>self_supervised/results/<folder_name>/``.

    Note: only ``trainer.fit`` is called here; no test loop is run, and
    ``tags`` is currently unused.

    Args:
        max_epochs: Number of epochs to train for.
        tags: Unused.
        gpu_num: Device list passed to the Trainer when ``use_gpu`` is set.
        model_class: Class of the model to build.
        model_kwargs: Keyword arguments passed to ``model_class``.
        model_desc: Free-text description appended to the model summary file.
        limit_train_batches: Forwarded to the Trainer.
        limit_val_batches: Forwarded to the Trainer.
        limit_test_batches: Forwarded to the Trainer.

    Returns:
        Name of the created results folder (``run_<UTC timestamp>``).
    """
    seed_everything(0, workers=True)

    timestamp = datetime.utcnow().isoformat(sep="T", timespec="microseconds")
    folder_name = f'run_{timestamp}'
    results_dir = f'{project_dir}self_supervised/results/{folder_name}/'
    # exist_ok=False: fail rather than write into an existing run folder.
    os.makedirs(results_dir, exist_ok=False)

    checkpoint_callback = ModelCheckpoint(monitor='val/loss', mode='min', dirpath=results_dir, filename='best',
                                          save_last=True)

    model = model_class(**model_kwargs)

    if use_gpu:
        trainer_kwargs = dict(accelerator="gpu", devices=gpu_num)
    else:
        trainer_kwargs = dict()
    tf_logger = TensorBoardLogger(save_dir=results_dir, version='tf_logs', default_hp_metric=False)
    trainer = Trainer(
        default_root_dir=results_dir,
        max_epochs=max_epochs,
        callbacks=[checkpoint_callback],
        logger=[tf_logger],
        log_every_n_steps=1,
        num_sanity_val_steps=0,
        deterministic=True,
        limit_train_batches=limit_train_batches,
        limit_val_batches=limit_val_batches,
        limit_test_batches=limit_test_batches,
        **trainer_kwargs,
    )
    trainer.fit(model)

    description = model.summary() + '\n' + model_desc
    with open(results_dir + 'model_desc.md', 'w') as desc_file:
        desc_file.write(description)

    gc.collect()
    return folder_name


## Base Model Runs

In [8]:
# Train the 10-class baseline: feature extractor and output layer together.
base_dir = train_and_test(
    max_epochs=2, tags=[], gpu_num=[], model_class=Classifier,
    model_kwargs=dict(feature_extractor=None, num_classes=10, target_transform=None, bs=10),
    model_desc='Base Model',
    limit_train_batches=2, limit_val_batches=2, limit_test_batches=2,
)

base_model = Classifier.load_from_checkpoint(f'{project_dir}self_supervised/results/{base_dir}/best.ckpt')

# Reuse the baseline's feature extractor (frozen) for the 5-class and 2-class tasks.
for coarse_classes, coarse_mapper, coarse_desc in (
        (5, five_class_mapper, '5-class Model'),
        (2, two_class_mapper, '2-class Model'),
):
    _ = train_and_test(
        max_epochs=2, tags=[], gpu_num=[], model_class=Classifier,
        model_kwargs=dict(feature_extractor=base_model.feature_extractor, num_classes=coarse_classes,
                          target_transform=coarse_mapper.get, bs=10),
        model_desc=coarse_desc,
        limit_train_batches=2, limit_val_batches=2, limit_test_batches=2,
    )


Global seed set to 0
Global seed set to 0
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name              | Type             | Params
-------------------------------------------------------
0 | output_layer      | OutputLayer      | 20.5 K
1 | feature_extractor | FeatureExtractor | 23.5 M
-------------------------------------------------------
23.5 M    Trainable params
0         Non-trainable params
23.5 M    Total params
94.114    Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Global seed set to 0
Global seed set to 0
  rank_zero_warn(
Global seed set to 0
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name              | Type             | Params
-------------------------------------------------------
0 | output_layer      | OutputLayer      | 10.2 K
1 | feature_extractor | FeatureExtractor | 23.5 M
-------------------------------------------------------
10.2 K    Trainable params
23.5 M    Non-trainable params
23.5 M    Total params
94.073    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Global seed set to 0
Global seed set to 0
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name              | Type             | Params
-------------------------------------------------------
0 | output_layer      | OutputLayer      | 2.0 K 
1 | feature_extractor | FeatureExtractor | 23.5 M
-------------------------------------------------------
2.0 K     Trainable params
23.5 M    Non-trainable params
23.5 M    Total params
94.040    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

## MoCo Model Runs

In [9]:
# Number of keys held in each MoCo queue (passed to MoCo as dict_size).
dict_size = 40
# Fraction of the CIFAR-10 training split used by LinearClassifier.
train_fraction = 0.1

In [10]:
# Stage 1: contrastive pre-training of the MoCo encoders.
moco_dir = train_and_test(
    max_epochs=2, tags=[], gpu_num=[], model_class=MoCo,
    model_kwargs=dict(dict_size=dict_size, bs=10),
    model_desc='MoCo Model Training',
    limit_train_batches=2, limit_val_batches=2, limit_test_batches=2,
)

# Stage 2: train a linear classifier on the frozen query encoder of the best checkpoint.
moco_ckpt = f'{project_dir}self_supervised/results/{moco_dir}/best.ckpt'
moco_model = MoCo.load_from_checkpoint(moco_ckpt)
_ = train_and_test(
    max_epochs=2, tags=[], gpu_num=[], model_class=LinearClassifier,
    model_kwargs=dict(feature_extractor=moco_model.encoder_q, train_fraction=train_fraction, bs=10),
    model_desc='MoCo Classifier Training',
    limit_train_batches=2, limit_val_batches=2, limit_test_batches=2,
)


Global seed set to 0
Global seed set to 0
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name      | Type             | Params
-----------------------------------------------
0 | encoder_q | FeatureExtractor | 23.5 M
1 | encoder_k | FeatureExtractor | 23.5 M
-----------------------------------------------
23.5 M    Trainable params
23.5 M    Non-trainable params
47.0 M    Total params
188.064   Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Global seed set to 0
Global seed set to 0
Global seed set to 0
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name              | Type             | Params
-------------------------------------------------------
0 | output_layer      | OutputLayer      | 20.5 K
1 | feature_extractor | FeatureExtractor | 23.5 M
-------------------------------------------------------
20.5 K    Trainable params
23.5 M    Non-trainable params
23.5 M    Total params
94.114    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

# t-SNE Plot

In [12]:
# Class index -> human-readable class name, used to label the plot.
mapper = dict(enumerate((
    'airplane',
    'automobile',
    'bird',
    'cat',
    'deer',
    'dog',
    'frog',
    'horse',
    'ship',
    'truck',
)))


def tsne_plot(model_dir, target_transform, num_classes):
    """Show a 2-D t-SNE scatter plot of a trained Classifier's features.

    Features are extracted from the CIFAR-10 ``train=False`` split with the
    best checkpoint found under ``model_dir`` and coloured by true class.

    Args:
        model_dir: Results folder name (as returned by ``train_and_test``)
            containing ``best.ckpt``.
        target_transform: Optional callable applied to every label; forwarded
            to the dataset.
        num_classes: Number of classes the checkpointed model was trained
            with. With 10 classes the points are labelled by class name,
            otherwise by the stringified label.

    Raises:
        AssertionError: If ``num_classes`` differs from the checkpoint's.
    """
    # NOTE(review): `tqdm`, `TSNE` and `px` are not imported in the visible
    # cells — confirm they are imported elsewhere before running this.
    bs = 50

    ds = CIFAR10(cifar_data_dir, train=False,
                 transform=transforms.Compose([
                     transforms.ToTensor(),  # Gives a scaled version i.e., 0 to 1
                     transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
                 ]),
                 target_transform=target_transform)
    dl = DataLoader(ds, bs, shuffle=False, num_workers=num_cpus)

    model = Classifier.load_from_checkpoint(project_dir + f'self_supervised/results/{model_dir}/best.ckpt')
    assert num_classes == model.output_layer.num_classes, 'num_classes does not match the checkpointed model'
    model.eval()

    all_features, all_ys = [], []
    # Inference only: disabling autograd avoids building a graph through the
    # extractor for every batch.
    with torch.no_grad():
        for x, y in tqdm(dl):  # passing the loader itself lets tqdm show the total
            features = model.feature_extractor(x)
            all_features.append(features.numpy())
            all_ys.append(y.numpy())

    final_features = numpy.concatenate(all_features)
    final_y = numpy.concatenate(all_ys)
    if num_classes == 10:
        final_y = list(map(mapper.get, final_y))
        order = list(map(mapper.get, range(num_classes)))
    else:
        final_y = list(map(str, final_y))
        order = list(map(str, range(num_classes)))

    tsne = TSNE(n_components=2, random_state=0, init='pca', learning_rate='auto')
    projections = tsne.fit_transform(final_features)
    fig = px.scatter(projections, x=0, y=1, color=final_y,
                     labels={'color': 'True Class', '0': 'Dimension 1', '1': 'Dimension 2'},
                     category_orders={'color': order})
    fig.update_layout(width=850, height=850, font_color="black", font_size=15)
    fig.update_xaxes(title_standoff=0)
    fig.update_yaxes(title_standoff=0)
    fig.show()