In [1]:
from google.colab import files
files.upload()  # Upload kaggle.json when prompted

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"mt0010","key":"bc8eb0133e92bbc5e2f73f99ce5118e0"}'}

In [2]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d wilyzh/imagenet100 --unzip
!ls imagenet100

Dataset URL: https://www.kaggle.com/datasets/wilyzh/imagenet100
License(s): unknown
Downloading imagenet100.zip to /content
 99% 13.9G/14.1G [00:46<00:00, 144MB/s] 
100% 14.1G/14.1G [00:46<00:00, 326MB/s]
ls: cannot access 'imagenet100': No such file or directory


In [3]:
from torchvision import datasets, transforms, models
train_full = datasets.ImageFolder(root='ImageNet100/train')
val_full = datasets.ImageFolder(root='ImageNet100/val')

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import copy
from typing import Any, Dict, List, Tuple
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader, Dataset,TensorDataset
import os
import random
import numpy as np
from PIL import Image
from torch.amp import autocast, GradScaler
from typing import Any, Dict

# Configuration
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
INPUT_SIZE = 224  #  image size
BATCH_SIZE = 16  # 4 rotations × 16 = 66 processed images
NUM_EPOCHS_PER_TASK = 20
LEARNING_RATE = 1e-4
LAMBDA_CASSLE = 3 # Weight for CaSSle loss
NUM_CLASSES_PER_TASK = 10
NUM_TOTAL_CLASSES = 100  # Total  classes
NUM_ROT_CLASSES = 4  # 0°, 90°, 180°, 270°
LINEAR_EVAL_EPOCHS = 5
LINEAR_EVAL_BATCH_SIZE = 128

torch.cuda.empty_cache()

class RotNetImageNet100Dataset(Dataset):
    def __init__(self, imagenet_dataset, class_list, base_transform):
        self.imagenet_dataset = imagenet_dataset
        self.class_set = set(class_list)
        self.valid_indices = []

        for i, (_, label) in enumerate(imagenet_dataset.imgs):  # Use imgs attribute directly
            if label in self.class_set:
                self.valid_indices.append(i)

        self.base_transform = base_transform

    def __len__(self):
        return len(self.valid_indices)

    def __getitem__(self, idx):
        real_idx = self.valid_indices[idx]
        img_path, original_label = self.imagenet_dataset.imgs[real_idx]
        img = Image.open(img_path).convert('RGB')

        rotated_imgs = []
        rotation_labels = []

        for angle, rot_label in zip([0, 90, 180, 270], range(4)):
            rotated_img = transforms.functional.rotate(img, angle)
            rotated_img = self.base_transform(rotated_img)
            rotated_imgs.append(rotated_img)
            rotation_labels.append(torch.tensor(rot_label, dtype=torch.long))

        return torch.stack(rotated_imgs), torch.stack(rotation_labels), original_label


# --- RotNet Model using Custom Backbone (no changes needed here) ---
class RotNetModel(nn.Module):
    def __init__(self, num_rot_classes: int = 4, backbone: str = 'pretrained_resnet18'):
        super().__init__()

        if backbone == 'pretrained_resnet18':
            # ImageNet-pretrained ResNet18
            self.backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
            self.backbone.fc = nn.Identity()
            self.features_dim = 512

        elif backbone == 'places_resnet18':
            # Load ResNet18 pretrained on Places365
            self.backbone = models.resnet18(num_classes=365)
            checkpoint = torch.hub.load_state_dict_from_url(
                'http://places2.csail.mit.edu/models_places365/resnet18_places365.pth.tar',
                map_location='cpu'
            )
            state_dict = {k.replace('module.', ''): v for k, v in checkpoint['state_dict'].items()}
            self.backbone.load_state_dict(state_dict)
            self.backbone.fc = nn.Identity()
            self.features_dim = 512

        else:
            # Use custom backbone if specified
            self.backbone = CustomBackbone()
            self.features_dim = self.backbone.features_dim

        self.classifier = nn.Sequential(
            nn.Linear(self.features_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_rot_classes)
        )

    def forward(self, x: torch.Tensor) -> Dict[str, Any]:
        features = self.backbone(x)
        features_flat = features.view(features.size(0), -1)
        logits = self.classifier(features_flat)
        return {
            'logits': logits,
            'features': features_flat
        }

    def calculate_ssl_loss(self, logits: torch.Tensor, rot_labels: torch.Tensor) -> torch.Tensor:
        return F.cross_entropy(logits, rot_labels.long())

In [5]:
class CaSSLePredictor(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),  # Add BatchNorm
            nn.ReLU(),
            nn.Dropout(0.3),  # Add Dropout for regularization
            nn.Linear(hidden_dim, hidden_dim // 2),  # Additional layer
            nn.BatchNorm1d(hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim // 2, output_dim),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)

In [None]:
class CaSSleTrainer:
    def __init__(self, base_ssl_model: RotNetModel,
                 ca_predictor_hidden_dim: int,
                 learning_rate: float = 1e-4, lambda_cassle: float = 0.1, device: str = 'cuda'):

        self.base_ssl_model = base_ssl_model.to(device) # This is f_t + rotation_head
        self.lambda_cassle = lambda_cassle
        self.device = device

        # Input and output dimensions for CaSSLe Predictor are the backbone's feature dimension
        predictor_input_output_dim = self.base_ssl_model.features_dim

        # Initialize the current CaSSLe predictor
        self.g_current = CaSSLePredictor(
            predictor_input_output_dim,
            ca_predictor_hidden_dim,
            predictor_input_output_dim
        ).to(device)

        # Optimizer for ALL trainable parameters: current RotNet model (f_t + head) AND predictor g
        self.optimizer = torch.optim.AdamW([
            {'params': self.base_ssl_model.backbone.parameters(), 'lr': learning_rate * 0.1},  # Lower LR for backbone
            {'params': self.base_ssl_model.classifier.parameters(), 'lr': learning_rate},
            {'params': self.g_current.parameters(), 'lr': learning_rate}
        ], weight_decay=0.01)

        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            self.optimizer, T_max=50, eta_min=1e-6
        )


        # This will hold the frozen previous encoder (f_{t-1}^{frozen})
        self.f_frozen_teacher = None

    def set_previous_frozen_encoder(self, encoder_state_dict: Dict[str, Any]):
        """
        Loads the state of the previous encoder (f_{t-1}) and freezes it.
        """
        # Create new backbone with same configuration as original
        self.f_frozen_teacher = copy.deepcopy(self.base_ssl_model.backbone)
        self.f_frozen_teacher.to(self.device)

        # Load state dict
        self.f_frozen_teacher.load_state_dict(encoder_state_dict)

        # Freeze the parameters
        for param in self.f_frozen_teacher.parameters():
            param.requires_grad = False

        print(f"Frozen encoder (f_t-1) loaded and parameters frozen: {all(not p.requires_grad for p in self.f_frozen_teacher.parameters())}")


    def train_task(self, data_loader: torch.utils.data.DataLoader, epochs: int):
        self.base_ssl_model.train() # f_t + rotation_head is trainable
        self.g_current.train() # g is trainable

        # Set f_frozen_teacher to eval mode to disable dropout/batchnorm updates for teacher
        if self.f_frozen_teacher:
            self.f_frozen_teacher.eval()

        print(f"Distilling from frozen teacher encoder (f_t-1): {self.f_frozen_teacher is not None}")

        best_loss = float('inf')
        patience = 3
        patience_counter = 0
        min_delta = 0.001

        for epoch in range(epochs):
            total_ssl_loss = 0
            total_cassle_loss = 0
            total_loss = 0

            for batch_idx, batch in enumerate(data_loader):
                rotated_imgs, rotation_labels, original_labels = batch[:3] # _ for original_label
                self.optimizer.zero_grad()

                # rotated_imgs: (batch_size, 4, C, H, W)
                # rotation_labels: (batch_size, 4)

                # Flatten the batch and rotation dimensions for model input
                imgs_flat = rotated_imgs.view(-1, *rotated_imgs.shape[2:]).to(self.device)
                labels_flat = rotation_labels.view(-1).to(self.device)
                self.scaler = GradScaler()
                with autocast(device_type='cuda'):
                    #  Forward Pass through the current trainable RotNet model (f_t + head)
                    ssl_output = self.base_ssl_model(imgs_flat)

                    # Calculate Base Self-Supervised Loss  (Cross-Entropy)
                    loss_ssl = self.base_ssl_model.calculate_ssl_loss(ssl_output['logits'], labels_flat)

                    # Calculate CaSSle Distillation Loss (L_D)
                    loss_cassle = torch.tensor(0.0).to(self.device) # Initialize to 0 for the first task

                    if self.f_frozen_teacher:
                        # Get features from the *frozen previous encoder* (f_{t-1}^{frozen})
                        with torch.no_grad():
                            features_from_frozen = self.f_frozen_teacher(imgs_flat)

                        # Student predictions from current trainable 'g'
                        # g takes features from current f_t
                        student_pred = self.g_current(ssl_output['features'])
                        teacher_target = features_from_frozen

                        # Normalize both before cosine similarity
                        student_pred_norm = F.normalize(student_pred, dim=-1)
                        teacher_target_norm = F.normalize(teacher_target, dim=-1)

                        # Compute distillation loss 
                        loss_cassle = (1 - F.cosine_similarity(student_pred_norm, teacher_target_norm, dim=-1).mean()) * self.lambda_cassle

                    # --- Total Loss and Optimization ---
                    loss = loss_ssl +  loss_cassle

                self.scaler.scale(loss).backward()
                self.scaler.step(self.optimizer)
                self.scaler.update()

                total_ssl_loss += loss_ssl.item()
                total_cassle_loss += loss_cassle.item()
                total_loss += loss.item()

            #Early stopping
            avg_loss = total_loss / len(data_loader)
            #scheduler step
            self.scheduler.step()

            if avg_loss < best_loss - min_delta:
                best_loss = avg_loss
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break

            print(f"Epoch {epoch+1}/{epochs} - SSL Loss: {total_ssl_loss / len(data_loader):.4f}, "
                  f"CaSSle Loss: {total_cassle_loss / len(data_loader):.4f}, "
                  f"Total Loss: {total_loss / len(data_loader):.4f}")

        # State_dict of the current RotNet model's backbone (f_t).
        return self.base_ssl_model.backbone.state_dict()

In [None]:

from datasets import load_dataset


#Evaluation Function
class LinearEvalDataset(Dataset):
    """Pre-filter indices for better performance"""
    def __init__(self, original_dataset, class_list, transform):
        if hasattr(original_dataset, 'targets'):
            targets = original_dataset.targets
        else:
            targets = [label for _, label in original_dataset]

        # Pre-filter all indices at once
        self.indices = [i for i, label in enumerate(targets) if label in class_list]
        self.original_dataset = original_dataset
        self.transform = transform
        print(f"Filtered dataset: {len(self.indices)} samples from {len(class_list)} classes")

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        img, label = self.original_dataset[self.indices[idx]]
        if self.transform:
            img = self.transform(img)
        return img, label

def extract_features_once(feature_extractor, dataset, batch_size=128, device=torch.device("cuda")):
    """Pre-extract all features once and cache them"""
    feature_extractor.eval()

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False,
                           num_workers=4, pin_memory=True)

    all_features = []
    all_labels = []

    with torch.no_grad():
        for img_batch, label_batch in dataloader:
            img_batch = img_batch.to(device)
            features = feature_extractor(img_batch).view(img_batch.size(0), -1)

            all_features.append(features.cpu())  # Move to CPU to save GPU memory
            all_labels.append(label_batch)

    return torch.cat(all_features, dim=0), torch.cat(all_labels, dim=0)

def evaluate_model(feature_extractor: torch.nn.Module,
                       all_seen_classes: List[int],
                       train_full,
                       val_full,
                       base_transform,
                       batch_size: int = 128,
                       linear_eval_epochs: int = 100,
                       device: torch.device = torch.device("cuda")):

    # Freeze feature extractor
    feature_extractor.eval()
    for param in feature_extractor.parameters():
        param.requires_grad = False

    # Create datasets
    train_linear_dataset = LinearEvalDataset(train_full, all_seen_classes, base_transform)
    val_linear_dataset = LinearEvalDataset(val_full, all_seen_classes, base_transform)

    # PRE-EXTRACT FEATURES ONCE
    print("Extracting training features...")
    train_features, train_labels = extract_features_once(feature_extractor, train_linear_dataset, batch_size, device)
    print("Extracting validation features...")
    val_features, val_labels = extract_features_once(feature_extractor, val_linear_dataset, batch_size, device)

    # Map labels to contiguous range
    label_to_contiguous_map = {label: i for i, label in enumerate(sorted(all_seen_classes))}
    train_labels_mapped = torch.tensor([label_to_contiguous_map[l.item()] for l in train_labels])
    val_labels_mapped = torch.tensor([label_to_contiguous_map[l.item()] for l in val_labels])

    # Create feature datasets
    train_feature_dataset = TensorDataset(train_features, train_labels_mapped)
    val_feature_dataset = TensorDataset(val_features, val_labels_mapped)

    train_loader = DataLoader(train_feature_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_feature_dataset, batch_size=batch_size, shuffle=False)

    # Initialize classifier
    features_dim = train_features.shape[1]
    num_output_classes = len(all_seen_classes)

    linear_classifier = nn.Sequential(
        nn.Linear(features_dim, 256),
        nn.ReLU(),
        nn.Dropout(0.3),
        nn.Linear(256, num_output_classes)
    ).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(linear_classifier.parameters(), lr=1e-3, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=linear_eval_epochs)

    # Train classifier (much faster now - no feature extraction)
    linear_classifier.train()
    for epoch in range(linear_eval_epochs):
        for features_batch, labels_batch in train_loader:
            features_batch = features_batch.to(device)
            labels_batch = labels_batch.to(device)

            optimizer.zero_grad()
            outputs = linear_classifier(features_batch)
            loss = criterion(outputs, labels_batch)
            loss.backward()
            optimizer.step()
        scheduler.step()

    # Evaluate
    linear_classifier.eval()
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for features_batch, labels_batch in val_loader:
            features_batch = features_batch.to(device)
            labels_batch = labels_batch.to(device)

            outputs = linear_classifier(features_batch)
            _, predicted = torch.max(outputs, 1)
            total_samples += labels_batch.size(0)
            total_correct += (predicted == labels_batch).sum().item()

    accuracy = 100 * total_correct / total_samples
    print(f"Linear evaluation accuracy on {len(all_seen_classes)} classes: {accuracy:.2f}%")

    # Restore feature extractor
    for param in feature_extractor.parameters():
        param.requires_grad = True
    feature_extractor.train()

    return accuracy

# Calculate all random baselines once at the beginning
def calculate_all_random_baselines(task_class_splits, train_full, val_full, base_transform, device):
    """Calculate random baselines for all tasks once"""
    random_accuracies = {}

    # Create a single random model for all evaluations
    random_model = models.resnet18(weights=None)
    random_model.fc = nn.Identity()
    random_model.to(device)
    random_model.eval()

    try:
        for task_idx, class_list in enumerate(task_class_splits):
            print(f"Calculating random baseline for Task {task_idx+1}")
            random_acc = evaluate_model(
                random_model, class_list, train_full, val_full, base_transform,
                batch_size=128, linear_eval_epochs=5, device=device  # Fewer epochs for random
            )
            random_accuracies[task_idx] = random_acc
    finally:
        del random_model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()




    return random_accuracies



def set_seed(seed: int = 42):
    import random, os
    import numpy as np
    import torch
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    print(f"Random seed set to {seed}")

set_seed(42)


# Transforms
base_transform = transforms.Compose([
    transforms.Resize(224),          
    transforms.CenterCrop(224),       
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])



# Task Split
#all_classes_shuffled = list(range(NUM_TOTAL_CLASSES))
#random.shuffle(all_classes_shuffled)

#task_class_splits = [all_classes_shuffled[i:i + NUM_CLASSES_PER_TASK] for i in range(0, NUM_TOTAL_CLASSES, NUM_CLASSES_PER_TASK)]
task_class_splits = [
    [42, 41, 91, 9, 65, 50, 1, 70, 15, 78],
    [73, 10, 55, 56, 72, 45, 48, 92, 76, 37],
    [30, 21, 32, 96, 80, 49, 83, 26, 87, 33],
    [8, 47, 59, 63, 74, 44, 98, 52, 85, 12],
    [36, 23, 39, 40, 18, 66, 61, 60, 7, 34],
    [99, 46, 2, 51, 16, 38, 58, 68, 22, 62],
    [24, 5, 6, 67, 82, 19, 79, 43, 90, 20],
    [0, 95, 57, 93, 53, 89, 25, 71, 84, 77],
    [64, 29, 27, 88, 97, 4, 54, 75, 11, 69],
    [86, 13, 17, 28, 31, 35, 94, 3, 14, 81]
]
task_datasets = []
for i, class_list in enumerate(task_class_splits):
    print(f"Task {i+1} includes classes: {class_list}")
    task_dataset = RotNetImageNet100Dataset(train_full, class_list, base_transform)
    task_datasets.append(task_dataset)

# Init Model
resnet18_backbone = models.resnet18(weights=None)
resnet18_backbone.fc = nn.Identity()

# Initialize RotNet with pretrained ResNet18
base_ssl_model_instance = RotNetModel(
    num_rot_classes=NUM_ROT_CLASSES,
    backbone='places_resnet18'  # Add this parameter to RotNetModel
).to(DEVICE)
prev_encoder_state_dict = None

# Training Loop
all_task_accuracies = []
random_accuracies_Ri = {}

print("Calculating random baselines for all tasks...")
random_accuracies_Ri = calculate_all_random_baselines(
    task_class_splits, train_full, val_full, base_transform, DEVICE
)
print("Random baselines calculated!")

# STEP 2: Modified training loop
all_task_accuracies = []

for task_id, current_task_dataset in enumerate(task_datasets):
    print(f"\n===== Training Task {task_id + 1}/{len(task_datasets)} =====")

    current_task_loader = DataLoader(
        current_task_dataset, batch_size=BATCH_SIZE, shuffle=True,
        num_workers=2, pin_memory=True,
        prefetch_factor=2
    )

    trainer = CaSSleTrainer(
        base_ssl_model=base_ssl_model_instance,
        ca_predictor_hidden_dim=1024,
        learning_rate=LEARNING_RATE,
        lambda_cassle=LAMBDA_CASSLE,
        device=DEVICE
    )

    # Save previous encoder
    if prev_encoder_state_dict:
        trainer.set_previous_frozen_encoder(prev_encoder_state_dict)

    # Train encoder on current task and save its state_dict
    prev_encoder_state_dict = trainer.train_task(current_task_loader, NUM_EPOCHS_PER_TASK)

    print(f"\n--- Evaluating after Task {task_id + 1} ---")
    current_seen_classes = sorted(set().union(*task_class_splits[:task_id + 1]))
    accuracies_after_this_task = []

    # Evaluate model for each task seen so far
    for eval_task_idx in range(task_id + 1):
        eval_task_classes = task_class_splits[eval_task_idx]
        print(f"  Evaluating on classes from Task {eval_task_idx+1}: {eval_task_classes}")

        # USE THE FAST EVALUATION FUNCTION
        acc_jk = evaluate_model(  # Changed from evaluate_model to evaluate_model_fast
            base_ssl_model_instance.backbone,
            eval_task_classes,
            train_full,
            val_full,
            base_transform,
            LINEAR_EVAL_BATCH_SIZE,
            LINEAR_EVAL_EPOCHS,  # Consider reducing this to 10-20
            DEVICE
        )
        accuracies_after_this_task.append(acc_jk)

        # Random baseline is already calculated - just print it
        random_acc = random_accuracies_Ri[eval_task_idx]
        print(f"    Random baseline for Task {eval_task_idx+1}: {random_acc:.2f}%")
        print(f"    Current accuracy for Task {eval_task_idx+1}: {acc_jk:.2f}%")
        print(f"    Improvement over random: {acc_jk - random_acc:.2f}%")

    all_task_accuracies.append(accuracies_after_this_task)

    # Optional: Clear GPU cache after each task
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Rest of your metrics calculation remains the same
T = len(task_datasets)

# Average Accuracy
final_accuracies_row = all_task_accuracies[T-1]
avg_accuracy = sum(final_accuracies_row) / T
print(f"\nFinal Average Accuracy (A): {avg_accuracy:.2f}%")

# Calculate average random baseline for comparison
avg_random_accuracy = sum(random_accuracies_Ri.values()) / len(random_accuracies_Ri)
print(f"Average Random Baseline: {avg_random_accuracy:.2f}%")
print(f"Improvement over Random: {avg_accuracy - avg_random_accuracy:.2f}%")

# Forgetting calculation (unchanged)
forgetting = 0
if T > 1:
    for i in range(T - 1):
        max_acc = max(all_task_accuracies[t][i] for t in range(T) if i < len(all_task_accuracies[t]))
        final_acc = all_task_accuracies[T-1][i]
        forgetting += (max_acc - final_acc)
    forgetting /= (T - 1)
print(f"Final Forgetting (F): {forgetting:.2f}%")

# Backward Transfer calculation (unchanged)
backward_transfer = 0
count = 0

if T > 1:
    for new_task in range(1, T):
        for old_task in range(new_task):
            if old_task < len(all_task_accuracies[new_task - 1]) and old_task < len(all_task_accuracies[new_task]):
                acc_before = all_task_accuracies[new_task - 1][old_task]
                acc_after = all_task_accuracies[new_task][old_task]
                backward_transfer += (acc_after - acc_before)
                count += 1
            else:
                print(f"Skipping BT for old_task {old_task+1}, new_task {new_task+1}: missing data")

    backward_transfer /= count if count > 0 else 1
else:
    backward_transfer = 0

print(f"Final Backward Transfer (BT): {backward_transfer:.2f}%")

Random seed set to 42
Task 1 includes classes: [42, 41, 91, 9, 65, 50, 1, 70, 15, 78]
Task 2 includes classes: [73, 10, 55, 56, 72, 45, 48, 92, 76, 37]
Task 3 includes classes: [30, 21, 32, 96, 80, 49, 83, 26, 87, 33]
Task 4 includes classes: [8, 47, 59, 63, 74, 44, 98, 52, 85, 12]
Task 5 includes classes: [36, 23, 39, 40, 18, 66, 61, 60, 7, 34]
Task 6 includes classes: [99, 46, 2, 51, 16, 38, 58, 68, 22, 62]
Task 7 includes classes: [24, 5, 6, 67, 82, 19, 79, 43, 90, 20]
Task 8 includes classes: [0, 95, 57, 93, 53, 89, 25, 71, 84, 77]
Task 9 includes classes: [64, 29, 27, 88, 97, 4, 54, 75, 11, 69]
Task 10 includes classes: [86, 13, 17, 28, 31, 35, 94, 3, 14, 81]


Downloading: "http://places2.csail.mit.edu/models_places365/resnet18_places365.pth.tar" to /root/.cache/torch/hub/checkpoints/resnet18_places365.pth.tar
100%|██████████| 43.4M/43.4M [00:04<00:00, 9.96MB/s]


Calculating random baselines for all tasks...
Calculating random baseline for Task 1
Filtered dataset: 12797 samples from 10 classes
Filtered dataset: 500 samples from 10 classes
Extracting training features...
Extracting validation features...
Linear evaluation accuracy on 10 classes: 20.40%
Calculating random baseline for Task 2
Filtered dataset: 12856 samples from 10 classes
Filtered dataset: 500 samples from 10 classes
Extracting training features...
Extracting validation features...
Linear evaluation accuracy on 10 classes: 27.80%
Calculating random baseline for Task 3
Filtered dataset: 12821 samples from 10 classes
Filtered dataset: 500 samples from 10 classes
Extracting training features...
Extracting validation features...
Linear evaluation accuracy on 10 classes: 22.60%
Calculating random baseline for Task 4
Filtered dataset: 12964 samples from 10 classes
Filtered dataset: 500 samples from 10 classes
Extracting training features...
Extracting validation features...
Linear eval