Ensure the latest version of the `sas` package is installed

In [1]:
# Install the local `sas` package from the ./sas-pip directory into the kernel's environment.
%pip install sas-pip/

Processing ./sas-pip
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: sas
  Building wheel for sas (setup.py) ... done
  Created wheel for sas: filename=sas-1.0-py3-none-any.whl size=6289 sha256=6e8f8d3141702ae426b4a9635e99beaa3da3ecf8c32cedb7dcc76cad8522aca4
  Stored in directory: /home/sjoshi/.cache/pip/wheels/4e/07/53/a089817b38c15451794418a74eb8812ee557a2982d04e9d60a
Successfully built sas
Installing collected packages: sas
  Attempting uninstall: sas
    Found existing installation: sas 1.0
    Uninstalling sas-1.0:
      Successfully uninstalled sas-1.0
Successfully installed sas-1.0
Note: you may need to restart the kernel to use updated packages.


Load Data

In [6]:
import torchvision
from torchvision import transforms
from torchvision.datasets.vision import VisionDataset
from typing import Optional, Callable, Tuple, Any
import os
import pickle
import numpy as np
import os.path
from PIL import Image


class CIFAR10(VisionDataset):
    """`CIFAR10 <https://www.cs.toronto.edu/~kriz/cifar.html>`_ Dataset.

    Reads the pickled CIFAR-10 batch files from ``<root>/cifar-10-batches-py``.
    Unlike a plain download-and-verify dataset, this class does NOT verify the
    MD5 checksums of the batch files when loading them (see the note in
    ``__init__``); checksums are only consulted by ``download()``.

    Args:
        root (string): Root directory of dataset where directory
            ``cifar-10-batches-py`` exists or will be saved to if download is set to True.
        train (bool, optional): If True, creates dataset from training set, otherwise
            creates from test set.
        transform (callable, optional): A function/transform that takes in an PIL image
            and returns a transformed version. E.g, ``transforms.RandomCrop``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        download (bool, optional): If true, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.

    """
    base_folder = 'cifar-10-batches-py'
    url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
    filename = "cifar-10-python.tar.gz"
    tgz_md5 = 'c58f30108f718f92721af3b95e74349a'
    # [batch file name, expected MD5] pairs for the training split
    train_list = [
        ['data_batch_1', 'c99cafc152244af753f735de768cd75f'],
        ['data_batch_2', 'd4bba439e000b95fd0a9bffe97cbabec'],
        ['data_batch_3', '54ebc095f3ab1f0389bbae665268c751'],
        ['data_batch_4', '634d18415352ddfa80567beed471001a'],
        ['data_batch_5', '482c414d41f54cd18b22e5b47cb7c3cb'],
    ]

    # [batch file name, expected MD5] pairs for the test split
    test_list = [
        ['test_batch', '40351d587109b95175f43aff81a1287e'],
    ]
    # Metadata file and the key under which it stores the class names
    meta = {
        'filename': 'batches.meta',
        'key': 'label_names',
        'md5': '5ff9c542aee3614f3951f8cda6e48888',
    }

    def __init__(
            self,
            root: str,
            train: bool = True,
            transform: Optional[Callable] = None,
            target_transform: Optional[Callable] = None,
            download: bool = False,
    ) -> None:
        """Load every batch file of the selected split into memory.

        Raises:
            OSError: if a batch file or the metadata file cannot be opened.
        """
        super().__init__(root, transform=transform,
                         target_transform=target_transform)

        self.train = train  # training set or test set

        if download:
            self.download()

        # NOTE: no integrity (MD5) check is performed before loading.
        # NOTE(review): presumably this is deliberate because the batch files under
        # `root` have been modified and would no longer match the stock checksums
        # -- confirm before re-enabling a check here.
        downloaded_list = self.train_list if self.train else self.test_list

        self.data: Any = []
        self.targets = []

        # now load the pickled numpy arrays
        for file_name, _md5 in downloaded_list:
            file_path = os.path.join(self.root, self.base_folder, file_name)
            with open(file_path, 'rb') as f:
                # SECURITY: pickle.load can execute arbitrary code; only point
                # `root` at files from a trusted source.
                entry = pickle.load(f, encoding='latin1')
            self.data.append(entry['data'])
            if 'labels' in entry:
                self.targets.extend(entry['labels'])
            else:
                self.targets.extend(entry['fine_labels'])

        self.data = np.vstack(self.data).reshape(-1, 3, 32, 32)
        self.data = self.data.transpose((0, 2, 3, 1))  # convert to HWC

        self._load_meta()

    def _load_meta(self) -> None:
        """Read the class names from the metadata file and build ``class_to_idx``."""
        path = os.path.join(self.root, self.base_folder, self.meta['filename'])
        # The metadata file is read without an MD5 check, like the batch files.
        with open(path, 'rb') as infile:
            # SECURITY: see the pickle note in __init__.
            data = pickle.load(infile, encoding='latin1')
        self.classes = data[self.meta['key']]
        self.class_to_idx = {_class: i for i, _class in enumerate(self.classes)}

    def __getitem__(self, index: int) -> Tuple[Any, Any]:
        """
        Args:
            index (int): Index

        Returns:
            tuple: (image, target) where target is index of the target class.
        """
        img, target = self.data[index], self.targets[index]

        # doing this so that it is consistent with all other datasets
        # to return a PIL Image
        img = Image.fromarray(img)

        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

    def __len__(self) -> int:
        """Return the number of images in the loaded split."""
        return len(self.data)

    def _check_integrity(self) -> bool:
        """Return True when every train and test batch file matches its expected MD5."""
        # Imported here because the surrounding file never brings this helper into scope.
        from torchvision.datasets.utils import check_integrity

        root = self.root
        for filename, md5 in (self.train_list + self.test_list):
            fpath = os.path.join(root, self.base_folder, filename)
            if not check_integrity(fpath, md5):
                return False
        return True

    def download(self) -> None:
        """Download and extract the archive into ``root`` unless all batch files already verify."""
        # Imported here because the surrounding file never brings this helper into scope.
        from torchvision.datasets.utils import download_and_extract_archive

        if self._check_integrity():
            print('Files already downloaded and verified')
            return
        download_and_extract_archive(self.url, self.root, filename=self.filename, md5=self.tgz_md5)

    def extra_repr(self) -> str:
        """Return the split name shown in the dataset's repr."""
        # Truthiness (not `is True`) so the label agrees with the split chosen in __init__.
        return "Split: {}".format("Train" if self.train else "Test")


# Dataset root that contains the `cifar-10-batches-py` folder to load.
# Alternatives used previously:
#   - CIFAR-100: torchvision.datasets.CIFAR100("/data1/cifar100/", transform=transforms.ToTensor(), download=True)
#   - another CIFAR-10 copy: /data1/cifar10-simclr-poisoned-cps/cifar-10-batches-py
cifar10 = CIFAR10(
    root="/data1/cifar10-byol-poisoned-cps/",
    transform=transforms.ToTensor(),
    download=False,
)

# Device string passed to the SAS helpers in the cells below.
device = "cuda:0"

Partition into approximate latent classes

In [7]:
from sas.approx_latent_classes import clip_approx
from sas.subset_dataset import SASSubsetDataset
import random 

# Draw the labeled examples from a locally seeded generator so the same examples
# are chosen on every run, without touching the global `random` state.
LABELED_EXAMPLES_SEED = 0
NUM_LABELED_EXAMPLES = 500

labeled_examples_rng = random.Random(LABELED_EXAMPLES_SEED)
rand_labeled_examples_indices = labeled_examples_rng.sample(
    range(len(cifar10)), NUM_LABELED_EXAMPLES
)
# Index 1 of each dataset item is its target label.
rand_labeled_examples_labels = [cifar10[i][1] for i in rand_labeled_examples_indices]

# To run on CIFAR-100 instead, pass that dataset as `img_trainset` (and sample from it
# above) with num_classes=100.
partition = clip_approx(
    img_trainset=cifar10,
    labeled_example_indices=rand_labeled_examples_indices,
    labeled_examples_labels=rand_labeled_examples_labels,
    num_classes=10,
    device=device
)

Load proxy model

In [8]:
from torch import nn 

class ProxyModel(nn.Module):
    """Chain an encoder with its critic's projection in a single module.

    Args:
        net: module applied to the input first.
        critic: object exposing a ``project`` method applied to ``net``'s output.
    """

    def __init__(self, net, critic):
        super().__init__()
        self.net = net
        self.critic = critic

    def forward(self, x):
        """Return ``critic.project(net(x))``."""
        encoded = self.net(x)
        projected = self.critic.project(encoded)
        return projected

import torch 

# SECURITY: torch.load unpickles the file and can execute arbitrary code; only load
# checkpoints that come from a trusted source.
# NOTE: PyTorch >= 2.6 defaults to weights_only=True, which rejects fully pickled
# modules; on those versions pass weights_only=False for these files.
# NOTE(review): the file names say these proxies were trained on CIFAR-100 while the
# dataset above is CIFAR-10 -- confirm that this pairing is intended.
# map_location puts the loaded tensors on `device` no matter which device the
# checkpoint was saved from.
net = torch.load("proxy-cifar100-resnet10-399-net.pt", map_location=device)
critic = torch.load("proxy-cifar100-resnet10-399-critic.pt", map_location=device)
proxy_model = ProxyModel(net, critic)

Get subset and save it to file

In [10]:
# Tag embedded in the output file names.
cl_method = 'byol'

# Arguments that are the same for every subset size.
# (For CIFAR-100, swap in that dataset and num_downstream_classes=100.)
shared_sas_kwargs = dict(
    dataset=cifar10,
    num_downstream_classes=10,
    device=device,
    proxy_model=proxy_model,
    approx_latent_class_partition=partition,
    verbose=True,
)

# Build one subset per fraction and write each to its own file.
for subset_fraction in (0.2, 0.4, 0.6, 0.8):
    filename = f"cifar10-{cl_method}-cps-poisoned-{subset_fraction}-sas-indices.pkl"
    subset_dataset = SASSubsetDataset(subset_fraction=subset_fraction, **shared_sas_kwargs)
    subset_dataset.save_to_file(filename)

Subset Selection:: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:13<00:00,  1.35s/it]

Subset Size: 10000
Discarded 40000 examples



Subset Selection:: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:14<00:00,  1.40s/it]

Subset Size: 20000
Discarded 30000 examples



Subset Selection:: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:14<00:00,  1.45s/it]

Subset Size: 30000
Discarded 20000 examples



Subset Selection:: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:14<00:00,  1.43s/it]

Subset Size: 40000
Discarded 10000 examples



