In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Fri Jun 24 09:27:56 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Vision Module & Embedding Generation

In [2]:
import torch
#import egg.core as core
from torchvision import models, datasets, transforms
import random
import os.path
from torch.utils.data import Dataset
from typing import Type, Any, Callable, Union, List, Dict, Optional, cast
from collections import OrderedDict 
from torchvision.models.resnet import *
from torchvision.models.resnet import BasicBlock, Bottleneck
from torchvision.models.resnet import model_urls
from torch.hub import load_state_dict_from_url

#import psutil

# Run on the GPU whenever CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Mount Google Drive at /content/drive; the dataset and embedding paths used later in this
# notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Vision Module & Image Dataset

In [4]:
# First, the pretrained resnet18 is loaded.
# NOTE(review): `rn18` is not referenced again in the cells visible here (the vision module is built
# separately through new_resnet) - confirm whether this load is still needed.
rn18 = models.resnet18(pretrained=True)

# Since the output of the resnet18 model is an almost one-hot encoding, it makes more sense to use an
# intermediate representation. I will use the output of the second-to-last layer, i.e. the output of the avgpool layer.

# In order to get that desired output, we need to define a new model which is basically a copy of the
# pretrained resnet18 whose forward pass stops before the last layer. I do this mainly based on https://medium.com/the-owl/extracting-features-from-an-intermediate-layer-of-a-pretrained-model-in-pytorch-easy-way-62631c7fa8f6
class IntResNet(ResNet):
    """ResNet whose forward pass stops after a chosen top-level submodule.

    The complete ResNet is still constructed, so a full checkpoint can be loaded
    with ``load_state_dict``; only the forward pass is truncated. It runs the
    top-level submodules in registration order (conv1, bn1, relu, maxpool,
    layer1, ...) up to and including ``output_layer`` and returns that output.

    Args:
        output_layer: name of the top-level submodule whose output is returned,
            e.g. ``'avgpool'``.
        *args: positional arguments forwarded unchanged to ``ResNet.__init__``
            (block class, list of per-stage block counts, ...).
        **kwargs: keyword arguments forwarded unchanged to ``ResNet.__init__``.

    Raises:
        ValueError: if ``output_layer`` is not the name of a top-level submodule.
    """

    def __init__(self, output_layer, *args, **kwargs):
        # Keyword arguments are accepted and forwarded so that callers passing
        # ResNet options by name do not fail with a TypeError.
        super().__init__(*args, **kwargs)
        self.output_layer = output_layer

        module_names = list(self._modules.keys())
        if output_layer not in module_names:
            # Without this check an unknown name would silently select every layer.
            raise ValueError(
                "unknown output_layer {!r}; expected one of {}".format(output_layer, module_names)
            )

        # Names of the submodules to execute, in order, ending with output_layer.
        self._layers = module_names[:module_names.index(output_layer) + 1]
        # Plain (non-registered) mapping from name to the already-registered submodule.
        self.layers = OrderedDict((name, getattr(self, name)) for name in self._layers)

    def _forward_impl(self, x):
        """Apply the selected submodules to ``x`` in order and return the result."""
        for name in self._layers:
            if name == 'fc':
                # The stock ResNet flattens the pooled feature map before the linear
                # classifier; without this, stopping at 'fc' fails on a shape mismatch.
                x = torch.flatten(x, 1)
            x = self.layers[name](x)

        return x

    def forward(self, x):
        """Return the output of ``output_layer`` for the input batch ``x``."""
        return self._forward_impl(x)

def new_resnet(
    arch: str,
    outlayer: str,
    block: Type[Union[BasicBlock, Bottleneck]],
    layers: List[int],
    pretrained: bool,
    progress: bool,
    **kwargs: Any
) -> IntResNet:
    """Build an ``IntResNet`` that stops at ``outlayer``, optionally with pretrained weights.

    Args:
        arch: key into torchvision's ``model_urls`` table (e.g. ``'resnet18'``);
            only consulted when ``pretrained`` is true.
        outlayer: name of the submodule whose output the model returns.
        block: residual block class passed to the ``IntResNet`` constructor.
        layers: per-stage block counts passed to the ``IntResNet`` constructor.
        pretrained: when true, download the checkpoint registered for ``arch``
            and load it into the model.
        progress: forwarded to ``load_state_dict_from_url`` as its ``progress`` flag.
        **kwargs: passed on to the ``IntResNet`` constructor.

    Returns:
        The constructed model (with the checkpoint loaded if ``pretrained``).

    Raises:
        KeyError: if ``pretrained`` is true and ``arch`` has no entry in ``model_urls``.
    """
    model = IntResNet(outlayer, block, layers, **kwargs)
    if pretrained:
        if arch not in model_urls:
            # Same exception type as the plain dict lookup, but with a message
            # that says what went wrong and what would have been accepted.
            raise KeyError(
                "no pretrained checkpoint registered for arch {!r}; known: {}".format(
                    arch, sorted(model_urls)
                )
            )
        state_dict = load_state_dict_from_url(model_urls[arch],
                                              progress=progress)
        model.load_state_dict(state_dict)
    return model

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

In [5]:
# Second, build the vision module: a pretrained resnet18 whose forward pass ends at the avgpool layer.
vision_module = new_resnet(
    arch='resnet18',
    outlayer='avgpool',
    block=BasicBlock,
    layers=[2, 2, 2, 2],
    pretrained=True,
    progress=True,
)

# Lastly, switch the module to evaluation mode and move it to the device in use.
vision_module.eval()
vision_module.to(device)

IntResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)

In [6]:
# ImageNet training split read from the mounted Drive folder; ToTensor is the only transform applied.
# NOTE(review): no resizing or mean/std normalisation happens here, while torchvision documents its
# pretrained ResNets as expecting normalised inputs - confirm this is intended before changing it,
# because any embeddings already written to disk were produced with this transform.
imagenet_train_data = datasets.ImageNet(
    root='/content/drive/My Drive/Thesis_Data/ILSVRC2012',
    split='train',
    transform=transforms.ToTensor(),
)
#imagenet_val_data = datasets.ImageNet(root='/content/drive/My Drive/Thesis_Data/ILSVRC2012', split = 'val', transform = transforms.ToTensor())

In [7]:
# Reproducible split of the ImageNet training set: 60% for training, and the remaining 40%
# halved into a test part and a zero-shot part. Both splits draw from a fresh generator seeded
# with 42 so the same partition is obtained in every session.
n_images = len(imagenet_train_data)
train_data_len = int(n_images * 0.6)
test_and_zero_shot_data_len = n_images - train_data_len
train_data, test_and_zero_shot_data = torch.utils.data.random_split(
    imagenet_train_data,
    [train_data_len, test_and_zero_shot_data_len],
    generator=torch.Generator().manual_seed(42),
)

test_data_len = int(test_and_zero_shot_data_len * 0.5)
zero_shot_data_len = test_and_zero_shot_data_len - test_data_len
test_data, zero_shot_data = torch.utils.data.random_split(
    test_and_zero_shot_data,
    [test_data_len, zero_shot_data_len],
    generator=torch.Generator().manual_seed(42),
)

## Produce & Store Embeddings
Since my computational resources are limited, the image embeddings that are fed to the sender/receiver are not computed at inference time but ahead of training.

In [8]:
def produce_embeddings(dataset, embeddings_path, start_idx, end_idx, model=None, model_device=None):
    """Embed ``dataset[start_idx:end_idx]`` and append one text line per image to a file.

    Each line has the form ``<space-separated floats> , <label> , <idx>`` followed by a
    newline, where the floats are the model output with all nesting removed.

    Args:
        dataset: indexable collection whose items are ``(image_tensor, label)`` pairs.
        embeddings_path: path of the text file the lines are appended to.
        start_idx: first dataset index to embed (inclusive).
        end_idx: dataset index at which to stop (exclusive).
        model: callable producing the embedding for a batch of one image; defaults to
            the notebook-level ``vision_module``.
        model_device: device the image is moved to before the forward pass; defaults to
            the notebook-level ``device``.
    """
    # Fall back to the notebook globals so existing four-argument calls keep working.
    if model is None:
        model = vision_module
    if model_device is None:
        model_device = device

    for idx in range(start_idx, end_idx):
        img, label = dataset[idx]

        # Pure inference: disabling autograd avoids building a graph for every image,
        # which saves memory and time without changing the computed values.
        with torch.no_grad():
            # img[None, ...] adds a leading batch dimension of size one.
            embedding = model(img[None, ...].to(model_device))

        # Turn the nested list into a flat, space-separated string of numbers.
        embedding_string = str(embedding.tolist()).replace(',', '').replace('[', '').replace(']', '')

        # The file is reopened in append mode for every record, so each line is
        # closed out before the next image is processed.
        with open(embeddings_path, 'a') as txt_file:
            txt_file.write(embedding_string + ' , ' + str(label) + ' , ' + str(idx) + '\n')

In [9]:
#produce_embeddings(dataset = imagenet_val_data, embeddings_path = '/content/drive/My Drive/Thesis_Data/Embeddings/val.txt', start_idx = 0, end_idx = len(imagenet_val_data))

In [13]:
# Embed train-split items 5000..9999 and append them to the train embeddings file.
# NOTE(review): the range starts at 5000 - presumably indices 0..4999 were written in an earlier run; confirm.
produce_embeddings(
    dataset=train_data,
    embeddings_path='/content/drive/My Drive/Thesis_Data/Embeddings/train.txt',
    start_idx=5000,
    end_idx=10000,
)



In [11]:
#produce_embeddings(dataset = test_data, embeddings_path = '/content/drive/My Drive/Thesis_Data/Embeddings/test.txt', start_idx = 0, end_idx = 10000)

In [12]:
# Embed zero-shot-split items 0..9999 and append them to the zero-shot embeddings file.
produce_embeddings(
    dataset=zero_shot_data,
    embeddings_path='/content/drive/My Drive/Thesis_Data/Embeddings/zero_shot.txt',
    start_idx=0,
    end_idx=10000,
)

# Options
1st: embed all train images -> have a txt file of the form embedding, label -> go through the lines of the txt file: if the label is to be excluded -> write the entry to the zero-shot txt; if ...

2nd: split the ImageNet train set 60/40 (720,000/480,000) and split the 40% 50/50 (240,000 each) (use a generator for reproducibility between Colab sessions) -> embed all 3 train subsets (form: embedding, label, idx) -> to produce the EGG train inputs: go through the biggest one; for EGG test and zero-shot go through the smaller ones (when encountering an image from a class to be excluded while producing train or test: write the input to the zero-shot embedding file and repeat the iteration; save the idx so that no embedding is written several times)