In [4]:
import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from PIL import Image
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision import datasets, transforms
#from transformers import AutoTokenizer


In [None]:
class CHEX_Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing CheXpert image files with their labels.

    Args:
        image_names: Sequence of image paths relative to ``root``. A leading
            ``/`` on a name is tolerated and ignored.
        labels: Sequence of labels aligned index-for-index with
            ``image_names``. Labels are returned exactly as stored.
        root: Directory that contains the images. Defaults to the path that
            was previously hard-coded.
        transform: Callable applied to the loaded RGB PIL image. Defaults to
            resizing to 256x256 and converting to a tensor.

    Raises:
        ValueError: If ``image_names`` and ``labels`` differ in length.
    """

    def __init__(self, image_names, labels, root='/chexpert_dataset/train', transform=None):
        if len(image_names) != len(labels):
            raise ValueError(
                f"image_names ({len(image_names)}) and labels ({len(labels)}) "
                "must have the same length"
            )
        self.image_names = image_names
        self.labels = labels
        self.root = root
        if transform is None:
            transform = transforms.Compose(
                [
                    transforms.Resize((256, 256)),
                    transforms.ToTensor(),
                ]
            )
        self.transform = transform

    def __len__(self):
        """Return the number of (image, label) pairs."""
        return len(self.labels)

    def _image_path(self, idx):
        """Build the full path of the image stored at position ``idx``."""
        # Plain string concatenation only worked when every name started with
        # '/'. Stripping the leading slash and joining handles both spellings
        # (and stops os.path.join from discarding ``root``).
        return os.path.join(self.root, self.image_names[idx].lstrip('/'))

    def __getitem__(self, idx):
        """Load, convert to RGB and transform the image at ``idx``.

        Returns:
            Tuple ``(image, label)`` where ``image`` is the transformed image.
        """
        # Image.open is lazy; convert() forces the pixel data to be read, and
        # the context manager guarantees the file handle is released afterwards.
        with Image.open(self._image_path(idx)) as raw:
            image = raw.convert('RGB')
        return self.transform(image), self.labels[idx]

In [None]:
# Presumably collects the image file names of the training split.
# NOTE(review): neither get_image_name nor train_set is defined in the cells
# visible here — confirm they are created earlier in the notebook.
train_jpgs = get_image_name(train_set)

In [None]:
# Wrap the image names and their multi-hot labels in the dataset class defined
# in this notebook (CHEX_Dataset; there is no VOC_Dataset here, so the old name
# raised NameError on a fresh kernel).
train_dataset = CHEX_Dataset(train_jpgs, multi_hot)
# NOTE(review): shuffle=False is kept as written; training loaders are usually
# shuffled, so confirm the fixed order is intentional.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=False)

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Image encoder: torchvision's ResNeXt-50 (32x4d) with its classification head
# removed and replaced by a linear projection into a shared embedding space.
class Resnext50(nn.Module):
    """ResNeXt-50 backbone followed by a linear projection and L2 normalization.

    Args:
        embed_dim: Size of the output embedding.
        pretrained: Forwarded to ``models.resnext50_32x4d``; ``True`` (the
            default) loads the pretrained weights exactly as before.

    The forward pass returns one unit-length vector per input image, so a dot
    product between two outputs is their cosine similarity.
    """

    def __init__(self, embed_dim=512, pretrained=True):
        super().__init__()
        resnet = models.resnext50_32x4d(pretrained=pretrained)
        # Keep every child module except the final fully connected classifier.
        self.backbone = nn.Sequential(*list(resnet.children())[:-1])
        in_dim = resnet.fc.in_features
        self.proj = nn.Linear(in_dim, embed_dim)

    def forward(self, x):
        """Encode a batch of images into unit-norm embeddings of size ``embed_dim``."""
        features = self.backbone(x)
        # Collapse the trailing 1x1 spatial dims left by the pooling layer
        # while always keeping the batch dimension.
        features = torch.flatten(features, 1)
        z = self.proj(features)
        # Convert to unit vectors for cosine similarity later. F.normalize
        # clamps the norm with a small eps, so an all-zero projection yields
        # zeros instead of NaN.
        return F.normalize(z, dim=-1)


In [None]:
 
# Initialize the model and place it on the device selected earlier, so its
# parameters live on the same device as any batches moved there.
image_encoder = Resnext50().to(device)
# Switch model to the training mode (the default for a new module; kept
# explicit so re-running this cell after an eval() call restores it).
image_encoder.train()

### Contrastive Loss

In [None]:
# Symmetric contrastive (CLIP-style) loss, written as runnable PyTorch.
#
# Notation:
#   image_emb [n, d_e] - image embeddings from the image encoder (already projected)
#   text_emb  [n, d_e] - text embeddings from the text encoder (already projected)
#   log_temperature    - learned scalar t; logits are multiplied by exp(t)
# Row i of image_emb and row i of text_emb must describe the same sample.
def clip_contrastive_loss(image_emb, text_emb, log_temperature=0.0):
    """Compute the symmetric cross-entropy loss over pairwise cosine similarities.

    Args:
        image_emb: Float tensor of shape ``[n, d_e]``.
        text_emb: Float tensor of shape ``[n, d_e]``, aligned row-for-row with
            ``image_emb``.
        log_temperature: Python float or scalar tensor/parameter ``t``; the
            similarity matrix is scaled by ``exp(t)``.

    Returns:
        Scalar tensor: the mean of the image-to-text and text-to-image
        cross-entropy losses, where the matching pair sits on the diagonal.

    Raises:
        ValueError: If the inputs are not 2-D, differ in shape, or are empty.

    Example:
        loss = clip_contrastive_loss(image_encoder(images), text_encoder(texts), t)
    """
    if image_emb.dim() != 2 or image_emb.shape != text_emb.shape:
        raise ValueError(
            "image_emb and text_emb must both have shape [n, d_e]; got "
            f"{tuple(image_emb.shape)} and {tuple(text_emb.shape)}"
        )
    if image_emb.size(0) == 0:
        raise ValueError("contrastive loss needs at least one image/text pair")

    # L2-normalize so the dot products below are cosine similarities. This is a
    # no-op for encoders that already return unit vectors.
    image_emb = F.normalize(image_emb, dim=-1)
    text_emb = F.normalize(text_emb, dim=-1)

    # as_tensor keeps a learnable temperature attached to the autograd graph.
    scale = torch.exp(
        torch.as_tensor(log_temperature, dtype=image_emb.dtype, device=image_emb.device)
    )
    # Scaled pairwise cosine similarities [n, n]; entry (i, j) compares image i with text j.
    logits = image_emb @ text_emb.t() * scale

    # The correct match for row/column i is index i.
    targets = torch.arange(logits.size(0), device=logits.device)
    loss_image_to_text = F.cross_entropy(logits, targets)
    loss_text_to_image = F.cross_entropy(logits.t(), targets)
    return (loss_image_to_text + loss_text_to_image) / 2

### GAN

In [None]:
class TextCondGenerator(nn.Module):
    """Text-conditioned generator built from stride-2 transposed convolutions.

    A noise vector and a text embedding are concatenated, mapped by a linear
    layer to a 512-channel 4x4 feature map, and then upsampled by a factor of
    two per block until the map is ``img_size`` x ``img_size``.

    Args:
        z_dim: Size of the noise vector.
        t_dim: Size of the text embedding.
        img_size: Height and width of the generated image. Must be an integer
            power of two and at least 8. ``img_size=64`` builds the 4-block
            stack (512-256-128-64-1 channels).

    Raises:
        ValueError: If ``img_size`` is not an integer power of two >= 8.
    """

    def __init__(self, z_dim=128, t_dim=256, img_size=256):
        super().__init__()
        if not isinstance(img_size, int) or img_size < 8 or img_size & (img_size - 1):
            raise ValueError(
                f"img_size must be an integer power of two >= 8, got {img_size!r}"
            )
        self.img_size = img_size
        self.base_channels = 512
        self.fc = nn.Linear(z_dim + t_dim, self.base_channels * 4 * 4)

        # ConvTranspose2d(kernel=4, stride=2, padding=1) doubles H and W, so
        # reaching img_size from the 4x4 seed takes log2(img_size) - 2 blocks.
        num_upsamples = img_size.bit_length() - 3

        layers = []
        in_ch = self.base_channels
        # Every block but the last halves the channel count (never below 16).
        for _ in range(num_upsamples - 1):
            out_ch = max(in_ch // 2, 16)
            layers += [
                nn.ConvTranspose2d(in_ch, out_ch, 4, 2, 1),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(True),
            ]
            in_ch = out_ch
        # Final block maps to a single-channel image.
        layers += [
            nn.ConvTranspose2d(in_ch, 1, 4, 2, 1),
            nn.Tanh(),     # outputs in [-1,1]
        ]
        self.main = nn.Sequential(*layers)

    def forward(self, z_noise, z_text):
        """Generate images of shape ``[batch, 1, img_size, img_size]`` in [-1, 1].

        Args:
            z_noise: Tensor of shape ``[batch, z_dim]``.
            z_text: Tensor of shape ``[batch, t_dim]``.
        """
        z = torch.cat([z_noise, z_text], dim=-1)
        x = self.fc(z)
        # Reshape the flat projection into the 4x4 seed feature map.
        x = x.view(x.size(0), self.base_channels, 4, 4)
        return self.main(x)
