In [7]:
import torch
from transformers import BertTokenizer
from transformers import VisualBertForQuestionAnswering

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


encoder = VisualBertForQuestionAnswering.from_pretrained('uclanlp/visualbert-vqa')
encoder = encoder.visual_bert

print(encoder)

inputs = tokenizer("what is the defective tissue?", return_tensors="pt", padding="max_length", max_length=15)
labels = tokenizer("it is kidney", return_tensors="pt").input_ids

visual_embeds = torch.rand(1, 1, 2048)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

# print(visual_embeds.shape)
# print(visual_token_type_ids.shape)
# print(visual_token_type_ids.shape)

inputs.update({
"visual_embeds": visual_embeds,
"visual_token_type_ids": visual_token_type_ids,
"visual_attention_mask": visual_attention_mask
})

print(inputs)
# inputs["attention_mask"] = None

outputs = encoder(**inputs)

from transformers.models.bert.modeling_bert import BertLMPredictionHead
predict = BertLMPredictionHead(encoder.config)
outputs = predict(outputs['last_hidden_state'])
print(outputs.shape)

VisualBertModel(
  (embeddings): VisualBertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=1)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (visual_token_type_embeddings): Embedding(2, 768)
    (visual_position_embeddings): Embedding(512, 768)
    (visual_projection): Linear(in_features=2048, out_features=768, bias=True)
  )
  (encoder): VisualBertEncoder(
    (layer): ModuleList(
      (0): VisualBertLayer(
        (attention): VisualBertAttention(
          (self): VisualBertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): VisualBer

torch.Size([1, 16, 30522])


In [1]:
import os
import glob
from PIL import Image

from torch.utils.data import Dataset
import torchvision.transforms as transforms


class SurgicalSentenceVQADataset(Dataset):
    def __init__(self, seq, folder_head, folder_tail, transform=None):
        
        self.transform = transform
        
        # files, question and answers
        filenames = []
        for curr_seq in seq: filenames = filenames + glob.glob(folder_head + str(curr_seq) + folder_tail)
        self.vqas = []
        for file in filenames:
            file_data = open(file, "r")
            lines = [line.strip("\n") for line in file_data if line != "\n"]
            file_data.close()
            for line in lines: self.vqas.append([file, line])
        print('Total files: %d | Total question: %.d' %(len(filenames), len(self.vqas)))
        
    def __len__(self):
        return len(self.vqas)

    def __getitem__(self, idx):
        
        # img
        loc = self.vqas[idx][0].split('/')
        img_loc = os.path.join(loc[0],loc[1],loc[2], 'left_frames',loc[-1].split('_')[0]+'.png')
        img = Image.open(img_loc)
        if self.transform: img = self.transform(img)
            
        # question and answer
        question = self.vqas[idx][1].split('|')[0]
        label = self.vqas[idx][1].split('|')[1]

        return img, question, label

In [2]:
import torch
import torch.nn as nn
from torchvision import models
from transformers import VisualBertForQuestionAnswering
from transformers.models.bert.modeling_bert import BertLMPredictionHead

class Surgical_VQA(nn.Module):
    def __init__(self, num_classes=12):
        super(Surgical_VQA, self).__init__()
        
        # visual feature extraction
        self.img_feature_extractor = models.resnet50(pretrained=True)
        new_fc = nn.Sequential(*list(self.img_feature_extractor.fc.children())[:-1])
        self.img_feature_extractor.fc = new_fc
        
        # visual + caption feature extractor
        vis_transformer = VisualBertForQuestionAnswering.from_pretrained('uclanlp/visualbert-vqa')
        self.transformer_encoder = vis_transformer.visual_bert

        #classifier
        self.predict_sentence = BertLMPredictionHead(self.transformer_encoder.config)

    def forward(self, img, inputs):
        
        visual_embeds = self.img_feature_extractor(img)
        visual_embeds = torch.unsqueeze(visual_embeds, 1)
        visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long).cuda()
        visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float).cuda()

        inputs.update({
        "visual_embeds": visual_embeds,
        "visual_token_type_ids": visual_token_type_ids,
        "visual_attention_mask": visual_attention_mask
        })
        
        inputs['input_ids'] = inputs['input_ids'].cuda()
        inputs['token_type_ids'] = inputs['token_type_ids'].cuda()
        inputs['attention_mask'] = inputs['attention_mask'].cuda()
        inputs['visual_token_type_ids'] = inputs['visual_token_type_ids'].cuda()
        inputs['visual_attention_mask'] = inputs['visual_attention_mask'].cuda()
        
        out = self.transformer_encoder(**inputs)
        out = self.predict_sentence(out['last_hidden_state'])

        return out

In [3]:
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score

def calc_acc(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    return acc

def calc_classwise_acc(y_true, y_pred):
    matrix = confusion_matrix(y_true, y_pred)
    classwise_acc = matrix.diagonal()/matrix.sum(axis=1)
    return classwise_acc

def calc_map(y_true, y_scores):
    mAP = average_precision_score(y_true, y_scores,average=None)
    return mAP

In [4]:
import torch.nn.functional as F

def test_model(epoch, model, valid_dataloader, tokenizer):
    
    model.eval()

    total_loss = 0.0    
    label_true = None
    label_pred = None
    label_score = None
    
    criterion = nn.CrossEntropyLoss()
    
    with torch.no_grad():
        for i, (imgs, q, l) in enumerate(valid_dataloader, 0):
            questions = []
            labels = []
            for question in q: questions.append(question)
            for label in l: labels.append(label)
            
            inputs = tokenizer(questions, return_tensors="pt", truncation=True, padding="max_length", max_length=15)
            GT_labels = tokenizer(labels, return_tensors="pt", truncation=True, padding="max_length", max_length=16).input_ids
            
            imgs, GT_labels = imgs.cuda(), GT_labels.cuda()
            
            outputs = model(imgs, inputs)

            loss = criterion(torch.flatten(outputs, start_dim=0, end_dim = 1), torch.flatten(GT_labels))
            total_loss += loss.item()
        
            scores, predicted = torch.max(F.softmax(torch.flatten(outputs, start_dim=0, end_dim = 1), dim=1).data, 1)    
            label_true = torch.flatten(GT_labels).data.cpu() if label_true == None else torch.cat((label_true, torch.flatten(GT_labels).data.cpu()), 0)
            label_pred = predicted.data.cpu() if label_pred == None else torch.cat((label_pred, predicted.data.cpu()), 0)
            label_score = scores.data.cpu() if label_score == None else torch.cat((label_score, scores.data.cpu()), 0)

            
    acc, c_acc, mAP = calc_acc(label_true, label_pred), calc_classwise_acc(label_true, label_pred), 0.0#calc_map(label_true, label_score)

    print('Test: epoch: %d loss: %.6f | Acc: %.6f | mAP: %.6f' %(epoch, total_loss, acc, mAP))
#     print(c_acc)
    
    return (acc, c_acc, mAP)

In [5]:
from torch import optim
def train_model(epoch, model, train_dataloader, lr, tokenizer):  # train model
    
    model.train()
    
    total_loss = 0.0    
    label_true = None
    label_pred = None
    label_score = None
    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr = lr, weight_decay = 0)
    
    for i, (imgs, q, l) in enumerate(train_dataloader,0):
        questions = []
        labels = []
        for question in q: questions.append(question)
        for label in l: labels.append(label)
            
        inputs = tokenizer(questions, return_tensors="pt", truncation=True, padding="max_length", max_length=15)
        GT_labels = tokenizer(labels, return_tensors="pt", truncation=True, padding="max_length", max_length=16).input_ids
            
        imgs, GT_labels = imgs.cuda(), GT_labels.cuda()
        
        optimizer.zero_grad()
            
        outputs = model(imgs, inputs)
        
#         print(outputs.shape, GT_labels.shape)
        # zero the parameter gradients

        loss = criterion(torch.flatten(outputs, start_dim=0, end_dim = 1), torch.flatten(GT_labels))
#         loss = criterion(outputs, GT_labels)
        loss.backward()
        optimizer.step()

        # print statistics
        total_loss += loss.item()
        
        scores, predicted = torch.max(F.softmax(torch.flatten(outputs, start_dim=0, end_dim = 1), dim=1).data, 1)    
        label_true = torch.flatten(GT_labels).data.cpu() if label_true == None else torch.cat((label_true, torch.flatten(GT_labels).data.cpu()), 0)
        label_pred = predicted.data.cpu() if label_pred == None else torch.cat((label_pred, predicted.data.cpu()), 0)
        label_score = scores.data.cpu() if label_score == None else torch.cat((label_score, scores.data.cpu()), 0)

    
    # loss and acc
    acc, c_acc, mAP = calc_acc(label_true, label_pred), calc_classwise_acc(label_true, label_pred), 0.0#calc_map(label_true, label_score)

    print('Train: epoch: %d loss: %.6f | Acc: %.6f | mAP: %.6f' %(epoch, total_loss, acc, mAP))
    return

In [6]:
import os
import torch

from torchvision import transforms
from torch.utils.data import DataLoader    
from transformers import BertTokenizer

os.environ["CUDA_VISIBLE_DEVICES"]="2"

def seed_everything(seed=27):
    '''
    Set random seed for reproducible experiments
    Inputs: seed number 
    '''
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
if __name__ == "__main__":
     
    # Set random seed
    seed_everything()  
    
    # Device Count
    num_gpu = torch.cuda.device_count()
    
    # hyperparameters
    bs = 20
    epochs = 150
    lr = 0.00001
    
    checkpoint_dir = 'checkpoints/v5/complex/'
    
    # train and test dataloader
    train_seq = [2]#, 3, 4, 6, 7, 9, 10, 11, 12, 14, 15]
    val_seq = [1, 5, 16]
    folder_head = 'dataset/instruments18/seq_'
    folder_tail = '/vqa/complex/*.txt'

    labels = ['kidney',
          'Idle', 'Grasping', 'Retraction', 'Tissue_Manipulation',
          'Tool_Manipulation', 'Cutting', 'Cauterization', 'Suction', 
          'Looping', 'Suturing', 'Clipping', 'Staple', 'Ultrasound_Sensing',
          'left-top', 'right-top', 'left-bottom', 'right-bottom']

    transform = transforms.Compose([
                transforms.Resize((300,256)),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225])
                ])

    # train_dataset
    train_dataset = SurgicalSentenceVQADataset(train_seq, folder_head, folder_tail, transform=transform)
    train_dataloader = DataLoader(dataset=train_dataset, batch_size= bs, shuffle=True)

    # Val_dataset
    val_dataset = SurgicalSentenceVQADataset(val_seq, folder_head, folder_tail, transform=transform)
    val_dataloader = DataLoader(dataset=val_dataset, batch_size= bs, shuffle=False)
    
    # model
    model = Surgical_VQA(num_classes=len(labels)).cuda()
    
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    
    best_epoch = [0]
    best_results = [0.0]
    
    for epoch in range(1, epochs):
        train_model(epoch, model, train_dataloader, lr, tokenizer )
        test_acc, test_c_acc, mAP = test_model(epoch, model, train_dataloader, tokenizer )
    
        if test_acc >= best_results[0]:
            best_results[0] = test_acc
            best_epoch[0] = epoch
        
        print('Best epoch: %d | Best acc: %.6f' %(best_epoch[0], best_results[0]))
        checkpoint = {'lr': lr, 'b_s': bs, 'state_dict': model.state_dict() }
        save_name = "checkpoint_" + str(epoch) + '_epoch.pth'
        
        torch.save(checkpoint, os.path.join(checkpoint_dir, save_name))

Total files: 137 | Total question: 1046
Total files: 447 | Total question: 3216


  classwise_acc = matrix.diagonal()/matrix.sum(axis=1)


Train: epoch: 1 loss: 415.087739 | Acc: 0.221080 | mAP: 0.000000
Test: epoch: 1 loss: 284.009027 | Acc: 0.401231 | mAP: 0.000000
Best epoch: 1 | Best acc: 0.401231
Train: epoch: 2 loss: 238.786792 | Acc: 0.593212 | mAP: 0.000000
Test: epoch: 2 loss: 137.994969 | Acc: 0.795351 | mAP: 0.000000
Best epoch: 2 | Best acc: 0.795351
Train: epoch: 3 loss: 130.141966 | Acc: 0.795411 | mAP: 0.000000
Test: epoch: 3 loss: 68.830898 | Acc: 0.868786 | mAP: 0.000000
Best epoch: 3 | Best acc: 0.868786
Train: epoch: 4 loss: 71.501628 | Acc: 0.873865 | mAP: 0.000000
Test: epoch: 4 loss: 37.233835 | Acc: 0.904159 | mAP: 0.000000
Best epoch: 4 | Best acc: 0.904159
Train: epoch: 5 loss: 41.238166 | Acc: 0.913838 | mAP: 0.000000
Test: epoch: 5 loss: 22.294852 | Acc: 0.937082 | mAP: 0.000000
Best epoch: 5 | Best acc: 0.937082
Train: epoch: 6 loss: 26.251845 | Acc: 0.935050 | mAP: 0.000000
Test: epoch: 6 loss: 14.522917 | Acc: 0.945686 | mAP: 0.000000
Best epoch: 6 | Best acc: 0.945686
Train: epoch: 7 loss: 1

Test: epoch: 51 loss: 0.222030 | Acc: 0.998805 | mAP: 0.000000
Best epoch: 51 | Best acc: 0.998805
Train: epoch: 52 loss: 0.655877 | Acc: 0.997371 | mAP: 0.000000
Test: epoch: 52 loss: 0.333580 | Acc: 0.998446 | mAP: 0.000000
Best epoch: 51 | Best acc: 0.998805
Train: epoch: 53 loss: 1.123773 | Acc: 0.995698 | mAP: 0.000000
Test: epoch: 53 loss: 0.972235 | Acc: 0.997909 | mAP: 0.000000
Best epoch: 51 | Best acc: 0.998805
Train: epoch: 54 loss: 0.764901 | Acc: 0.997311 | mAP: 0.000000
Test: epoch: 54 loss: 0.215033 | Acc: 0.998865 | mAP: 0.000000
Best epoch: 54 | Best acc: 0.998865
Train: epoch: 55 loss: 0.446330 | Acc: 0.997670 | mAP: 0.000000
Test: epoch: 55 loss: 0.226822 | Acc: 0.998924 | mAP: 0.000000
Best epoch: 55 | Best acc: 0.998924
Train: epoch: 56 loss: 1.082994 | Acc: 0.997012 | mAP: 0.000000
Test: epoch: 56 loss: 0.221004 | Acc: 0.998805 | mAP: 0.000000
Best epoch: 55 | Best acc: 0.998924
Train: epoch: 57 loss: 0.449468 | Acc: 0.998088 | mAP: 0.000000
Test: epoch: 57 loss: 

Train: epoch: 102 loss: 0.180317 | Acc: 0.999223 | mAP: 0.000000
Test: epoch: 102 loss: 0.126083 | Acc: 0.999701 | mAP: 0.000000
Best epoch: 99 | Best acc: 1.000000
Train: epoch: 103 loss: 0.119196 | Acc: 0.999641 | mAP: 0.000000
Test: epoch: 103 loss: 0.000475 | Acc: 1.000000 | mAP: 0.000000
Best epoch: 103 | Best acc: 1.000000
Train: epoch: 104 loss: 0.202668 | Acc: 0.999522 | mAP: 0.000000
Test: epoch: 104 loss: 0.060300 | Acc: 0.999641 | mAP: 0.000000
Best epoch: 103 | Best acc: 1.000000
Train: epoch: 105 loss: 0.108226 | Acc: 0.999283 | mAP: 0.000000
Test: epoch: 105 loss: 0.001481 | Acc: 1.000000 | mAP: 0.000000
Best epoch: 105 | Best acc: 1.000000
Train: epoch: 106 loss: 0.077028 | Acc: 0.999641 | mAP: 0.000000
Test: epoch: 106 loss: 0.002415 | Acc: 1.000000 | mAP: 0.000000
Best epoch: 106 | Best acc: 1.000000
Train: epoch: 107 loss: 0.466938 | Acc: 0.999283 | mAP: 0.000000
Test: epoch: 107 loss: 0.000986 | Acc: 1.000000 | mAP: 0.000000
Best epoch: 107 | Best acc: 1.000000
Train