In [11]:
import json, os, random
from tqdm import tqdm
from PIL import Image

import torch
from torch.utils.data import DataLoader
import torchvision.transforms as transforms

from pytorch_lightning import LightningDataModule
from transformers import GPT2Tokenizer, AutoFeatureExtractor
import utils

In [22]:
visual_backbone = "microsoft/swin-base-patch4-window7-224-in22k"
lm_backbone = "distilgpt2"
train_anno_dir = "/media/storage/coco/VQA-X/annotated/vqaX_val.json"
image_dir = "/media/storage/coco/"
fewshot_num = 1.0
mode = "teacher"
seed = 42
max_seq_length = 40


In [13]:
img_transform = AutoFeatureExtractor.from_pretrained(visual_backbone)
tokenizer = GPT2Tokenizer.from_pretrained(lm_backbone)
num_new_tokens = tokenizer.add_special_tokens({'pad_token': '<pad>','additional_special_tokens': ['<question>', '<answer>', '<explanation>']})

In [23]:
     
data = json.load(open(train_anno_dir, 'r'))
ids_list = list(data.keys())

for k,v in data.items():   
    if len(v['explanation']) > 1:   # some questions have more than one explanation
        # duplicate them for loading. -1 because one explanation is already in ids_list
        ids_list += [str(k)] * (len(v['explanation']) - 1)    

index_tracker = {k: len(v['explanation']) - 1 for k,v in data.items()}

quention_id = ids_list[0]
sample = data[quention_id]
img_name = sample['image_name']
text_a = utils.proc_ques(sample['question'])    # question
answer = utils.proc_ans(sample['answers'])

exp_idx = index_tracker[quention_id]    # the index of the explanation for questions with multiple explanations
if exp_idx > 0:
    index_tracker[quention_id] -= 1    # decrease usage
        
text_b = sample['explanation'][exp_idx]   # explanation

# tokenization process
q_segment_id, a_segment_id, e_segment_id = tokenizer.convert_tokens_to_ids(['<question>', 
                                                                                    '<answer>', 
                                                                                    '<explanation>'])
tokens = tokenizer.tokenize(text_a)
labels = [-100] * len(tokens)   # we dont want to predict the question, set to pad to ignore in XE
segment_ids = [q_segment_id] * len(tokens)

answer = [tokenizer.bos_token] + tokenizer.tokenize(" the answer is " + answer)
answer_len = len(answer)
tokens_b = tokenizer.tokenize(" because " + text_b) + [tokenizer.eos_token]
exp_len = len(tokens_b)
tokens += answer + tokens_b
labels += [-100] + answer[1:] + tokens_b   # labels will be shifted in the model, so for now set them same as tokens
segment_ids += [a_segment_id] * answer_len
segment_ids += [e_segment_id] * exp_len

if len(tokens) > max_seq_length :
    tokens = tokens[:max_seq_length]
    labels = labels[:max_seq_length]
    segment_ids = segment_ids[:max_seq_length]


assert len(tokens) == len(segment_ids) 
assert len(tokens) == len(labels)

seq_len = len(tokens)
padding_len = max_seq_length - seq_len
tokens = tokens + ([tokenizer.pad_token] * padding_len)
labels = labels + ([-100] * padding_len)

segment_ids += ([e_segment_id] * padding_len)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(input_ids, dtype=torch.long)

labels = [tokenizer.convert_tokens_to_ids(t) if t!=-100 else t for t in labels]
labels = torch.tensor(labels, dtype=torch.long)

segment_ids = torch.tensor(segment_ids, dtype=torch.long)
qid = torch.LongTensor([int(quention_id)])


In [42]:

test_anno_dir = "/media/storage/coco/VQA-X/annotated/vqaX_test.json"
data = json.load(open(test_anno_dir, 'r'))
ids_list = list(data.keys())
quention_id = ids_list[0]
sample = data[quention_id]
img_name = sample['image_name']
text_a = utils.proc_ques(sample['question'])    # question

# tokenization process
q_segment_id, a_segment_id, e_segment_id = tokenizer.convert_tokens_to_ids(['<question>', '<answer>', '<explanation>'])
tokens = tokenizer.tokenize(text_a)
segment_ids = [q_segment_id] * len(tokens)

answer = [tokenizer.bos_token] + tokenizer.tokenize(" the answer is")
answer_len = len(answer)
tokens += answer 

segment_ids += [a_segment_id] * answer_len

input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(input_ids, dtype=torch.long)
segment_ids = torch.tensor(segment_ids, dtype=torch.long)

In [43]:
tokenizer.decode(input_ids)

'what is this<|endoftext|> the answer is'

In [50]:
tokenizer.decode(segment_ids)

'<question> <question> <question> <answer> <answer> <answer> <answer>'

In [59]:
tokenizer.decode(lb_mine2)

' the answer is yes because the same snowboarder is present many times<|endoftext|>'

In [60]:
cached_filename = f"vqax_shot-500_teacher_pseudo_seed-42.cache"
datasets = torch.load(os.path.join("cached", cached_filename))

In [63]:
tokenizer.decode(datasets[0]["segment_ids"])

'<question> <question> <question> <answer> <answer> <answer> <answer> <answer> <answer> <explanation> <explanation> <explanation>'

In [64]:
tokenizer.decode(datasets[0]["input_ids"])

'what is this the answer is shower<|endoftext|> because '

In [57]:
lb_mine = [lb  for lb in datasets[0]["labels"] if lb != -100]
lb_mine2 = [lb  for lb in labels if lb != -100]

In [58]:
tokenizer.decode(lb_mine)

'<|endoftext|> because the same snowboarder is present many times<|endoftext|>'

In [9]:
tokenizer.convert_ids_to_tokens(220)

'Ġ'

In [10]:
tokenizer.convert_tokens_to_ids('Ġbecause')

780

In [37]:
anno = json.load(open(train_anno_dir, "r"))

In [5]:
tokenizer.bos_token

'<|endoftext|>'

In [38]:
dataset = {}
dataset["train"] = get_dataset(train_anno, mode="train")

NameError: name 'get_dataset' is not defined

In [39]:
cached_filename = f"vqax_shot-{fewshot_num}_{mode}_seed-{seed}.cache"

In [50]:
ids_list = list(anno.keys())
image_id = ids_list[0]
index_tracker = {k: len(v['explanation']) - 1 for k,v in anno.items()}
for k,v in anno.items():   
    if len(v['explanation']) > 1:   # some questions have more than one explanation 
        ids_list += [str(k)] * (len(v['explanation']) - 1) # duplicate them for loading. -1 because one explanation is already in ids_list

# Set image directory
img_dir = image_dir + "/val2014/"
datasets = []
question_id = ids_list[0]
sample = anno[question_id]
img_name = sample['image_name']

question_txt = utils.proc_ques(sample['question'])    # question
answer_txt = utils.proc_ans(sample['answers'])
exp_idx = index_tracker[question_id]
explain_txt = sample['explanation'][exp_idx]

In [51]:
if exp_idx > 0:
    index_tracker[question_id] -= 1    # decrease usage
# Image    
img_path = img_dir + img_name
img = img_transform(Image.open(img_path).convert("RGB"), return_tensors="pt").pixel_values

In [77]:

student_input = f"{question_txt}"
student_label = f"the answer is {answer_txt} because {explain_txt}"
# question: [Q] reason: [E] -> the answer is [A]
teacher_input = f"{question_txt} the answer is {answer_txt}"
teacher_label = f"because {explain_txt}"

In [53]:
q_segment_id, a_segment_id, e_segment_id = tokenizer.convert_tokens_to_ids(['<question>', '<answer>', '<explanation>'])

In [54]:
q_segment_id

50258

In [78]:
# student
student_input = tokenizer.tokenize(student_input)
labels2 = [-100] * len(student_input)   # we dont want to predict the question, set to pad to ignore in XE
segment_ids = [q_segment_id] * len(student_input)

answer = [tokenizer.bos_token] + tokenizer.tokenize(" the answer is " + answer_txt)
answer_len = len(answer)
explanation = tokenizer.tokenize(" because " + explain_txt) + [tokenizer.eos_token]
exp_len = len(explanation)
output = answer + explanation
student_input += output
labels2 += [-100] + output[1:]  # labels will be shifted in the model, so for now set them same as tokens
segment_ids += [a_segment_id] * answer_len
segment_ids += [e_segment_id] * exp_len

In [79]:
labels1 == labels2

True

In [66]:
# student
question = tokenizer.tokenize(f"{question_txt} ")
q_len = len(question)
answer = tokenizer.tokenize(f"the answer is {answer_txt}")
a_len = len(answer)

student_input = question + answer
labels = [-100] * (q_len + a_len)   # we dont want to predict the question, set to pad to ignore in XE
segment_ids = [q_segment_id] *q_len + [a_segment_id] * a_len

explanation = [tokenizer.bos_token] + tokenizer.tokenize(" because " + explain_txt) + [tokenizer.eos_token]
exp_len = len(explanation)


student_input += explanation
labels += [-100] + explanation[1:]   # labels will be shifted in the model, so for now set them same as tokens
segment_ids += [e_segment_id] * exp_len

In [82]:
datasets = []
stage = "student"

question_token = tokenizer.tokenize(f"{question_txt}")
q_len = len(question)
answer_token = tokenizer.tokenize(f"the answer is {answer_txt}")
a_len = len(answer)
explanation_token = [tokenizer.bos_token] + tokenizer.tokenize(" because " + explain_txt) + [tokenizer.eos_token]
exp_len = len(explanation)

if stage == "student":
    input = question_token
    answer = [tokenizer.bos_token] + answer_token
    explanation = explanation_token + [tokenizer.eos_token]
    segment_ids = [q_segment_id] * len(question) + [a_segment_id] * len(answer) + [e_segment_id] * len(explanation)
    
elif stage == "teacher":
    input = question_token + answer_token
    output = [tokenizer.bos_token] + explanation_token + [tokenizer.eos_token]
    segment_ids = [q_segment_id] * len(question) + [a_segment_id] * len(answer) + [e_segment_id] * len(output)
    
    
else:
    raise NotImplementedError

input += output
labels = [-100] * len(input) + [-100] + output[1:] # labels will be shifted in the model, so for now set them same as tokens


# # paddding
# seq_len = len(input)
# padding_len = max_seq_length - seq_len
# input = input + ([tokenizer.pad_token] * padding_len)
# labels = labels + ([-100] * padding_len)
# segment_ids += ([e_segment_id] * padding_len)

# token -> ids
input_ids = tokenizer.convert_tokens_to_ids(input)

input_ids = torch.tensor(input_ids, dtype=torch.long)

labels = [tokenizer.convert_tokens_to_ids(t) if t!=-100 else t for t in labels]
labels = torch.tensor(labels, dtype=torch.long)

segment_ids = torch.tensor(segment_ids, dtype=torch.long)

# image
if mode=="train":
    img_dir = image_dir + "/train2014/"
else:
    img_dir = image_dir + "val2014/"
img = Image.open(img_dir+img_name).convert('RGB')
img = img_transform(img)
qid = torch.LongTensor([int(image_id)])

datasets.append((input_ids, labels, segment_ids, img))    


In [None]:
elif mode == "adaptation":
    def collate_wrapper(batch):
        batch = list(zip(*batch))
        sample = {}
        # vis_rep_len = self.cfg.vis_rep_len
        input_max_len = max([x.size(0) for x in batch[0]])
        label_max_len  = max([x.size(0) for x in batch[1]])
        seg_max_len = max([x.size(0) for x in batch[2]])
        
        # input id & attention mask
        inputs_id = torch.zeros((len(batch[0]), input_max_len), dtype=torch.long)
        attn_mask = torch.zeros((len(batch[0]), input_max_len), dtype=torch.long)
        segment_id = torch.zeros((len(batch[2]), seg_max_len), dtype=torch.long)
        for i, x in enumerate(batch[0]):
            inputs_id[i,:x.size(0)] = x
            attn_mask[i,:x.size(0)] = 1.0
        
        # Segment id
        for i, x in enumerate(batch[2]):
            segment_id[i,:x.size(0)] = x
        
        # label
        label = torch.zeros((len(batch[1]), label_max_len), dtype=torch.long)
        for i, x in enumerate(batch[1]):
            label[i,:x.size(0)] = x

        sample["inputs_id"] = inputs_id
        sample["attn_mask"] = attn_mask
        sample["label"] = label
        sample["img"] = batch[3]

        return sample


In [83]:
datasets

[(tensor([22437,   428,  1656,   284,   307,   257,  4590,   286,  3294, 32185,
            286,   262,  2042, 38186,  6729,  3526,   263, 50256,   262,  3280,
            318,  3763,   780,   262,   976,  6729,  3526,   263,   318,  1944,
            867,  1661, 50256]),
  tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
           -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
           -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
           -100,  -100,  -100,  -100,   262,  3280,   318,  3763,   780,   262,
            976,  6729,  3526,   263,   318,  1944,   867,  1661, 50256]),
  tensor([50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
          50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50259, 50259,
          50259, 50259, 50259, 50260, 50260, 50260, 50260, 50260, 50260, 50260,
          50260, 50260, 50260, 50260, 50260, 50260]),
  {'pixel_values': [array([[[-0.919171

In [56]:
tokenizer.bos_token

'<|endoftext|>'

In [57]:
# paddding
seq_len = len(student_input)
padding_len = max_seq_length - seq_len
student_input = student_input + ([tokenizer.pad_token] * padding_len)
labels = labels + ([-100] * padding_len)
segment_ids += ([e_segment_id] * padding_len)

# token -> ids
input_ids = tokenizer.convert_tokens_to_ids(student_input)

input_ids = torch.tensor(input_ids, dtype=torch.long)

labels = [tokenizer.convert_tokens_to_ids(t) if t!=-100 else t for t in labels]
labels = torch.tensor(labels, dtype=torch.long)

segment_ids = torch.tensor(segment_ids, dtype=torch.long)

In [62]:
# image
if mode=="train":
    img_dir = image_dir + "/train2014/"
else:
    img_dir = image_dir + "val2014/"
img = Image.open(img_dir+img_name).convert('RGB')
img = img_transform(img)
qid = torch.LongTensor([int(image_id)])

In [73]:
def split_dataset(data_lst,num):
    return [data_lst[i: i+num] for i in range(0, len(data_lst), num)]

In [74]:
smp = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]

In [76]:
sample = split_dataset(smp,len(smp)//3)
sample

[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15], [16]]

In [2]:
import json
all_file = json.load(open("/media/storage/coco/VQA-X/annotated/vqaX_test.json", 'r'))

In [8]:
all_file["262284001"]

{'question': 'What is this?',
 'answers': [{'answer': 'shower', 'answer_confidence': 'yes', 'answer_id': 1},
  {'answer': 'shower', 'answer_confidence': 'yes', 'answer_id': 2},
  {'answer': 'shower', 'answer_confidence': 'yes', 'answer_id': 3},
  {'answer': 'shower', 'answer_confidence': 'yes', 'answer_id': 4},
  {'answer': 'shower', 'answer_confidence': 'maybe', 'answer_id': 5},
  {'answer': 'shower', 'answer_confidence': 'yes', 'answer_id': 6},
  {'answer': 'shower', 'answer_confidence': 'yes', 'answer_id': 7},
  {'answer': 'shower', 'answer_confidence': 'yes', 'answer_id': 8},
  {'answer': 'shower', 'answer_confidence': 'yes', 'answer_id': 9},
  {'answer': 'shower', 'answer_confidence': 'yes', 'answer_id': 10}],
 'image_id': '262284',
 'image_name': 'COCO_val2014_000000262284.jpg',
 'explanation': ['it has a shower head hanging inside of it',
  'there is a shower head',
  'there is a faucet and a bathtub']}

In [10]:
asdf = ["123414"]
a = [asdf]
a

[['123414']]

In [1]:
a  = set([1,2,3,4,4,5])
b= set([3,4,4,5,6])
a&b

{3, 4, 5}

In [None]:
question_token = self.tokenizer.tokenize(f"{question}")
answer_token = self.tokenizer.tokenize(answer)
explanation_token = [self.tokenizer.bos_token] + self.tokenizer.tokenize(f" because")
input = question_token + answer_token + explanation_token

In [3]:
import torch
import numpy as np
sample = np.array("asdfasdfdasfsdfsdfdasfa")
torch.Tensor(sample)

TypeError: can't convert np.ndarray of type numpy.str_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.