In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, AutoModel

In [3]:
import timm
import torch

In [3]:
class FrozenModel(torch.nn.Module):
    def __init__(self, vision_model, nlp_model):
        super().__init__()
        self.lm = nlp_model
        # self.lm.requires_grad=False
        for name, param in self.lm.named_parameters():
            # if name.split('.')[0] == "lm":
            param.requires_grad = False
        self.v_encoder = vision_model
        
    
    def forward(self, img, tokens, **kwargs):
        vis_embed = self.v_encoder(img)
        vis_embed_shape = vis_embed.size()
        vis_embed = vis_embed.reshape([vis_embed_shape[0], 2, int(vis_embed_shape[-1]/2)])
        # print(vis_embed)
        
        input_ids = tokens["input_ids"]
        
        if "Model" in type(self.lm).__name__ and "Head" not in type(self.lm).__name__:
            nlp_embed = self.lm.wte(input_ids)
        else:
            nlp_embed = self.lm.transformer.wte(input_ids)

        inputs = {k: v for k,v in tokens.items() if k != "input_ids"}
        inputs["inputs_embeds"] = torch.cat([vis_embed, nlp_embed], 1)
        inputs["attention_mask"] = torch.cat([torch.ones(vis_embed_shape[0], 2), tokens["attention_mask"]], 1)

        lm_output = self.lm(**inputs, **kwargs)
        
        return lm_output
    
    # def backward(self, **kwargs):
    #     super().backward(**kwargs)
    #     for name, param in frozen.named_parameters():
    #         if name.split('.')[0] == "lm":
    #             param.grad = None
    
    @classmethod
    def from_pretrained(cls, hface_path: str, pretrained_vision: bool=False):
        lm_config = AutoConfig.from_pretrained(hface_path)
        
        vision = timm.create_model('nf_resnet50', pretrained=pretrained_vision)
        vision.head.fc = torch.nn.Linear(2048, lm_config.n_embd*2) # for prefix embedding
        
        lm = AutoModelForCausalLM.from_pretrained(hface_path)
        return cls(vision, lm)
    
    @classmethod
    def from_trained(cls, path: str):
        pass

In [9]:
tok.eos_token

'<|endoftext|>'

In [37]:
dummy_dict = {
    "pad_token": '<|endoftext|>',
    "dummy": False
}

In [39]:
tok = AutoTokenizer.from_pretrained("gpt2", **dummy_dict)
# tok.pad_token=tok.eos_token
config = AutoConfig.from_pretrained("gpt2")

In [8]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

In [9]:
output1 = model(**encoded_input)

In [10]:
output1.last_hidden_state

tensor([[[ 0.1386,  0.1583, -0.2967,  ..., -0.2708, -0.2844,  0.4581],
         [ 0.5364, -0.2327,  0.1754,  ...,  0.5540,  0.4981, -0.0024],
         [ 0.3002, -0.3475,  0.1208,  ..., -0.4562,  0.3288,  0.8773],
         ...,
         [ 0.3799,  0.1203,  0.8283,  ..., -0.8624, -0.5957,  0.0471],
         [-0.0252, -0.7177, -0.6950,  ...,  0.0757, -0.6668, -0.3401],
         [ 0.7535,  0.2391,  0.0717,  ...,  0.2467, -0.6458, -0.3213]]],
       grad_fn=<NativeLayerNormBackward>)

In [16]:
encoded_input.pop("token_type_ids")
output2 = model(**encoded_input)

KeyError: 'token_type_ids'

In [17]:
output1.last_hidden_state

tensor([[[ 0.1386,  0.1583, -0.2967,  ..., -0.2708, -0.2844,  0.4581],
         [ 0.5364, -0.2327,  0.1754,  ...,  0.5540,  0.4981, -0.0024],
         [ 0.3002, -0.3475,  0.1208,  ..., -0.4562,  0.3288,  0.8773],
         ...,
         [ 0.3799,  0.1203,  0.8283,  ..., -0.8624, -0.5957,  0.0471],
         [-0.0252, -0.7177, -0.6950,  ...,  0.0757, -0.6668, -0.3401],
         [ 0.7535,  0.2391,  0.0717,  ...,  0.2467, -0.6458, -0.3213]]],
       grad_fn=<NativeLayerNormBackward>)

In [18]:
output2.last_hidden_state

tensor([[[ 0.1386,  0.1583, -0.2967,  ..., -0.2708, -0.2844,  0.4581],
         [ 0.5364, -0.2327,  0.1754,  ...,  0.5540,  0.4981, -0.0024],
         [ 0.3002, -0.3475,  0.1208,  ..., -0.4562,  0.3288,  0.8773],
         ...,
         [ 0.3799,  0.1203,  0.8283,  ..., -0.8624, -0.5957,  0.0471],
         [-0.0252, -0.7177, -0.6950,  ...,  0.0757, -0.6668, -0.3401],
         [ 0.7535,  0.2391,  0.0717,  ...,  0.2467, -0.6458, -0.3213]]],
       grad_fn=<NativeLayerNormBackward>)

In [15]:
# encoded_input

In [40]:
tok.pad_token

'<|endoftext|>'

In [26]:
b = 2
mok_img = torch.rand(b, 3, 256, 256)
mok_tokens = tok(["Hello", "My name is Kevin"], return_tensors='pt', padding="max_length")
# mok_tokens.pop("token_type_ids")

In [27]:
mok_tokens

{'input_ids': tensor([[15496, 50256, 50256,  ..., 50256, 50256, 50256],
        [ 3666,  1438,   318,  ..., 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 0, 0,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [7]:
mok_tokens.input_ids.size()

torch.Size([2, 4])

In [8]:
m = timm.create_model('nf_resnet50', pretrained=True)

In [9]:
m(mok_img).size()

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


torch.Size([2, 1000])

In [10]:
frozen = FrozenModel.from_pretrained("gpt2")
# frozen.train()

In [11]:
frozen.lm.transformer.h[1].ln_1.bias.requires_grad

False

In [12]:
output = frozen(mok_img, mok_tokens)

In [13]:
# output

In [14]:
loss = torch.nn.CrossEntropyLoss()
target = torch.cat([torch.ones(2,1) * tok.eos_token_id, mok_tokens.input_ids, torch.ones(2,1) * tok.eos_token_id], -1)

In [15]:
loss_val = loss(output.logits.transpose(-1,-2), target.to(torch.long))

In [16]:
loss_val

tensor(10.3180, grad_fn=<NllLoss2DBackward>)

In [17]:
loss_val.backward()

In [None]:
for name, param in frozen.named_parameters():
    print(name, param.grad)

In [27]:
torch.ones(2) * tok.eos_token_id

tensor([50256., 50256.])

In [58]:
a = torch.rand(2, 5, 3)
b = torch.rand(2, 6, 3)

torch.cat([a,b], 1).size()

torch.Size([2, 11, 3])

In [50]:
tok(["Hello", "My name is Kevin"])

{'input_ids': [[15496], [3666, 1438, 318, 7939]], 'attention_mask': [[1], [1, 1, 1, 1]]}

In [51]:
mask = tok(["Hello", "My name is Kevin"] ,return_tensors='pt', padding=True).attention_mask

In [52]:
mask

tensor([[1, 0, 0, 0],
        [1, 1, 1, 1]])

In [11]:
import torch
m.head.fc = torch.nn.Linear(2048, 1024*2)

# (head): ClassifierHead(
#     (global_pool): SelectAdaptivePool2d (pool_type=avg, flatten=Flatten(start_dim=1, end_dim=-1))
#     (fc): Linear(in_features=2048, out_features=1000, bias=True)
#     (flatten): Identity()
#   )

In [None]:
m

In [19]:
!pip install -r requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Looking in indexes: https://pypi.org/simple, http://kakaobrain-pypi.dev.9rum.cc/
Collecting pytorch_lightning==1.1.4
  Downloading pytorch_lightning-1.1.4-py3-none-any.whl (684 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m684.0/684.0 KB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.2.1
  Downloading transformers-4.2.1-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m64.4 MB/s[0m eta [36m0:00:00[0m
Collecting tqdm==4.56.0
  Downloading tqdm-4.56.0-py2.py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.5/72.5 KB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
