In [31]:
from transformers import CLIPTextModel, CLIPTokenizer, CLIPFeatureExtractor


# Directory holding the pretrained checkpoint; the tokenizer and the text
# encoder are read from its 'tokenizer' and 'text_encoder' subfolders.
PRETRAINED_PATH = 'pretrained/models--CompVis--stable-diffusion-v1-4'

tokenizer = CLIPTokenizer.from_pretrained(PRETRAINED_PATH, subfolder='tokenizer')
text_encoder = CLIPTextModel.from_pretrained(PRETRAINED_PATH, subfolder='text_encoder')

# Pad the prompt up to the tokenizer's maximum length and return PyTorch tensors.
prompt = 'A spine-chillingly terrifying landscape never before seen by mankind.'
tokenize_options = dict(
	padding="max_length",
	max_length=tokenizer.model_max_length,
	return_tensors="pt",
)
tokens = tokenizer(prompt, **tokenize_options)
print(tokens.input_ids, tokens.attention_mask)

# Only the token ids are fed to the encoder (no attention mask is passed here).
embeddings = text_encoder(tokens.input_ids)
# Show the shapes of the per-token hidden states and of the pooled output.
embeddings.last_hidden_state.shape, embeddings.pooler_output.shape


tensor([[49406,   320, 19646,   268,  6498,  4796, 18526,  5727,  1426,  1348,
          2041,   638, 24155,   269, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407]]) tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0]])


(torch.Size([1, 77, 768]), torch.Size([1, 768]))

In [36]:
# Tokenize the BOS string, the EOS string and a plain 'a' as a batch of three
# texts, to see which ids the tokenizer produces for each of them.
probe_texts = [tokenizer.bos_token, tokenizer.eos_token, 'a']
tokenizer(probe_texts)

{'input_ids': [[49406, 49406, 49407], [49406, 49407, 49407], [49406, 320, 49407]], 'attention_mask': [[1, 1, 1], [1, 1, 1], [1, 1, 1]]}

In [18]:
import torch


# Output width of the unembedding layer: one logit per token id.
# NOTE(review): presumably the CLIP tokenizer's vocabulary size — confirm against the checkpoint.
VOCAB_SIZE = 49408

# Freeze the encoder's embedding parameters so that backward() does not
# compute gradients for them.
text_encoder.text_model.embeddings.requires_grad_(False)

# Linear "unembedding" from encoder hidden states to vocabulary logits.
# The input width is read from the encoder's config instead of being
# hard-coded, so it always matches the checkpoint that was loaded.
unembed = torch.nn.Linear(text_encoder.config.hidden_size, VOCAB_SIZE)

# No explicit attention mask is passed to the encoder. A commented-out
# alternative here was a strictly lower-triangular mask:
#   1 - torch.triu(torch.ones(1, tokenizer.model_max_length, tokenizer.model_max_length))
mask = None
embeddings = text_encoder(tokens.input_ids, attention_mask=mask)

output_ids = unembed(embeddings.last_hidden_state)
# cross_entropy expects the class dimension second:
# (batch, seq, vocab) -> (batch, vocab, seq).
o2 = output_ids.permute(0, 2, 1)

# NOTE(review): the targets are the unshifted input ids, padded positions
# included — confirm that this is the intended training signal.
loss = torch.nn.functional.cross_entropy(o2, tokens.input_ids)
loss.backward()

In [16]:
# Strictly lower-triangular mask of ones (zeros on and above the diagonal),
# with a leading batch dimension of size 1.
m = torch.tril(torch.ones(1, 5, 5), diagonal=-1)
m

tensor([[[0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0.],
         [1., 1., 0., 0., 0.],
         [1., 1., 1., 0., 0.],
         [1., 1., 1., 1., 0.]]])

In [1]:

import os

import starry.utils.config
from starry.text.data.sentences import SentenceShift



# Dataset root directory, taken from the DATA_DIR environment variable.
DATA_DIR = os.environ.get('DATA_DIR')
if DATA_DIR is None:
	# Fail here with an actionable message instead of the opaque TypeError
	# that os.path.join() raises when handed None.
	raise RuntimeError(
		"The DATA_DIR environment variable is not set; "
		"it must point to the directory that contains 'mj-desc.txt'."
	)

# Checkpoint directory handed to the loader as its 'tokenizer_path' option.
CLIP_PATH = 'pretrained/models--CompVis--stable-diffusion-v1-4'

# load() is unpacked as a one-element sequence, so exactly one dataset is
# expected back.
# NOTE(review): '0/1' looks like a split/partition selector — confirm against SentenceShift.load.
data, = SentenceShift.load(os.path.join(DATA_DIR, 'mj-desc.txt'), {'tokenizer_path': CLIP_PATH}, '0/1')
data.entries

  from .autonotebook import tqdm as notebook_tqdm


[{'input_ids': tensor([[49406,   274,   270,   275,  5352,   539,   320,  2308,   593,  2401,
           1774,  4271, 20451, 14900,   267,  1746,   537,  1579,   267,   638,
          46870,  4959,  2658,   267,  5220,  6781, 34617, 49407, 49407, 49407,
          49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
          49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
          49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
          49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
          49407, 49407, 49407, 49407, 49407, 49407, 49407]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0]]), 'output_ids': tensor([[  274,   270,   275,  5352,   539,   320,  2308,  