In [1]:
# dataset
import os
import matplotlib.pyplot as plt
import numpy as np

from starry.utils.config import Configuration
from starry.utils.dataset_factory import loadDataset


# Dataset smoke test: build a configuration, load the dataset and pull one item.
config = Configuration.create(
	'configs/invwordembed-test.yaml',
	volatile=True,	# NOTE(review): presumably avoids persisting anything for this run — confirm
)

# The result is unpacked into a single name, so this raises unless loadDataset
# yields exactly one dataset for the requested splits.
(data,) = loadDataset(
	config,
	data_dir='./',
	splits='*0/1',
)

# The iterator is kept in `it` so it can be advanced again interactively.
it = iter(data)

# Bare last expression: the notebook displays the first item of the dataset.
next(it)


  from .autonotebook import tqdm as notebook_tqdm


tensor([ 6832, 35607, 44775, 48909])

In [1]:
# model test with loss
import os
import matplotlib.pyplot as plt
import numpy as np
import torch

from starry.utils.config import Configuration
from starry.utils.dataset_factory import loadDataset
from starry.utils.model_factory import loadModel


# Loss-model smoke test: load the dataset and the model, then run a single item
# through the model and print both the input and the model's result.

# NOTE(review): os.getenv returns None when DATA_DIR is not set — confirm that
# loadDataset accepts data_dir=None, otherwise export DATA_DIR before running.
DATA_DIR = os.getenv('DATA_DIR')

config = Configuration.createOrLoad('configs/invwordembed-test.yaml', volatile=True)
data, = loadDataset(config, data_dir=DATA_DIR, splits='*0/1')

# NOTE(review): with postfix='Loss' the recorded output of this cell is a
# (loss, metrics-dict) pair — presumably a loss-computing wrapper; confirm in model_factory.
model = loadModel(config['model'], postfix='Loss')

it = iter(data)

# Deliberately not named `id`: binding `id` at notebook scope would shadow the
# builtin id() for every cell executed afterwards in the same kernel.
sample_ids = next(it)
pred = model(sample_ids)
print(sample_ids, pred)


  from .autonotebook import tqdm as notebook_tqdm


id1: tensor([40499,  8459, 47154, 26762]) tensor([11544, 31320, 35455, 36775])
tensor([40499,  8459, 47154, 26762]) (tensor(10.7986, grad_fn=<NllLossBackward0>), {'acc': tensor(0.)})


In [5]:
# test
import matplotlib.pyplot as plt
import numpy as np
import torch
from transformers import CLIPTextModel

from starry.utils.config import Configuration
from starry.utils.model_factory import loadModel


# Restore the best checkpoint of a finished training run and probe it with one token.

# NOTE(review): absolute, machine-specific path — consider moving it to a
# configurable constant so the notebook runs on other machines.
config = Configuration.createOrLoad('/safe/training/text/20221022-invwordembed-clip')
model = loadModel(config['model'])

# torch.load unpickles the file, which can execute arbitrary code: only load
# checkpoints from a trusted source. map_location='cpu' keeps the tensors on the CPU.
checkpoint = torch.load(config.localPath(config['best']), map_location='cpu')
model.load_state_dict(checkpoint['model'])
model.eval()

# Token-embedding table of the pretrained CLIP text encoder; it turns token ids
# into the embedding vectors that are fed to `model`.
text_encoder = CLIPTextModel.from_pretrained(config['model.args.tokenizer_pretrained_path'], subfolder='text_encoder')
embed = text_encoder.text_model.embeddings.token_embedding

# Pure inference: disable autograd so no graph is recorded for this probe.
# Softmax over the last axis normalises the model output for token id 1.
with torch.no_grad():
	probs = torch.nn.functional.softmax(model(embed(torch.tensor([1], dtype=torch.long))), dim=-1)

# Bare last expression so the notebook displays the result.
probs

tensor([[1.2700e-04, 2.4783e-01, 8.4772e-05,  ..., 5.7048e-06, 6.3892e-05,
         1.4520e-04]], grad_fn=<SoftmaxBackward0>)

In [13]:
import json


# Round-trip check over the whole tokenizer vocabulary: embed every token id,
# run the embedding through `model`, take the argmax over the last axis and
# compare the resulting id with the original one.

BATCH_SIZE = 256  # number of vocabulary entries per forward pass

# Context manager closes the file deterministically; explicit UTF-8 because JSON
# text is UTF-8 and the platform default encoding may differ.
with open('pretrained/models--CompVis--stable-diffusion-v1-4/tokenizer/vocab.json', 'r', encoding='utf-8') as vocab_file:
	vocab = json.load(vocab_file)

# vocab maps word -> id; build the reverse lookup id -> word.
items = [(token_id, word) for word, token_id in vocab.items()]
id2word = dict(items)
words = [*vocab.keys()]

# Inference only: without no_grad every batch would record an autograd graph.
with torch.no_grad():
	for start in range(0, len(words), BATCH_SIZE):
		# Slicing clamps at the end of the list, so the last batch may be shorter.
		src_ids = [vocab[word] for word in words[start:start + BATCH_SIZE]]
		pred_ids = torch.argmax(model(embed(torch.tensor(src_ids, dtype=torch.long))), dim=-1).tolist()

		for src_id, pred_id in zip(src_ids, pred_ids):
			# Rows where the predicted id differs from the source id are flagged with '*'.
			marker = "    " if src_id == pred_id else "*   "
			print(marker, id2word[src_id], '\t', id2word[pred_id])


     ! 	 !
     !! 	 !!
     !!! 	 !!!
     !!!! 	 !!!!
     !!!!!!!! 	 !!!!!!!!
     !!!!!!!!!!!!!!!! 	 !!!!!!!!!!!!!!!!
     !!!!!!!!!!!</w> 	 !!!!!!!!!!!</w>
     !!!!!!!!!!</w> 	 !!!!!!!!!!</w>
     !!!!!!!!!</w> 	 !!!!!!!!!</w>
     !!!!!!!!</w> 	 !!!!!!!!</w>
     !!!!!!!</w> 	 !!!!!!!</w>
     !!!!!!</w> 	 !!!!!!</w>
     !!!!!</w> 	 !!!!!</w>
     !!!!</w> 	 !!!!</w>
     !!!"</w> 	 !!!"</w>
     !!!)</w> 	 !!!)</w>
     !!!</w> 	 !!!</w>
     !!"</w> 	 !!"</w>
     !!#</w> 	 !!#</w>
     !!)</w> 	 !!)</w>
     !!</w> 	 !!</w>
     !!@</w> 	 !!@</w>
     !"</w> 	 !"</w>
     !"@</w> 	 !"@</w>
     !#</w> 	 !#</w>
     !'</w> 	 !'</w>
     !),</w> 	 !),</w>
     !).</w> 	 !).</w>
     !)</w> 	 !)</w>
     !*</w> 	 !*</w>
     !,</w> 	 !,</w>
     !-</w> 	 !-</w>
     !...</w> 	 !...</w>
     !..</w> 	 !..</w>
     !.</w> 	 !.</w>
     !:)</w> 	 !:)</w>
     !:</w> 	 !:</w>
     !</w> 	 !</w>
     !?!</w> 	 !?!</w>
     !?!?</w> 	 !?!?</w>
     !?</w> 	 !?</w>
     !@</w> 	 !@</w