In [1]:
import os, sys
sys.path.insert(0, os.path.abspath(".."))
from kbgen.config import rootdir, defaults_text as config
from kbgen.utils.cli import parse_args
from kbgen.data.datasets import GSM
import torch
from kbgen.data.datasets import DataLoader
from kbgen.utils.tokenizer import GPT2Tokenizer
import tqdm
from kbgen.model.modules import TextEncoder
from kbgen.utils.log import RunTracker

# DATA -----------------------------------------------------
# Everything in this notebook runs on the CPU. To follow the configured
# accelerator instead, use:
#   device = config["device"] if torch.cuda.is_available() else "cpu"
device = "cpu"
# NOTE(review): `update=True` presumably lets the dataset write derived
# settings back into `config` (the printout below happens afterwards) — confirm.
dataset = GSM.from_config(config, update=True)
print("Config: ", config)
STRING_COLLECTION = dataset.as_strings()

# %%
tokenizer = GPT2Tokenizer()
# The tokenizer output is unpacked by position: first value -> tokens,
# second value -> mask.
tokens, pad_mask = tokenizer(STRING_COLLECTION).values()
# Turn the mask into a float mask: -inf wherever the tokenizer's mask is 0,
# and 0.0 everywhere else.
# NOTE(review): assumes 0 marks padding positions — confirm against the tokenizer.
is_masked_out = pad_mask == 0
pad_mask = is_masked_out.float().masked_fill(is_masked_out, float("-inf"))

stringfy = dataset.stringify

Config:  {'d_model': 256, 'd_ff_mult': 2, 'nhead': 2, 'num_layers': 4, 'field_encoder_layers': 2, 'field_decoder_layers': 3, 'num_emb': 'periodic', 'tie_numerical_embeddings': False, 'tie_numerical_decoders': False, 'tie_mask_embeddings': True, 'epochs': 100, 'batch_size': 64, 'lr': 0.0001, 'weight_decay': 0.0, 'dropout': 0.0, 'mask_rate': (-1, 0.5), 'wandb': False, 'tags': ['train-text'], 'device': 'cuda:0', 'seed': 42, 'rootdir': '/checkpoint/nolte/kbgen', 'ckpt': '', 'model': 'decoder-only', 'tokenizer': 'gpt2', 'encoder_readout': 'separate', 'use_mup': False, 'num_fields': 12, 'vocab_size': 50258, 'fields': Fields([('numerical', ['phone.weight', 'phone.height', 'phone.depth', 'phone.width', 'phone.display_size', 'phone.battery', 'phone.launch.day', 'phone.launch.month', 'phone.launch.year']), ('categorical', ['phone.oem', 'phone.network_edge']), ('text', ['phone.model'])]), 'categorical_num_classes': {'phone.oem': 117, 'phone.network_edge': 51}, 'numerical_pad_token_id': -1000, 'ca

In [2]:
# Display the first entry produced by `dataset.as_strings()` to inspect its format.
STRING_COLLECTION[0]

'phone.model : Vega | phone.oem : Benefon | phone.network_edge : No | phone.weight : 190.00 | phone.display_size : <pad> | phone.height : 145.00 | phone.width : 56.00 | phone.depth : 23.00 | phone.battery : <pad> | phone.launch.year : 1999.00 | phone.launch.day : <pad> | phone.launch.month : <pad>'

In [14]:
# Sanity check on the tokenization: any decoded token that contains one of the
# separator characters ".", ":" or "|" should consist of that single character
# (ignoring surrounding whitespace). Tokens that violate this are reported
# together with the string they came from; the end-of-text marker is exempt.
for string in STRING_COLLECTION:
  for txt in tokenizer(string)['input_ids'].view(-1, 1):
    element = tokenizer.decode(txt)
    has_separator = any(sep in element for sep in (".", ":", "|"))
    if not has_separator or len(element.strip()) <= 1:
      continue
    if element == "<|endoftext|>":
      continue
    # Spaces are rendered as "W" so leading/trailing whitespace is visible.
    print("failed:", element.replace(" ", "W"))
    print(string)

failed: W:)
phone.model : :) Smiley | phone.oem : Samsung | phone.network_edge : Class 10 | phone.weight : 112.80 | phone.display_size : <pad> | phone.height : 99.60 | phone.width : 59.40 | phone.depth : 15.00 | phone.battery : 9.97 | phone.launch.year : 2010.00 | phone.launch.day : <pad> | phone.launch.month : 6.00


In [2]:
def convert_to_types(string:str) -> "dict | None":
  """Parse a stringified record and convert its tagged leaves to typed values.

  `string` is the source text of a (possibly nested) dict literal whose leaves
  are strings of the form ``"[ <type> , <value> ]"``. The text is evaluated and
  every leaf of the resulting dict is replaced in place by:

  * ``numerical``   -> ``float(<value>)``, or NaN when the value does not parse
  * ``categorical`` -> ``dataset.categorical_str_to_id.get(<value>, 0)``
                       (reads the module-level ``dataset``)
  * any other type  -> the stripped ``<value>`` string, unchanged

  A leaf that cannot be converted (not a string, not in the bracketed form, or
  any other error during conversion) is reported with ``print`` and replaced
  by NaN, so one bad field never aborts the whole record.

  Returns:
    The converted dict, or ``None`` (after printing the error) when `string`
    is not valid Python syntax.
  """
  import re

  # The non-greedy first group stops at the first comma (the type tag); the
  # greedy second group keeps any further commas as part of the value.
  leaf_pattern = re.compile(r"\[(.*?),(.*)\]")

  def _convert_leaf(value):
    # Convert a single "[ type , value ]" string; raises on unexpected input.
    match = leaf_pattern.search(value)  # TypeError when value is not a string
    if match is None:
      raise ValueError(f"not a '[ type , value ]' field: {value!r}")
    type_ = match.group(1).strip()
    value_ = match.group(2).strip()
    if type_ == "numerical":
      try:
        return float(value_)
      except ValueError:
        # Values that do not parse as a float are treated as missing.
        return float("nan")
    if type_ == "categorical":
      # Unknown categories fall back to id 0.
      return dataset.categorical_str_to_id.get(value_, 0)
    return value_

  def _recursive_destringify(input_: dict):
    # Walk the nested dict, replacing each leaf in place with its typed value.
    for key, value in input_.items():
      if isinstance(value, dict):
        input_[key] = _recursive_destringify(value)
      else:
        try:
          input_[key] = _convert_leaf(value)
        except Exception as e:
          # Deliberate best effort: report the problem and mark the field missing.
          print(e)
          input_[key] = float("nan")
    return input_

  try:
    # NOTE(security): eval executes arbitrary Python. This is only acceptable
    # for trusted input; if `string` can come from a model or any external
    # source, switch to ast.literal_eval (which raises ValueError rather than
    # running code for non-literal input).
    should_be_dict = eval(string)
  except SyntaxError as e:
    print(e)
    return None
  return _recursive_destringify(should_be_dict)

# Round-trip check on the row at position 1: print the typed dict recovered from
# its stringified form, then the raw row as a dict, for side-by-side comparison.
# NOTE(review): this calls `dataset.stringfy`, whereas the first cell aliases
# `dataset.stringify` — confirm which spelling the dataset class actually exposes.
print(convert_to_types(dataset.stringfy(dataset._df.iloc[1])))
print(dataset._df.iloc[1].to_dict())

{'phone': {'model': 'nuvifone M10', 'oem': 0, 'network_edge': 0, 'weight': nan, 'height': nan, 'depth': nan, 'width': nan, 'display_size': 8.89, 'battery': 10.55, 'launch': {'day': nan, 'month': 1.0, 'year': 2010.0}}}
{'phone.model': 'nuvifone M10', 'phone.oem': 'Garmin-Asus', 'phone.network_edge': nan, 'phone.weight': nan, 'phone.display_size': 8.89, 'phone.height': nan, 'phone.width': nan, 'phone.depth': nan, 'phone.battery': 10.55170826162069, 'phone.launch.year': 2010.0, 'phone.launch.day': nan, 'phone.launch.month': 1.0}


In [8]:
# Minimal hand-written input exercising one numerical and one categorical leaf.
# A category string missing from `dataset.categorical_str_to_id` maps to 0.
string = "{ 'a': '[ numerical , 1.0 ]', 'b': '[ categorical , a ]'}"
convert_to_types(string)

{'a': 1.0, 'b': 0}

In [15]:
# Open the log directory of a finished training run under `rootdir`/models.
# The run name is hard-coded; the commented line points at an alternative run.
# tracker = RunTracker.from_logdir(rootdir + "/models/09-06-07-34-19train-textdecoder-only_l4_d256")
tracker = RunTracker.from_logdir(rootdir + "/models/09-06-08-57-44train-textdecoder-only_l4_d256")

In [16]:
# Load the most recent checkpoint recorded for the run opened by `tracker`.
model = tracker.load_latest_model()

Loading model from: /checkpoint/nolte/kbgen/models/09-06-08-57-44train-textdecoder-only_l4_d256/99.pt


In [68]:
# Use the first 40 characters of the first serialized record as a prompt.
input_ = dataset.as_strings()[0][:40]

# Show how the prompt is split into tokens (one decoded string per token id).
prompt_ids = tokenizer(input_)["input_ids"]
print([tokenizer.decode(token_id) for token_id in prompt_ids.view(-1, 1)])

# Feed the first entry of the tokenizer output with its last position dropped,
# then take the argmax over the last dimension of the model output and decode
# each resulting id.
# NOTE(review): presumably these are the model's greedy next-token predictions
# for every prefix of the prompt — confirm against `model.encode`.
text = list(tokenizer(input_).values())[0][:, :-1]
predicted_ids = model.encode(text, is_causal=True).argmax(-1)
[tokenizer.decode(token_id) for token_id in predicted_ids.view(-1, 1)]

['phone', '.', 'model', ' :', ' Vega', ' |', ' phone', '.', 'o', 'em', ' :', ' Benef', 'on', '<|endoftext|>']


['5',
 ' E',
 ' :',
 ' V',
 ' |',
 ' phone',
 '.',
 'o',
 'em',
 ' :',
 ' Sony',
 'on',
 ' |']

In [None]:
def metrics_on_dict():