In [1]:
import sys 
sys.path.append('../../')
from main import LLM
from main_utils import Trie

In [2]:
model = LLM.load_from_checkpoint(
    '../../models/product_title_taxonomy_classification/version_1/epoch=0-step=29648.ckpt')

INFO:root:Unused kwargs when getting t5-base: {}
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
model.eval()
model.cuda()
tokenizer = model.tokenizer

In [4]:

outs = model.transformer.generate(
    **model.tokenizer(
        "Categorize Aliexpress product: " + \
            "ALONG FIT High Waisted Tummy Control Leggings-Yoga-Pants with Pockets Leggings for Women Workout Squat Proof Tights", 
        return_tensors='pt'
    ).to('cuda'), 
    num_beams=5, num_return_sequences=5, output_scores=True, return_dict_in_generate=True, length_penalty=0, max_new_tokens=50
)
model.tokenizer.batch_decode(outs.sequences), outs.sequences_scores.exp(), outs.sequences_scores.exp().sum()

(["<pad> [women's clothing][bottoms][fashion leggings]</s><pad><pad><pad><pad><pad><pad>",
  "<pad> [women's clothing][active & loungewear][leggings]</s><pad><pad><pad><pad>",
  '<pad> [sports][fitness & body building][yoga][yoga pants]</s><pad><pad><pad>',
  '<pad> [mother & kids][pregnancy & maternity][leggings]</s><pad><pad><pad><pad>',
  '<pad> [sports][sports clothing][sets/suits][trainning & exercise sets]</s>'],
 tensor([0.6398, 0.1505, 0.0250, 0.0215, 0.0206], device='cuda:0'),
 tensor(0.8573, device='cuda:0'))

In [7]:
import yaml 
config = yaml.safe_load(open('../../models/product_title_taxonomy_classification/version_1/config.yaml', 'r'))

In [12]:
config['data']['data_source_yaml_path']

'datasets/product_title_taxonomy_classification/wish-aliexpress.yaml'

In [14]:
import os

In [15]:
data_config = yaml.safe_load(open(
    os.path.join('..','..',config['data']['data_source_yaml_path']), 
'r'))

In [16]:
data_config['test']['path']

{'train': [{'path': 'data/wish_products/Wish_Meta_Train.json',
   'repo': 'git@github.com:ContextLogic/multitask-llm-rnd.git',
   'rev': None,
   'task_prefix': 'Categorize Aliexpress product: '}],
 'val': [{'path': 'data/wish_products/Wish_Meta_Val.json',
   'repo': 'git@github.com:ContextLogic/multitask-llm-rnd.git',
   'rev': None,
   'task_prefix': 'Categorize Aliexpress product: '}],
 'test': [{'path': 'data/wish_products/Wish_Meta_Test.json',
   'repo': 'git@github.com:ContextLogic/multitask-llm-rnd.git',
   'rev': None,
   'task_prefix': 'Categorize Aliexpress product: '}]}

In [18]:
import dvc.api

In [None]:
resource_url = dvc.api.get_url(
    data_config['train'][0]['path'],
    repo=data_config['train'][0]['repo'],
    rev=data_config['train'][0]['rev']
)

In [51]:
import pandas as pd

In [54]:
df = pd.read_json(resource_url, lines=True)

In [None]:
df_tax = pd.read_json(
    dvc.api.get_url(
        'data/taxonomy/wish_newtax.json',
        repo='git@github.com:ContextLogic/multitask-llm-rnd.git'
    ), lines=True
)

In [56]:
paths = df_tax[df_tax['is_leaf']]['category_path'].apply(lambda x: ''.join(['[' + i.strip().lower() + ']' for i in x.split(' > ')])).tolist()

In [57]:
len(paths)

5291

In [58]:
paths_data = list(set(df.category.apply(lambda x: ''.join(['[' + i + ']' for i in x]))))

In [61]:
len(paths_data)

5278

In [62]:
len(set(paths_data).intersection(set(paths))) / len(set(paths_data).union(set(paths)))

0.8422520481087676

In [63]:
trie = Trie([
    [tokenizer.pad_token_id] + tokenizer.encode(i) + [tokenizer.eos_token_id] for i in paths_data
])

In [64]:
def constraint(batch_id, sent):
    return trie.get(sent.tolist())

In [65]:
outs = model.transformer.generate(
    **model.tokenizer(
        "Categorize Aliexpress product: " + \
            "ALONG FIT High Waisted Tummy Control Leggings-Yoga-Pants with Pockets Leggings for Women Workout Squat Proof Tights", 
        return_tensors='pt'
    ).to('cuda'), 
    num_beams=5, num_return_sequences=5, output_scores=True, return_dict_in_generate=True, length_penalty=0, max_new_tokens=50,
    prefix_allowed_tokens_fn=constraint
)
model.tokenizer.batch_decode(outs.sequences), outs.sequences_scores.exp(), outs.sequences_scores.exp().sum()

(["<pad> [women's clothing][bottoms][fashion leggings]</s><pad><pad><pad><pad><pad><pad>",
  "<pad> [women's clothing][active & loungewear][leggings]</s><pad><pad><pad><pad>",
  '<pad> [sports][fitness & body building][yoga][yoga pants]</s><pad><pad><pad>',
  '<pad> [mother & kids][pregnancy & maternity][leggings]</s><pad><pad><pad><pad>',
  '<pad> [sports][sports clothing][sets/suits][trainning & exercise sets]</s>'],
 tensor([0.6398, 0.1505, 0.0250, 0.0215, 0.0206], device='cuda:0'),
 tensor(0.8573, device='cuda:0'))

In [66]:
from tqdm import tqdm

In [67]:
df = df.sample(frac=1.0)

In [68]:
recs = []
for rec in tqdm(df.iloc[:200].to_dict('records')[:100]):
    outs = model.transformer.generate(
        **model.tokenizer(
            "Categorize Aliexpress product: " + \
                rec['title'], 
            return_tensors='pt'
        ).to('cuda'), 
        num_beams=2, num_return_sequences=2, output_scores=True, return_dict_in_generate=True, length_penalty=0, max_new_tokens=50,
        prefix_allowed_tokens_fn=constraint
    )
    preds = model.tokenizer.batch_decode(outs.sequences, skip_special_tokens=True)
    probs = outs.sequences_scores.exp()
    rec['t5_predicted_categories'] = [pred[1:-1].split('][') for pred in preds]
    rec['t5_predicted_category_prob'] = probs.detach().cpu().tolist()
    recs.append(rec)

100%|██████████| 100/100 [00:50<00:00,  1.98it/s]


In [69]:
df_pred = pd.DataFrame(recs)

In [70]:
df_pred.head(2)

Unnamed: 0,title,category,text,t5_predicted_categories,t5_predicted_category_prob
0,Insoles Arch-Support Foot-Pain for Women/men O...,"[shoes, shoe accessories, insoles]",Insoles Arch-Support Foot-Pain for Women/men O...,"[[shoes, shoe accessories, insoles], [shoes, s...","[0.24975982308387756, 0.20985464751720428]"
1,1Set Tire Fit For 47CC 49cc Mini Dirt Bike Sco...,"[automobiles & motorcycles, motorcycle accesso...",1Set Tire Fit For 47CC 49cc Mini Dirt Bike Sco...,"[[automobiles & motorcycles, motorcycle access...","[0.859704315662384, 0.04090460017323494]"


In [71]:
for depth in range(7):
    tmp = df_pred[df_pred.category.apply(len) > depth]
    if len(tmp) > 0:
        print('Depth:', depth)
        print('Size: ', len(tmp))
        print('# Unique Targets', len(set(tmp.category.apply(lambda x: ''.join(x[:depth+1])))))
        print('ACC T5: ', (tmp.category.apply(lambda x: ''.join(x[:depth+1])) == \
            tmp.t5_predicted_categories.apply(lambda x: ''.join(x[0][:depth+1]) if len(x[0]) > depth else None)).mean())
        print('-' * 100)

Depth: 0
Size:  100
# Unique Targets 20
ACC T5:  0.8
----------------------------------------------------------------------------------------------------
Depth: 1
Size:  100
# Unique Targets 62
ACC T5:  0.66
----------------------------------------------------------------------------------------------------
Depth: 2
Size:  100
# Unique Targets 88
ACC T5:  0.46
----------------------------------------------------------------------------------------------------
Depth: 3
Size:  62
# Unique Targets 62
ACC T5:  0.3709677419354839
----------------------------------------------------------------------------------------------------
Depth: 4
Size:  2
# Unique Targets 2
ACC T5:  0.0
----------------------------------------------------------------------------------------------------


In [73]:
depth = 0
tmp = df_pred[df_pred.category.apply(len) > depth]
errors = tmp[(tmp.category.apply(lambda x: ''.join(x[:depth+1])) != \
            tmp.t5_predicted_categories.apply(lambda x: ''.join(x[0][:depth+1]) if len(x[0]) > depth else None))]

In [82]:
tokenizer.tokenize('hello><-> bye')

['▁hello', '>', '<', '->', '▁by', 'e']

In [76]:
errors[['title', 'category', 't5_predicted_categories']].to_dict('records')

[{'title': 'Board-Game Angels Tarot-Cards-Deck Divination Rider Oracle Manara Modern Witch Romance',
  'category': ['toys & hobbies', 'puzzles & games', 'games', 'card games'],
  't5_predicted_categories': [['entertainment', 'board games'],
   ['toys & hobbies', 'puzzles & games', 'games', 'card games']]},
 {'title': 'Ring-Holder-Stand Crystal Sailor Moon Finger-Cosmic Heart for Phone Crisis Star Random-Style',
  'category': ['beauty & health',
   'makeup',
   'makeup tools & accessories',
   'makeup tool kits'],
  't5_predicted_categories': [['cellphones & telecommunications',
    'mobile phone accessories',
    'phone holders & stands'],
   ['jewelry & accessories', 'rings']]},
 {'title': 'Spaarpot Met Top Gat En Cap',
  'category': ['sports', 'horse racing', 'horse riding boots'],
  't5_predicted_categories': [['apparel accessories',
    "men's hats",
    "men's visors"],
   ['apparel accessories', "women's hats", "women's visors"]]},
 {'title': '77JF الإبداعية عيد الميلاد تقويم الق