In [85]:
import numpy as np

In [1]:
import sys 
sys.path.append('../../')
from main import LLM
from main_utils import Trie

In [2]:
from pytorch_lightning.utilities.deepspeed import convert_zero_checkpoint_to_fp32_state_dict

# Lightning + DeepSpeed saved the checkpoint as a directory rather than a single file,
# so the single-file weights are addressed as a path inside that directory.
save_path = '../../models/product_title_taxonomy_classification/version_4/epoch=0-step=545514.ckpt'
output_path = save_path + '/pytorch_model.bin'


In [3]:
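# One-off conversion of the DeepSpeed checkpoint directory (`save_path`) into a single
# fp32 state-dict file (`output_path`). Left commented out — presumably it was already
# run once; uncomment if `output_path` does not exist yet.
# convert_zero_checkpoint_to_fp32_state_dict(save_path, output_path)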

In [4]:
# Load the project's LLM wrapper from the single-file checkpoint at `output_path`.
# NOTE(review): strict=False presumably lets missing/unexpected state-dict keys pass
# silently — confirm that no required weights are skipped.
model = LLM.load_from_checkpoint(output_path, strict=False)

INFO:root:Unused kwargs when getting t5-large: {}
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
  rank_zero_warn(


In [5]:
# Prepare the model for GPU inference (assumes LLM is a torch/Lightning module, so
# eval() switches to evaluation mode and cuda() moves the weights — TODO confirm).
model.eval()
model.cuda()
# Shorthand for the tokenizer attached to the model; used below to build the decoding trie.
tokenizer = model.tokenizer

In [6]:
import dvc.api

In [None]:
# Resolve the storage URL of the DVC-tracked taxonomy file in the data repository.
resource_url = dvc.api.get_url(
    path='data/taxonomy/wish_newtax.json',
    repo='git@github.com:junwang-wish/query_understanding_data.git',
)

In [8]:
import pandas as pd

In [9]:
# Read the taxonomy from the resolved URL as JSON Lines (lines=True: one record per line).
df_tax = pd.read_json(resource_url, lines=True)

INFO:aiobotocore.credentials:Found credentials in environment variables.


In [10]:
# Preview the first two taxonomy rows.
df_tax.head(2)

Unnamed: 0,category_tree_version,id,category_path,is_leaf
0,v1.2.1,4658,Security & Protection > Building Automation,True
1,v1.2.1,4659,Security & Protection > Door Intercom,False


In [11]:
# Leaf categories only; each path is normalised by trimming and lower-casing every
# node and re-joining the nodes with ' > '.
paths = [
    ' > '.join(node.strip().lower() for node in raw_path.split(' > '))
    for raw_path in df_tax[df_tax['is_leaf']]['category_path']
]

In [12]:
# Number of normalised leaf category paths (the label space used for constrained decoding).
len(paths)

5291

In [13]:
# One token-id sequence per leaf path: pad id first, then the encoded path, then the
# EOS id. The trie built from these sequences drives `prefix_allowed_tokens_fn`.
# NOTE(review): if tokenizer.encode already appends EOS (the usual default for T5
# tokenizers), every sequence ends with two EOS ids — confirm this is what Trie expects.
trie = Trie([
    [tokenizer.pad_token_id, *tokenizer.encode(path), tokenizer.eos_token_id]
    for path in paths
])

In [14]:
def constraint(batch_id, sent):
    """Look up what the module-level `trie` returns for the ids generated so far.

    Passed to `generate` as `prefix_allowed_tokens_fn`.

    Args:
        batch_id: Index of the input within the batch; not used.
        sent: The ids produced so far; must expose `.tolist()`.

    Returns:
        Whatever `trie.get` returns for that prefix (presumably the list of
        permitted next token ids — confirm against `main_utils.Trie`).
    """
    prefix = sent.tolist()
    return trie.get(prefix)

In [15]:
# Single-title smoke test of trie-constrained beam search with the top-down prompt.
example_prompt = (
    "Top-down categorize Aliexpress product: "
    "ALONG FIT High Waisted Tummy Control Leggings-Yoga-Pants with Pockets Leggings for Women Workout Squat Proof Tights"
)
example_inputs = model.tokenizer(example_prompt, return_tensors='pt').to('cuda')
outs = model.transformer.generate(
    **example_inputs,
    num_beams=5,
    num_return_sequences=5,
    output_scores=True,
    return_dict_in_generate=True,
    length_penalty=0,
    max_new_tokens=50,
    prefix_allowed_tokens_fn=constraint,
)
# NOTE(review): with length_penalty=0 the sequence score is presumably the un-normalised
# summed log-probability, so exp() reads as a per-sequence probability — confirm for the
# transformers version in use.
example_probs = outs.sequences_scores.exp()
# Displayed: decoded beams, their probabilities, and the total mass the beams cover.
model.tokenizer.batch_decode(outs.sequences), example_probs, example_probs.sum()

(["<pad> women's clothing > bottoms > fashion leggings</s><pad><pad>",
  '<pad> sports > sports clothing > tights > running tights</s><pad><pad><pad>',
  '<pad> sports > sports clothing > pants > trainning & exercise pants</s>',
  '<pad> sports > fitness & body building > yoga > yoga pants</s><pad><pad>',
  '<pad> sports > racquet sports > tennis > tennis pants</s><pad><pad>'],
 tensor([0.3565, 0.3226, 0.1016, 0.0787, 0.0234], device='cuda:0'),
 tensor(0.8828, device='cuda:0'))

In [16]:
# Resolve the DVC-tracked labelled-products file, then read it as JSON Lines.
labelled_products_url = dvc.api.get_url(
    path='data/wish_products/wish-mturk-labelled-09202022-clean-joinedlance.json',
    repo='git@github.com:junwang-wish/query_understanding_data.git',
)
df = pd.read_json(labelled_products_url, lines=True)

In [17]:
# Preview the first two labelled products.
df.head(2)

Unnamed: 0,pid,title,category,text,lance_predicted_category_id,lance_predicted_category,lance_predicted_is_leaf
0,5ce7ad18c04b4c486820a407,Fantasy Master - 681 - sv√§rd,"[education & office supplies, cutting supplies...",Fantasy Master - 681 - sv√§rd -> [education & ...,2705,"[home & garden, home decor, ornaments, swords]",True
1,610dee5b63838066f029717e,Silent Red Thing Silent Hill Pyramid Head Horr...,"[home & garden, home textile, bedding, blankets]",Silent Red Thing Silent Hill Pyramid Head Horr...,2784,"[home & garden, home textile, bedding, throws]",True


In [18]:
# Flatten each labelled category (a list of nodes, root first) into the same
# ' > '-joined form used for `paths`.
df['category_text'] = df['category'].map(' > '.join)

# Every label must be one of the taxonomy leaf paths, otherwise the constrained decoder
# could never produce it. Membership is checked against a set (not the list) and the
# offending labels are reported when the check fails.
unknown_categories = df.loc[~df['category_text'].isin(set(paths)), 'category_text'].unique()
assert len(unknown_categories) == 0, (
    f'{len(unknown_categories)} labelled categories are not leaf paths in the taxonomy, '
    f'e.g. {list(unknown_categories[:5])}'
)

In [19]:
from tqdm import tqdm

In [22]:
# Top-down inference: one trie-constrained 2-beam search per labelled product.
# Each record gains the two decoded paths (as node lists) and their probabilities.
recs = []
for rec in tqdm(df.to_dict('records')):
    prompt = "Top-down categorize Aliexpress product: " + rec['title']
    encoded = model.tokenizer(prompt, return_tensors='pt').to('cuda')
    outs = model.transformer.generate(
        **encoded,
        num_beams=2,
        num_return_sequences=2,
        output_scores=True,
        return_dict_in_generate=True,
        length_penalty=0,
        max_new_tokens=50,
        prefix_allowed_tokens_fn=constraint,
    )
    decoded_paths = model.tokenizer.batch_decode(outs.sequences, skip_special_tokens=True)
    beam_probs = outs.sequences_scores.exp()
    rec['t5_predicted_categories'] = [path.split(' > ') for path in decoded_paths]
    rec['t5_predicted_category_prob'] = beam_probs.detach().cpu().tolist()
    recs.append(rec)

100%|██████████| 6401/6401 [1:39:21<00:00,  1.07it/s]  


In [49]:
# Bottom-up inference: same loop as the top-down run but with the "Bottom-up" prompt.
# NOTE(review): decoding is still constrained by `constraint`, i.e. by the trie built
# from top-down ' > ' paths. The bottom-up prompt is therefore forced to emit top-down
# paths, so the ' < ' split below yields a single element per beam, and the recorded
# probabilities are tiny. A trie over reversed paths is presumably what was intended —
# confirm against the training target format.
recs_rev = []
for rec in tqdm(df.to_dict('records')):
    prompt = "Bottom-up categorize Aliexpress product: " + rec['title']
    encoded = model.tokenizer(prompt, return_tensors='pt').to('cuda')
    outs = model.transformer.generate(
        **encoded,
        num_beams=2,
        num_return_sequences=2,
        output_scores=True,
        return_dict_in_generate=True,
        length_penalty=0,
        max_new_tokens=50,
        prefix_allowed_tokens_fn=constraint,
    )
    cleaned_preds = []
    for text in model.tokenizer.batch_decode(outs.sequences):
        # '<unk>' is mapped back to '<' first, then the special tokens are stripped.
        for marker, replacement in (('<unk>', '<'), ('</s>', ''), ('<pad>', '')):
            text = text.replace(marker, replacement)
        cleaned_preds.append(text.strip())
    beam_probs = outs.sequences_scores.exp()
    rec['t5_predicted_categories_rev'] = [text.split(' < ') for text in cleaned_preds]
    rec['t5_predicted_category_prob_rev'] = beam_probs.detach().cpu().tolist()
    recs_rev.append(rec)

100%|██████████| 6401/6401 [1:25:16<00:00,  1.25it/s]


In [61]:
# One row per product with the bottom-up predictions. Each beam arrives as a list whose
# first element is a ' > '-joined path string; split that element into its nodes so the
# column has the same shape as the top-down predictions.
df_pred_rev = pd.DataFrame(recs_rev)
df_pred_rev['t5_predicted_categories_rev'] = df_pred_rev['t5_predicted_categories_rev'].map(
    lambda beams: [beam[0].split(' > ') for beam in beams]
)

In [64]:
# Preview the first two rows of the bottom-up predictions.
df_pred_rev.head(2)

Unnamed: 0,pid,title,category,text,lance_predicted_category_id,lance_predicted_category,lance_predicted_is_leaf,category_text,t5_predicted_categories_rev,t5_predicted_category_prob_rev
0,5ce7ad18c04b4c486820a407,Fantasy Master - 681 - sv√§rd,"[education & office supplies, cutting supplies...",Fantasy Master - 681 - sv√§rd -> [education & ...,2705,"[home & garden, home decor, ornaments, swords]",True,education & office supplies > cutting supplies...,"[[electronic components & supplies, electronic...","[1.0617370094223588e-07, 4.7696129712448965e-08]"
1,610dee5b63838066f029717e,Silent Red Thing Silent Hill Pyramid Head Horr...,"[home & garden, home textile, bedding, blankets]",Silent Red Thing Silent Hill Pyramid Head Horr...,2784,"[home & garden, home textile, bedding, throws]",True,home & garden > home textile > bedding > blankets,"[[women's clothing, tops, blouses & shirts], [...","[1.2485299523846277e-13, 1.2113937240473827e-13]"


In [62]:
# One row per product, built from the records collected in `recs`.
df_pred = pd.DataFrame(recs)

In [63]:
# Preview the first two rows of the top-down predictions.
df_pred.head(2)

Unnamed: 0,pid,title,category,text,lance_predicted_category_id,lance_predicted_category,lance_predicted_is_leaf,category_text,t5_predicted_categories,t5_predicted_category_prob
0,5ce7ad18c04b4c486820a407,Fantasy Master - 681 - sv√§rd,"[education & office supplies, cutting supplies...",Fantasy Master - 681 - sv√§rd -> [education & ...,2705,"[home & garden, home decor, ornaments, swords]",True,education & office supplies > cutting supplies...,"[[consumer electronics, portable audio & video...","[0.020152555778622627, 0.012340139597654343]"
1,610dee5b63838066f029717e,Silent Red Thing Silent Hill Pyramid Head Horr...,"[home & garden, home textile, bedding, blankets]",Silent Red Thing Silent Hill Pyramid Head Horr...,2784,"[home & garden, home textile, bedding, throws]",True,home & garden > home textile > bedding > blankets,"[[home & garden, home textile, bedding, blanke...","[0.2627204656600952, 1.4428826785716636e-13]"


In [74]:
# Attach the bottom-up predictions to the top-down ones, matching on product id.
# validate='one_to_one' makes the merge fail loudly if `pid` is duplicated on either
# side, instead of silently multiplying rows and skewing the accuracy figures.
df_join = df_pred.merge(
    df_pred_rev[['pid', 't5_predicted_categories_rev', 't5_predicted_category_prob_rev']],
    on='pid',
    how='inner',
    validate='one_to_one',
)

In [77]:
# Keep, per product, the top-down paths that the bottom-up run also predicted
# (order follows the top-down beams). Products with no agreeing path are dropped.
recs_join_nonempty = []
for rec in df_join.to_dict('records'):
    # Build the lookup set once per record rather than once per candidate path.
    rev_paths = {tuple(path) for path in rec['t5_predicted_categories_rev']}
    rec['t5_predicted_categories_join'] = [
        path for path in rec['t5_predicted_categories'] if tuple(path) in rev_paths
    ]

    if rec['t5_predicted_categories_join']:
        recs_join_nonempty.append(rec)
df_join_nonempty = pd.DataFrame(recs_join_nonempty)

In [78]:
# Products with at least one path predicted by both prompts, versus all joined products.
len(df_join_nonempty), len(df_join)

(3981, 6401)

In [65]:
# Prefix accuracy by depth for the top-down run (top beam only) against the Lance baseline.
# Only products whose labelled path is deeper than `depth` are scored at that depth.
# NOTE(review): prefixes are compared as ''.join(...) strings with no separator, so two
# different node lists could in principle collapse to the same key.
for depth in range(7):
    subset = df_pred[df_pred.category.apply(len) > depth]
    if len(subset) == 0:
        continue
    width = depth + 1
    target = subset.category.apply(lambda path: ''.join(path[:width]))
    t5_top1 = subset.t5_predicted_categories.apply(
        lambda beams: ''.join(beams[0][:width]) if len(beams[0]) > depth else None
    )
    lance = subset.lance_predicted_category.apply(
        lambda path: ''.join(path[:width]) if len(path) > depth else None
    )
    print('Depth:', depth)
    print('Size: ', len(subset))
    print('# Unique Targets', len(set(target)))
    print('ACC T5: ', (target == t5_top1).mean())
    print('ACC Lance: ', (target == lance).mean())
    print('-' * 100)

Depth: 0
Size:  6401
# Unique Targets 25
ACC T5:  0.743321356038119
ACC Lance:  0.8042493360412436
----------------------------------------------------------------------------------------------------
Depth: 1
Size:  6401
# Unique Targets 203
ACC T5:  0.5425714732073114
ACC Lance:  0.6231838775191376
----------------------------------------------------------------------------------------------------
Depth: 2
Size:  6401
# Unique Targets 868
ACC T5:  0.4252460553038588
ACC Lance:  0.5063271363849399
----------------------------------------------------------------------------------------------------
Depth: 3
Size:  2475
# Unique Targets 869
ACC T5:  0.3107070707070707
ACC Lance:  0.4
----------------------------------------------------------------------------------------------------
Depth: 4
Size:  27
# Unique Targets 18
ACC T5:  0.3333333333333333
ACC Lance:  0.4074074074074074
----------------------------------------------------------------------------------------------------


In [69]:
# Prefix accuracy by depth for the bottom-up run (top beam only) against the Lance baseline.
# Only products whose labelled path is deeper than `depth` are scored at that depth.
# NOTE(review): prefixes are compared as ''.join(...) strings with no separator, so two
# different node lists could in principle collapse to the same key.
for depth in range(7):
    subset = df_pred_rev[df_pred_rev.category.apply(len) > depth]
    if len(subset) == 0:
        continue
    width = depth + 1
    target = subset.category.apply(lambda path: ''.join(path[:width]))
    t5_rev_top1 = subset.t5_predicted_categories_rev.apply(
        lambda beams: ''.join(beams[0][:width]) if len(beams[0]) > depth else None
    )
    lance = subset.lance_predicted_category.apply(
        lambda path: ''.join(path[:width]) if len(path) > depth else None
    )
    print('Depth:', depth)
    print('Size: ', len(subset))
    print('# Unique Targets', len(set(target)))
    print('ACC T5 reverse: ', (target == t5_rev_top1).mean())
    print('ACC Lance: ', (target == lance).mean())
    print('-' * 100)

Depth: 0
Size:  6401
# Unique Targets 25
ACC T5 reverse:  0.5438212779253242
ACC Lance:  0.8042493360412436
----------------------------------------------------------------------------------------------------
Depth: 1
Size:  6401
# Unique Targets 203
ACC T5 reverse:  0.3371348226839556
ACC Lance:  0.6231838775191376
----------------------------------------------------------------------------------------------------
Depth: 2
Size:  6401
# Unique Targets 868
ACC T5 reverse:  0.2729261052960475
ACC Lance:  0.5063271363849399
----------------------------------------------------------------------------------------------------
Depth: 3
Size:  2475
# Unique Targets 869
ACC T5 reverse:  0.19757575757575757
ACC Lance:  0.4
----------------------------------------------------------------------------------------------------
Depth: 4
Size:  27
# Unique Targets 18
ACC T5 reverse:  0.25925925925925924
ACC Lance:  0.4074074074074074
--------------------------------------------------------------------

In [80]:
# Prefix accuracy by depth on the products where both prompts agree on at least one
# path, scoring the first agreed path against the Lance baseline on the same subset.
# NOTE(review): prefixes are compared as ''.join(...) strings with no separator, so two
# different node lists could in principle collapse to the same key.
for depth in range(7):
    subset = df_join_nonempty[df_join_nonempty.category.apply(len) > depth]
    if len(subset) == 0:
        continue
    width = depth + 1
    target = subset.category.apply(lambda path: ''.join(path[:width]))
    t5_join_top1 = subset.t5_predicted_categories_join.apply(
        lambda agreed: ''.join(agreed[0][:width]) if len(agreed[0]) > depth else None
    )
    lance = subset.lance_predicted_category.apply(
        lambda path: ''.join(path[:width]) if len(path) > depth else None
    )
    print('Depth:', depth)
    print('Size: ', len(subset))
    print('# Unique Targets', len(set(target)))
    print('ACC T5 Join: ', (target == t5_join_top1).mean())
    print('ACC Lance: ', (target == lance).mean())
    print('-' * 100)

Depth: 0
Size:  3981
# Unique Targets 24
ACC T5 Join:  0.8156242150213514
ACC Lance:  0.8332077367495604
----------------------------------------------------------------------------------------------------
Depth: 1
Size:  3981
# Unique Targets 175
ACC T5 Join:  0.5820145692037176
ACC Lance:  0.6367746797287114
----------------------------------------------------------------------------------------------------
Depth: 2
Size:  3981
# Unique Targets 667
ACC T5 Join:  0.46797287113790503
ACC Lance:  0.528761617683999
----------------------------------------------------------------------------------------------------
Depth: 3
Size:  1404
# Unique Targets 596
ACC T5 Join:  0.35327635327635326
ACC Lance:  0.4066951566951567
----------------------------------------------------------------------------------------------------
Depth: 4
Size:  19
# Unique Targets 15
ACC T5 Join:  0.42105263157894735
ACC Lance:  0.47368421052631576
-------------------------------------------------------------------

In [70]:
# Prefix accuracy by depth for the top-down run, counting a hit when either of the two
# beams matches the labelled prefix; the Lance baseline is shown alongside.
# NOTE(review): prefixes are compared as ''.join(...) strings with no separator, so two
# different node lists could in principle collapse to the same key.
for depth in range(7):
    subset = df_pred[df_pred.category.apply(len) > depth]
    if len(subset) == 0:
        continue
    width = depth + 1
    target = subset.category.apply(lambda path: ''.join(path[:width]))
    beam0 = subset.t5_predicted_categories.apply(
        lambda beams: ''.join(beams[0][:width]) if len(beams[0]) > depth else None
    )
    beam1 = subset.t5_predicted_categories.apply(
        lambda beams: ''.join(beams[1][:width]) if len(beams[1]) > depth else None
    )
    lance = subset.lance_predicted_category.apply(
        lambda path: ''.join(path[:width]) if len(path) > depth else None
    )
    print('Depth:', depth)
    print('Size: ', len(subset))
    print('# Unique Targets', len(set(target)))
    print('ACC T5 (2 beams): ', ((target == beam0) | (target == beam1)).mean())
    print('ACC Lance: ', (target == lance).mean())
    print('-' * 100)

Depth: 0
Size:  6401
# Unique Targets 25
ACC T5 (2 beams):  0.8034682080924855
ACC Lance:  0.8042493360412436
----------------------------------------------------------------------------------------------------
Depth: 1
Size:  6401
# Unique Targets 203
ACC T5 (2 beams):  0.6627089517262927
ACC Lance:  0.6231838775191376
----------------------------------------------------------------------------------------------------
Depth: 2
Size:  6401
# Unique Targets 868
ACC T5 (2 beams):  0.5702234025933448
ACC Lance:  0.5063271363849399
----------------------------------------------------------------------------------------------------
Depth: 3
Size:  2475
# Unique Targets 869
ACC T5 (2 beams):  0.42343434343434344
ACC Lance:  0.4
----------------------------------------------------------------------------------------------------
Depth: 4
Size:  27
# Unique Targets 18
ACC T5 (2 beams):  0.4444444444444444
ACC Lance:  0.4074074074074074
-----------------------------------------------------------

In [71]:
# Prefix accuracy by depth for the bottom-up run, counting a hit when either of the two
# beams matches the labelled prefix; the Lance baseline is shown alongside.
# NOTE(review): prefixes are compared as ''.join(...) strings with no separator, so two
# different node lists could in principle collapse to the same key.
for depth in range(7):
    subset = df_pred_rev[df_pred_rev.category.apply(len) > depth]
    if len(subset) == 0:
        continue
    width = depth + 1
    target = subset.category.apply(lambda path: ''.join(path[:width]))
    beam0 = subset.t5_predicted_categories_rev.apply(
        lambda beams: ''.join(beams[0][:width]) if len(beams[0]) > depth else None
    )
    beam1 = subset.t5_predicted_categories_rev.apply(
        lambda beams: ''.join(beams[1][:width]) if len(beams[1]) > depth else None
    )
    lance = subset.lance_predicted_category.apply(
        lambda path: ''.join(path[:width]) if len(path) > depth else None
    )
    print('Depth:', depth)
    print('Size: ', len(subset))
    print('# Unique Targets', len(set(target)))
    print('ACC T5 reverse (2 beams): ', ((target == beam0) | (target == beam1)).mean())
    print('ACC Lance: ', (target == lance).mean())
    print('-' * 100)

Depth: 0
Size:  6401
# Unique Targets 25
ACC T5 reverse (2 beams):  0.572098109670364
ACC Lance:  0.8042493360412436
----------------------------------------------------------------------------------------------------
Depth: 1
Size:  6401
# Unique Targets 203
ACC T5 reverse (2 beams):  0.4403999375097641
ACC Lance:  0.6231838775191376
----------------------------------------------------------------------------------------------------
Depth: 2
Size:  6401
# Unique Targets 868
ACC T5 reverse (2 beams):  0.3775972504296204
ACC Lance:  0.5063271363849399
----------------------------------------------------------------------------------------------------
Depth: 3
Size:  2475
# Unique Targets 869
ACC T5 reverse (2 beams):  0.26666666666666666
ACC Lance:  0.4
----------------------------------------------------------------------------------------------------
Depth: 4
Size:  27
# Unique Targets 18
ACC T5 reverse (2 beams):  0.4074074074074074
ACC Lance:  0.4074074074074074
--------------------

In [88]:
# Prefix accuracy by depth on the agreed subset, counting a hit when any agreed path
# matches the labelled prefix (compared as tuples); the Lance baseline is shown alongside.
for depth in range(7):
    subset = df_join_nonempty[df_join_nonempty.category.apply(len) > depth]
    if len(subset) == 0:
        continue
    width = depth + 1
    target = subset.category.apply(lambda path: ''.join(path[:width]))
    lance = subset.lance_predicted_category.apply(
        lambda path: ''.join(path[:width]) if len(path) > depth else None
    )
    print('Depth:', depth)
    print('Size: ', len(subset))
    print('# Unique Targets', len(set(target)))
    # 1 when the labelled prefix appears among the agreed paths' prefixes, else 0.
    hits = [
        1 if tuple(rec['category'][:width]) in [
            tuple(agreed[:width]) for agreed in rec['t5_predicted_categories_join']
        ] else 0
        for rec in subset.to_dict('records')
    ]
    print('ACC T5 join (2 beams): ', np.mean(hits))
    print('ACC Lance: ', (target == lance).mean())
    print('-' * 100)

Depth: 0
Size:  3981
# Unique Targets 24
ACC T5 join (2 beams):  0.8234112032152725
ACC Lance:  0.8332077367495604
----------------------------------------------------------------------------------------------------
Depth: 1
Size:  3981
# Unique Targets 175
ACC T5 join (2 beams):  0.635769907058528
ACC Lance:  0.6367746797287114
----------------------------------------------------------------------------------------------------
Depth: 2
Size:  3981
# Unique Targets 667
ACC T5 join (2 beams):  0.5398141170560161
ACC Lance:  0.528761617683999
----------------------------------------------------------------------------------------------------
Depth: 3
Size:  1404
# Unique Targets 596
ACC T5 join (2 beams):  0.4408831908831909
ACC Lance:  0.4066951566951567
----------------------------------------------------------------------------------------------------
Depth: 4
Size:  19
# Unique Targets 15
ACC T5 join (2 beams):  0.5263157894736842
ACC Lance:  0.47368421052631576
---------------------

In [44]:
# lance correct, t5 beam-0 incorrect
# Error sample: Lance gets the root (depth-0) category right while T5's top beam gets
# it wrong. Only the first node of each path is compared.
true_root = df_pred.category.apply(lambda path: path[0])
lance_root = df_pred.lance_predicted_category.apply(lambda path: path[0])
t5_beam0_root = df_pred.t5_predicted_categories.apply(lambda beams: beams[0][0])
shown_columns = [
    'title', 'category', 'lance_predicted_category', 't5_predicted_categories', 't5_predicted_category_prob'
]
df_pred[(true_root == lance_root) & (true_root != t5_beam0_root)][shown_columns].sample(
    10, random_state=42
).to_dict('records')

[{'title': 'Snowboarder Necklace, Snowboarder Charm, Snowboarder Pendant, Snowboarder Jewelry, Snowboard Jewelry, Snowboard Charm, Snowboard Gifts',
  'category': ['jewelry & accessories',
   'customized jewelry',
   'customized necklaces'],
  'lance_predicted_category': ['jewelry & accessories',
   'customized jewelry',
   'customized pendants'],
  't5_predicted_categories': [['sports', 'skiing & snowboarding', 'ski poles'],
   ['sports', 'skiing & snowboarding', 'skiing gloves']],
  't5_predicted_category_prob': [0.13644839823246002, 0.05306606739759445]},
 {'title': 'Detergente em p√≥ omo lavagem perfeita ativo 1,6kg',
  'category': ['home & garden',
   'household cleaning',
   'household chemicals',
   'laundry detergent'],
  'lance_predicted_category': ['home & garden',
   'household cleaning',
   'household chemicals',
   'laundry detergent'],
  't5_predicted_categories': [['automobiles & motorcycles',
    'car wash & maintenance',
    'paint care',
    'paint cleaner'],
   ['aut

In [45]:
# lance correct, t5 beam-1 correct but beam-0 incorrect
# Error sample: Lance gets the root (depth-0) category right, T5's top beam gets it
# wrong, but T5's second beam gets it right. Only the first node of each path is compared.
true_root = df_pred.category.apply(lambda path: path[0])
lance_root = df_pred.lance_predicted_category.apply(lambda path: path[0])
t5_beam0_root = df_pred.t5_predicted_categories.apply(lambda beams: beams[0][0])
t5_beam1_root = df_pred.t5_predicted_categories.apply(lambda beams: beams[1][0])
shown_columns = [
    'title', 'category', 'lance_predicted_category', 't5_predicted_categories', 't5_predicted_category_prob'
]
df_pred[
    (true_root == lance_root) & (true_root != t5_beam0_root) & (true_root == t5_beam1_root)
][shown_columns].sample(10, random_state=42).to_dict('records')

[{'title': 'Natural Veg Tanned Cowhide Purse Wallet Shoe Lining First Layer Cowhide Leather DIY Craft 2mm Thick',
  'category': ['home & garden',
   'arts, crafts & sewing',
   'apparel sewing & fabric',
   'synthetic leather'],
  'lance_predicted_category': ['home & garden',
   'arts, crafts & sewing',
   'apparel sewing & fabric',
   'genuine leather'],
  't5_predicted_categories': [['luggage & bags', 'bag parts & accessories'],
   ['home & garden',
    'arts, crafts & sewing',
    'leathercraft',
    'leathercraft accessories']],
  't5_predicted_category_prob': [0.003665035590529442,
   1.5833576116897063e-10]},
 {'title': 'HOT Super Brand Mens Underwear Boxer Shorts Mens Trunks Breathable Ice Silk Male Panties Underpants Underwear',
  'category': ["men's clothing",
   'underwear & sleepwear',
   "men's underwear",
   'boxers'],
  'lance_predicted_category': ["men's clothing",
   'underwear & sleepwear',
   "men's underwear",
   'boxers'],
  't5_predicted_categories': [['sports',
  