In [1]:
import sys 
sys.path.append('../../')
from main import LLM
from main_utils import Trie

In [5]:
from pytorch_lightning.utilities.deepspeed import convert_zero_checkpoint_to_fp32_state_dict

# lightning deepspeed has saved a directory instead of a file
save_path = '../../models/product_title_taxonomy_classification/version_4/epoch=0-step=545514.ckpt'
output_path = '../../models/product_title_taxonomy_classification/version_4/epoch=0-step=545514.ckpt/pytorch_model.bin'


In [None]:
convert_zero_checkpoint_to_fp32_state_dict(save_path, output_path)

In [6]:
import torch

In [7]:
ckpt = torch.load(output_path)

In [16]:
model = LLM.load_from_checkpoint(output_path, strict=False)

INFO:root:Unused kwargs when getting t5-large: {}
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
  rank_zero_warn(


In [18]:
# fix ckpt missing weights
ckpt = torch.load(
    '../../models/product_title_taxonomy_classification/version_4/epoch=0-step=545514.ckpt/checkpoint/mp_rank_00_model_states.pt', 
    map_location=torch.device('cpu')
)
patch_state_dict = {}
for i in ['transformer.encoder.embed_tokens.weight', 'transformer.decoder.embed_tokens.weight', 'transformer.lm_head.weight']:
    patch_state_dict[i] = ckpt['module']['module.' + i]


In [26]:
for i in patch_state_dict:
    print(i, 'error', ((model.state_dict()[i] - patch_state_dict[i])**2).sum())

transformer.encoder.embed_tokens.weight error tensor(0.)
transformer.decoder.embed_tokens.weight error tensor(0.)
transformer.lm_head.weight error tensor(0.)


In [27]:
model.eval()
model.cuda()
tokenizer = model.tokenizer

In [43]:

outs = model.transformer.generate(
    **model.tokenizer(
        "Top-down categorize Aliexpress product: " + \
            "ALONG FIT High Waisted Tummy Control Leggings-Yoga-Pants with Pockets Leggings for Women Workout Squat Proof Tights", 
        return_tensors='pt'
    ).to('cuda'), 
    num_beams=5, num_return_sequences=5, output_scores=True, return_dict_in_generate=True, length_penalty=0, max_new_tokens=50
)
model.tokenizer.batch_decode(outs.sequences), outs.sequences_scores.exp(), outs.sequences_scores.exp().sum()

(["<pad> women's clothing > bottoms > fashion leggings</s><pad><pad>",
  '<pad> sports > sports clothing > tights > running tights</s><pad><pad><pad>',
  '<pad> sports > sports clothing > pants > trainning & exercise pants</s>',
  '<pad> sports > fitness & body building > yoga > yoga pants</s><pad><pad>',
  '<pad> sports > racquet sports > tennis > tennis pants</s><pad><pad>'],
 tensor([0.3565, 0.3226, 0.1016, 0.0787, 0.0234], device='cuda:0'),
 tensor(0.8828, device='cuda:0'))

In [44]:

outs = model.transformer.generate(
    **model.tokenizer(
        "Bottom-up categorize Aliexpress product: " + \
            "ALONG FIT High Waisted Tummy Control Leggings-Yoga-Pants with Pockets Leggings for Women Workout Squat Proof Tights", 
        return_tensors='pt'
    ).to('cuda'), 
    num_beams=5, num_return_sequences=5, output_scores=True, return_dict_in_generate=True, length_penalty=0, max_new_tokens=50
)
model.tokenizer.batch_decode(outs.sequences), outs.sequences_scores.exp(), outs.sequences_scores.exp().sum()

(['<pad> yoga pants <unk> yoga <unk> fitness & body building <unk> sports</s><pad><pad>',
  '<pad> running tights <unk> tights <unk> sports clothing <unk> sports</s><pad><pad><pad>',
  '<pad> trainning & exercise pants <unk> pants <unk> sports clothing <unk> sports</s>',
  "<pad> fashion leggings <unk> bottoms <unk> women's clothing</s><pad><pad><pad>",
  "<pad> pants <unk> bottoms <unk> women's clothing</s><pad><pad><pad><pad><pad><pad>"],
 tensor([0.2948, 0.1508, 0.1357, 0.0937, 0.0472], device='cuda:0'),
 tensor(0.7222, device='cuda:0'))

In [45]:
import yaml 
config = yaml.safe_load(open('../../models/product_title_taxonomy_classification/version_4/config.yaml', 'r'))

In [46]:
config['data']['data_source_yaml_path']

'datasets/product_title_taxonomy_classification/wish-aliexpress.yaml'

In [47]:
import os

In [48]:
data_config = yaml.safe_load(open(
    os.path.join('..','..',config['data']['data_source_yaml_path']), 
'r'))

In [52]:
data_config['test']

[{'path': 'data/wish_products/Wish_Meta_Test.json',
  'repo': 'git@github.com:junwang-wish/query_understanding_data.git',
  'rev': None,
  'task_prefix': 'categorize Aliexpress product: '}]

In [53]:
import dvc.api

In [None]:
resource_url = dvc.api.get_url(
    data_config['train'][0]['path'],
    repo=data_config['train'][0]['repo'],
    rev=data_config['train'][0]['rev']
)

In [55]:
import pandas as pd

In [56]:
df = pd.read_json(resource_url, lines=True)

INFO:aiobotocore.credentials:Found credentials in environment variables.


In [None]:
df_tax = pd.read_json(
    dvc.api.get_url(
        'data/taxonomy/wish_newtax.json',
        repo='git@github.com:junwang-wish/query_understanding_data.git'
    ), lines=True
)

In [82]:
paths = df_tax[df_tax['is_leaf']]['category_path'].apply(lambda x: ' > '.join([i.strip().lower() for i in x.split(' > ')])).tolist()
paths_rev = df_tax[df_tax['is_leaf']]['category_path'].apply(lambda x: ' < '.join([i.strip().lower() for i in x.split(' > ')[::-1]])).tolist()

In [77]:
len(paths), len(paths_rev)

(5291, 5291)

In [75]:
paths_data = list(set(df.category.apply(lambda x: ' > '.join(x))))
paths_data_rev = list(set(df.category.apply(lambda x: ' < '.join(x[::-1]))))

In [78]:
len(paths_data), len(paths_data_rev)

(5278, 5278)

In [84]:
len(set(paths_data).intersection(set(paths))) / len(set(paths_data).union(set(paths)))

0.8422520481087676

In [85]:
len(set(paths_data_rev).intersection(set(paths_rev))) / len(set(paths_data_rev).union(set(paths_rev)))

0.8422520481087676

In [86]:
trie = Trie([
    [tokenizer.pad_token_id] + tokenizer.encode(i) + [tokenizer.eos_token_id] for i in paths_data
])

trie_rev = Trie([
    [tokenizer.pad_token_id] + tokenizer.encode(i) + [tokenizer.eos_token_id] for i in paths_data_rev
])

In [87]:
def constraint(batch_id, sent):
    return trie.get(sent.tolist())

def constraint_rev(batch_id, sent):
    return trie_rev.get(sent.tolist())

In [89]:
outs = model.transformer.generate(
    **model.tokenizer(
        "Top-down categorize Aliexpress product: " + \
            "ALONG FIT High Waisted Tummy Control Leggings-Yoga-Pants with Pockets Leggings for Women Workout Squat Proof Tights", 
        return_tensors='pt'
    ).to('cuda'), 
    num_beams=5, num_return_sequences=5, output_scores=True, return_dict_in_generate=True, length_penalty=0, max_new_tokens=50,
    prefix_allowed_tokens_fn=constraint
)
model.tokenizer.batch_decode(outs.sequences), outs.sequences_scores.exp(), outs.sequences_scores.exp().sum()

(["<pad> women's clothing > bottoms > fashion leggings</s><pad><pad>",
  '<pad> sports > sports clothing > tights > running tights</s><pad><pad><pad>',
  '<pad> sports > sports clothing > pants > trainning & exercise pants</s>',
  '<pad> sports > fitness & body building > yoga > yoga pants</s><pad><pad>',
  '<pad> sports > racquet sports > tennis > tennis pants</s><pad><pad>'],
 tensor([0.3565, 0.3226, 0.1016, 0.0787, 0.0234], device='cuda:0'),
 tensor(0.8828, device='cuda:0'))

In [90]:
outs = model.transformer.generate(
    **model.tokenizer(
        "Bottom-up categorize Aliexpress product: " + \
            "ALONG FIT High Waisted Tummy Control Leggings-Yoga-Pants with Pockets Leggings for Women Workout Squat Proof Tights", 
        return_tensors='pt'
    ).to('cuda'), 
    num_beams=5, num_return_sequences=5, output_scores=True, return_dict_in_generate=True, length_penalty=0, max_new_tokens=50,
    prefix_allowed_tokens_fn=constraint_rev
)
model.tokenizer.batch_decode(outs.sequences), outs.sequences_scores.exp(), outs.sequences_scores.exp().sum()

(['<pad> yoga pants <unk> yoga <unk> fitness & body building <unk> sports</s><pad><pad>',
  '<pad> running tights <unk> tights <unk> sports clothing <unk> sports</s><pad><pad><pad>',
  '<pad> trainning & exercise pants <unk> pants <unk> sports clothing <unk> sports</s>',
  "<pad> fashion leggings <unk> bottoms <unk> women's clothing</s><pad><pad><pad>",
  "<pad> pants <unk> bottoms <unk> women's clothing</s><pad><pad><pad><pad><pad><pad>"],
 tensor([0.2948, 0.1508, 0.1357, 0.0937, 0.0472], device='cuda:0'),
 tensor(0.7222, device='cuda:0'))

In [91]:
from tqdm import tqdm

In [92]:
df = df.sample(frac=1.0)

In [100]:
recs = []
for rec in tqdm(df.iloc[:200].to_dict('records')[:100]):
    outs = model.transformer.generate(
        **model.tokenizer(
            "Top-down categorize Aliexpress product: " + \
                rec['title'], 
            return_tensors='pt'
        ).to('cuda'), 
        num_beams=5, num_return_sequences=5, output_scores=True, return_dict_in_generate=True, length_penalty=0, max_new_tokens=50,
        prefix_allowed_tokens_fn=constraint
    )
    outs_rev = model.transformer.generate(
        **model.tokenizer(
            "Bottom-up categorize Aliexpress product: " + \
                rec['title'], 
            return_tensors='pt'
        ).to('cuda'), 
        num_beams=5, num_return_sequences=5, output_scores=True, return_dict_in_generate=True, length_penalty=0, max_new_tokens=50,
        prefix_allowed_tokens_fn=constraint_rev
    )

    preds = model.tokenizer.batch_decode(outs.sequences, skip_special_tokens=True)
    probs = outs.sequences_scores.exp()
    rec['t5_predicted_categories'] = [pred.split(' > ') for pred in preds]
    rec['t5_predicted_category_prob'] = probs.detach().cpu().tolist()

    preds_rev = model.tokenizer.batch_decode(outs_rev.sequences)
    preds_rev = [i.replace('<unk>', '<').replace('</s>', '').replace('<pad>', '').strip() for i in preds_rev]
    probs_rev = outs_rev.sequences_scores.exp()
    rec['t5_predicted_categories_rev'] = [pred.split(' < ') for pred in preds_rev]
    rec['t5_predicted_category_prob_rev'] = probs.detach().cpu().tolist()
    recs.append(rec)

100%|██████████| 100/100 [03:11<00:00,  1.91s/it]


In [101]:
df_pred = pd.DataFrame(recs)

In [102]:
df_pred.head(2)

Unnamed: 0,title,category,text,t5_predicted_categories,t5_predicted_category_prob,t5_predicted_categories_rev,t5_predicted_category_prob_rev
0,8 Colors Strong Hyaluronic Acid Blossom Vitami...,"[beauty & health, makeup, face, primer]",8 Colors Strong Hyaluronic Acid Blossom Vitami...,"[[beauty & health, skin care, face, serum], [b...","[0.3518790304660797, 0.11798278242349625, 0.08...","[[facial sunscreen, sun, skin care, beauty & h...","[0.3518790304660797, 0.11798278242349625, 0.08..."
1,Snow-Socks Hiking-Equipment Self-Heating Magne...,"[sports, camping & hiking, hiking clothings, h...",Snow-Socks Hiking-Equipment Self-Heating Magne...,"[[sports, skiing & snowboarding, skiing socks]...","[0.797848105430603, 0.04138117656111717, 0.023...","[[skiing socks, skiing & snowboarding, sports]...","[0.797848105430603, 0.04138117656111717, 0.023..."


In [112]:
recs = []
for i in df_pred.to_dict('records'):
    tmp = set([tuple(j[::-1]) for j in i['t5_predicted_categories_rev']])
    i['t5_predicted_categories_bidirectional'] = [j for j in i['t5_predicted_categories'] if tuple(j) in tmp]
    recs.append(i)
df_pred = pd.DataFrame(recs)

In [116]:
# top-down prediction
for depth in range(7):
    tmp = df_pred[df_pred.category.apply(len) > depth]
    if len(tmp) > 0:
        print('Depth:', depth)
        print('Size: ', len(tmp))
        print('# Unique Targets', len(set(tmp.category.apply(lambda x: ''.join(x[:depth+1])))))
        print('ACC T5: ', (tmp.category.apply(lambda x: ''.join(x[:depth+1])) == \
            tmp.t5_predicted_categories.apply(lambda x: ''.join(x[0][:depth+1]) if len(x[0]) > depth else None)).mean())
        print('-' * 100)

Depth: 0
Size:  100
# Unique Targets 22
ACC T5:  0.88
----------------------------------------------------------------------------------------------------
Depth: 1
Size:  100
# Unique Targets 68
ACC T5:  0.74
----------------------------------------------------------------------------------------------------
Depth: 2
Size:  96
# Unique Targets 93
ACC T5:  0.6666666666666666
----------------------------------------------------------------------------------------------------
Depth: 3
Size:  59
# Unique Targets 58
ACC T5:  0.6440677966101694
----------------------------------------------------------------------------------------------------
Depth: 4
Size:  3
# Unique Targets 3
ACC T5:  1.0
----------------------------------------------------------------------------------------------------


In [117]:
# bottom-up prediction
for depth in range(7):
    tmp = df_pred[df_pred.category.apply(len) > depth]
    if len(tmp) > 0:
        print('Depth:', depth)
        print('Size: ', len(tmp))
        print('# Unique Targets', len(set(tmp.category.apply(lambda x: ''.join(x[:depth+1])))))
        print('ACC T5: ', (tmp.category.apply(lambda x: ''.join(x[:depth+1])) == \
            tmp.t5_predicted_categories_rev.apply(lambda x: ''.join(x[0][::-1][:depth+1]) if len(x[0]) > depth else None)).mean())
        print('-' * 100)

Depth: 0
Size:  100
# Unique Targets 22
ACC T5:  0.85
----------------------------------------------------------------------------------------------------
Depth: 1
Size:  100
# Unique Targets 68
ACC T5:  0.73
----------------------------------------------------------------------------------------------------
Depth: 2
Size:  96
# Unique Targets 93
ACC T5:  0.6354166666666666
----------------------------------------------------------------------------------------------------
Depth: 3
Size:  59
# Unique Targets 58
ACC T5:  0.5932203389830508
----------------------------------------------------------------------------------------------------
Depth: 4
Size:  3
# Unique Targets 3
ACC T5:  0.6666666666666666
----------------------------------------------------------------------------------------------------


In [118]:
# bidirectional set intersection of top-down and bottom-up
for depth in range(7):
    tmp = df_pred[(df_pred.category.apply(len) > depth) & (df_pred.t5_predicted_categories_bidirectional.apply(len) > 0)]
    if len(tmp) > 0:
        print('Depth:', depth)
        print('Size: ', len(tmp))
        print('# Unique Targets', len(set(tmp.category.apply(lambda x: ''.join(x[:depth+1])))))
        print('ACC T5: ', (tmp.category.apply(lambda x: ''.join(x[:depth+1])) == \
            tmp.t5_predicted_categories_bidirectional.apply(lambda x: ''.join(x[0][:depth+1]) if len(x[0]) > depth else None)).mean())
        print('-' * 100)

Depth: 0
Size:  99
# Unique Targets 22
ACC T5:  0.8888888888888888
----------------------------------------------------------------------------------------------------
Depth: 1
Size:  99
# Unique Targets 68
ACC T5:  0.7474747474747475
----------------------------------------------------------------------------------------------------
Depth: 2
Size:  95
# Unique Targets 92
ACC T5:  0.6736842105263158
----------------------------------------------------------------------------------------------------
Depth: 3
Size:  59
# Unique Targets 58
ACC T5:  0.6440677966101694
----------------------------------------------------------------------------------------------------
Depth: 4
Size:  3
# Unique Targets 3
ACC T5:  1.0
----------------------------------------------------------------------------------------------------


In [119]:
depth = 0
tmp = df_pred[df_pred.category.apply(len) > depth]
errors = tmp[(tmp.category.apply(lambda x: ''.join(x[:depth+1])) != \
            tmp.t5_predicted_categories.apply(lambda x: ''.join(x[0][:depth+1]) if len(x[0]) > depth else None))]

In [120]:
# lots errors are from label not model
errors[['title', 'category', 't5_predicted_categories']].to_dict('records')

[{'title': 'Men Women Heated Insoles USB Rechargeable Winter Warm Heated  Insoles Lightweight Free to',
  'category': ['mother & kids',
   'baby stroller & accessories',
   'lightweight stroller'],
  't5_predicted_categories': [['shoes', 'shoe accessories', 'insoles'],
   ['sports', 'sneakers', 'sneaker accessories'],
   ['home & garden',
    'household merchandises',
    'warming products',
    'warm paste pads'],
   ['home & garden',
    'household merchandises',
    'warming products',
    'electric heating pads'],
   ['home & garden',
    'household merchandises',
    'warming products',
    'electric blankets']]},
 {'title': 'Qianli apollo interstellar um para iphone fixo fotossensível cor original linha de dados detecção código da bateria ler e escrever',
  'category': ['tools', 'tool sets', 'power tool sets'],
  't5_predicted_categories': [['cellphones & telecommunications',
    'mobile phone parts',
    'mobile phone circuits'],
   ['cellphones & telecommunications',
    'mobil