In [1]:
import pandas as pd
import json
from tqdm import tqdm
import numpy as np
import dvc.api
import zarr
from collections import defaultdict
from itertools import islice
from copy import deepcopy

In [96]:
results = defaultdict(list)

# seqclf

In [86]:
label_map_file = "/workspaces/query_understanding_model/datasets/taxonomy/wish_v1.2.1_newtax_allpaths.txt"

In [87]:
label_map = {}
with open(label_map_file, 'r') as f:
    for l in f:
        l = l.replace('\n', '').strip()
        if len(l):
            label_map[l] = len(label_map)

label_map_rev = {label_map[i]: i for i in label_map}

df_tax = pd.read_json(
    dvc.api.get_url(
        'data/taxonomy/wish_newtax.json',
        repo='git@github.com:junwang-wish/query_understanding_data.git'
    ), lines=True
)
df_tax['category_path'] = df_tax['category_path'].str.lower().str.strip()
df_tax = df_tax[df_tax['category_path'] != '']
df_tax['category_index'] = df_tax['category_path'].apply(lambda x: label_map[x])


In [97]:
df_tax_leaf = df_tax[df_tax.is_leaf]
leaf_index = df_tax_leaf['category_index'].to_numpy()

In [None]:
with zarr.open('/workspaces/query_understanding_model/models/product_title_multitask/version_1/seqclf-epoch=0-step=349636--wish-tahoe-dedup-train-predict-inputonly.zarr', 'r') as z:
    for i in list(z.array_keys()):
        arr = z[i]
        c = 0
        for subarr in tqdm(arr):
            c += 1
            bidx = int(subarr[2])
            if bidx not in results:
                print(bidx)
                logits = subarr[3:]
                pred = leaf_index[logits[leaf_index].argmax()]
                results[bidx].append(pred)

In [122]:
max(results)

103665516.0

In [124]:
len(results)

103665517

# clm

In [128]:
from thefuzz import process as fuzz_process

In [132]:
label_map_list = [i for i in label_map]

In [139]:
fuzz_map_dict = {}

In [141]:
with open('/workspaces/query_understanding_model/models/product_title_multitask/version_1/clm-epoch=0-step=349636--wish-tahoe-dedup-pseudo-test-simpleprompt-topdown.json', 'r') as f:
    for l in tqdm(f, total=310996551):
        dat = json.loads(l)
        bidx = int(dat['batch_indices'])
        if dat['rank_indices'] == 0:
            if bidx in results:
                pred_text = dat['prediction_decoded']
                if pred_text in label_map:
                    pred = label_map[pred_text]
                elif pred_text in fuzz_map_dict:
                    match = fuzz_map_dict[pred_text]
                    pred = label_map[match]
                else:
                    match = fuzz_process.extractOne(pred_text, label_map_list)[0]
                    fuzz_map_dict[pred_text] = match
                    pred = label_map[match]
                pred = int(pred)
                if len(results[bidx]) == 1:
                    results[bidx].append(pred)
                elif len(results[bidx]) == 2:
                    results[bidx][1] = pred 
                else:
                    raise Exception()
            else:
                print(bidx)

100%|██████████| 310996551/310996551 [25:20<00:00, 204488.63it/s]


In [142]:
set([len(results[i]) for i in results])

{2}

In [143]:
len(results)

103665517

In [144]:
# import pickle 
# with open("seqclf_clm_results.pkl", "wb") as f:
#     pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)

# emb

In [157]:
df_tax_emb = pd.read_json("/workspaces/query_understanding_model/models/product_title_multitask/version_1/emb-epoch=0-step=349636--wish-newtax-v1.2.1--inputemb.json", 
    lines=True)

In [158]:
df_tax_emb_input = pd.read_json(
    dvc.api.get_url(
        'data/taxonomy/wish_newtax_converted_to_data.json',
        repo='git@github.com:junwang-wish/query_understanding_data.git'
    ), lines=True
)

In [159]:
df_tax_emb_input['category_index'] = df_tax_emb_input['title'].apply(lambda x: label_map[x]).tolist()

In [160]:
df_tax_emb = df_tax_emb.sort_values('batch_indices')

In [164]:
df_tax_emb_input["embedding"] = df_tax_emb.embedding.tolist()

In [165]:
del df_tax_emb

In [168]:
df_tax_emb_input_leaf = df_tax_emb_input[df_tax_emb_input.is_leaf]

In [175]:
leaf_index = df_tax_emb_input_leaf.category_index.to_numpy()
leaf_embs = np.array(df_tax_emb_input_leaf.embedding.tolist())

In [179]:
from torch import nn
import torch

In [185]:
emb_tensor_T = nn.functional.normalize(torch.tensor(leaf_embs).cuda(), 1).T

In [197]:
with zarr.open('/workspaces/query_understanding_model/models/product_title_multitask/version_1/emb-epoch=0-step=349636--wish-tahoe-dedup-pseudo-test--inputemb.zarr', 'r') as z:
    for i in list(z.array_keys()):
        arr = z[i]
        for subarr_start in tqdm(range(0, len(arr), 1000)):
            subarr_end = subarr_start + 1000
            subarr = arr[subarr_start: subarr_end]
            bidxs = subarr[:, 2].astype(int)
            emb_title = subarr[:, 3:]
            preds_inds = nn.functional.normalize(torch.tensor(emb_title).cuda(), 1).mm(
                emb_tensor_T).argmax(1).detach().cpu().numpy()
            preds = leaf_index[preds_inds]
            for bidx, pred in zip(bidxs, preds):
                if bidx in results:
                    if len(results[bidx]) == 2:
                        results[bidx].append(pred) 
                    else:
                        raise Exception()
                else:
                    print(bidx)
                    raise Exception()

100%|██████████| 14810/14810 [12:22<00:00, 19.93it/s]
100%|██████████| 14810/14810 [12:39<00:00, 19.51it/s]
100%|██████████| 14810/14810 [12:16<00:00, 20.10it/s]
100%|██████████| 14810/14810 [12:25<00:00, 19.85it/s]
100%|██████████| 14810/14810 [12:56<00:00, 19.07it/s]
100%|██████████| 14810/14810 [15:41<00:00, 15.73it/s]
100%|██████████| 14810/14810 [14:44<00:00, 16.75it/s]


In [198]:
# import pickle 
# with open("seqclf_clm_emb_results.pkl", "wb") as f:
#     pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)

In [199]:
set([len(results[i]) for i in results])

{3}

# save to pandas

In [200]:
import sys 
sys.path.append("../..")

In [201]:
from main_utils import LLMData

In [202]:
LLMData('/workspaces/query_understanding_model/datasets/product_title_taxonomy_classification/wish-tahoe-dedup-pseudo-test-simpleprompt-topdown.yaml').hparams

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


"batch_size":            16
"data_hash":             7163b3e63222d6c970c99c02dfeb73ff
"data_source":           OrderedDict([('is_encoder_decoder', True), ('preprocess', OrderedDict([('transform', 'top-down')])), ('test', [{'path': 'data/wish_products_truetag_tahoe/Wish_Truetag_Tahoe_Meta_Train_Dedup.json', 'repo': 'git@github.com:junwang-wish/query_understanding_data.git', 'rev': None, 'task_prefix': 'categorize product: '}])])
"data_source_yaml_path": /workspaces/query_understanding_model/datasets/product_title_taxonomy_classification/wish-tahoe-dedup-pseudo-test-simpleprompt-topdown.yaml
"max_length":            250
"max_length_out":        100
"model_name":            t5-base
"num_workers":           80
"overwrite_cache":       False
"predict_on_test":       True
"raw_cache_dir":         /data/junwang/.cache/general
"raw_cache_dir_folder":  /data/junwang/.cache/general/7163b3e63222d6c970c99c02dfeb73ff

In [203]:
!ls /data/junwang/.cache/general/7163b3e63222d6c970c99c02dfeb73ff

test.json.gz  test.json.gz.lock


In [204]:
import gzip

In [208]:
c = 0
with gzip.open('/data/junwang/.cache/general/7163b3e63222d6c970c99c02dfeb73ff/test.json.gz', 'r') as f:
    for l in tqdm(f):
        if len(l):
            c += 1

103665517it [04:23, 393302.07it/s]


In [209]:
len(results), c

(103665517, 103665517)

In [223]:
ind = 0
c_allmatch = 0
with gzip.open('/data/junwang/.cache/general/7163b3e63222d6c970c99c02dfeb73ff/test.json.gz', 'r') as f, \
        gzip.open('wish_tahoe_title_dedup_multitask_v0.1_preds.json.gz', 'w') as f_out, \
        gzip.open('wish_tahoe_title_dedup_multitask_v0.1_preds_allmatch.json.gz', 'w') as f_out_allmatch:
    for l in tqdm(f, total=103665517):
        if len(l):
            dat = json.loads(l)
            title = dat['text_input'][29:]
            truetag = dat['text_output'].split(" > ")
            preds = results[ind]
            if preds[0] == preds[1] == preds[2]:
                all_match = True
                c_allmatch += 1
            else:
                all_match = False
            preds_cat = [label_map_rev[i].split(" > ") for i in preds]
            seqclf, clm, emb = preds_cat
            out_dat = {
                "title": title,
                "category_truetag": truetag,
                "multitask_seqclf_v0.1_predicted_category": seqclf,
                "multitask_emb_v0.1_predicted_category": emb,
                "multitask_clm_v0.1_predicted_category": clm,
                "all_match": all_match
            }
            ind += 1
            out_text = (json.dumps(out_dat) + "\n").encode("utf-8")
            f_out.write(out_text)
            if all_match:
                f_out_allmatch.write(out_text)

100%|██████████| 103665517/103665517 [2:41:53<00:00, 10672.06it/s] 


In [224]:
!du -sh *.json.gz

8.1G	wish_tahoe_title_dedup_multitask_v0.1_preds.json.gz
2.3G	wish_tahoe_title_dedup_multitask_v0.1_preds_allmatch.json.gz


In [226]:
!zcat wish_tahoe_title_dedup_multitask_v0.1_preds_allmatch.json.gz | head -n 1 | jq .

{
  "title": "1997 Action Packed #118 Bryan Cox DD NFL Football Trading Card",
  "category_truetag": [
    "hobbies",
    "collectible items"
  ],
  "multitask_seqclf_v0.1_predicted_category": [
    "toys & hobbies",
    "hobby & collectibles",
    "game collection cards"
  ],
  "multitask_emb_v0.1_predicted_category": [
    "toys & hobbies",
    "hobby & collectibles",
    "game collection cards"
  ],
  "multitask_clm_v0.1_predicted_category": [
    "toys & hobbies",
    "hobby & collectibles",
    "game collection cards"
  ],
  "all_match": true
}

gzip: stdout: Broken pipe


# split files

In [2]:
df_allmatch = pd.read_json('/workspaces/query_understanding_model/datasets/wish_tahoe_dedup_pseudolabel/wish_tahoe_title_dedup_multitask_v0.1_preds_allmatch.json.gz', 
    lines=True)

In [4]:
df_allmatch = df_allmatch.rename(columns={i: i.replace('v0.1', 'v0_1') for i in df_allmatch})

In [5]:
df_allmatch.head()

Unnamed: 0,title,category_truetag,multitask_seqclf_v0_1_predicted_category,multitask_emb_v0_1_predicted_category,multitask_clm_v0_1_predicted_category,all_match
0,1997 Action Packed #118 Bryan Cox DD NFL Footb...,"[hobbies, collectible items]","[toys & hobbies, hobby & collectibles, game co...","[toys & hobbies, hobby & collectibles, game co...","[toys & hobbies, hobby & collectibles, game co...",True
1,2020 Summer T Shirt Tshirt -Lifting is Cheaper...,"[fashion, tops]","[sports, sports clothing, shirts, trainning & ...","[sports, sports clothing, shirts, trainning & ...","[sports, sports clothing, shirts, trainning & ...",True
2,"3PCS Car Seat Cushions, Peel and Stick Pink Fl...","[automotive, parts & accessories]","[automobiles & motorcycles, interior accessori...","[automobiles & motorcycles, interior accessori...","[automobiles & motorcycles, interior accessori...",True
3,500 Pieces Educational Puzzle Games Toys Cute ...,"[games, hobbies, puzzles]","[toys & hobbies, puzzles & games, puzzles]","[toys & hobbies, puzzles & games, puzzles]","[toys & hobbies, puzzles & games, puzzles]",True
4,Antione Bar Name Necklace Gold and Silver Colo...,"[accessories, necklaces]","[jewelry & accessories, customized jewelry, cu...","[jewelry & accessories, customized jewelry, cu...","[jewelry & accessories, customized jewelry, cu...",True


In [6]:
df_allmatch[['title', 'multitask_seqclf_v0_1_predicted_category']].sample(5).to_dict('records')

[{'title': 'Metallic Sewing Buttons Decorative On Leather Vintage Black For Brass Supplies Craft Snap Fastener D = 21mm 5pcs Silver Color',
  'multitask_seqclf_v0_1_predicted_category': ['home & garden',
   'arts, crafts & sewing',
   'apparel sewing & fabric',
   'buttons']},
 {'title': 'Print-Floral-Patterns Round Mouse Pad 7.9X7.9 In Black 4 Pcs',
  'multitask_seqclf_v0_1_predicted_category': ['computer & office',
   'mouse & keyboards',
   'mouse pads']},
 {'title': 'GFA Return to Forever * CHICK COREA * Signed Autograph 11x14 Photo PROOF C1',
  'multitask_seqclf_v0_1_predicted_category': ['home & garden',
   'home decor',
   'photo albums']},
 {'title': 'New Jesus Stainless Steel Round Cross Pendant',
  'multitask_seqclf_v0_1_predicted_category': ['home & garden',
   'home decor',
   'christian decor',
   'wall crosses']},
 {'title': "- Women's Clothing V-Neck Lace Dress Long Sleeve Sideslit Solid Color Evening Dress Pull Strip Stitching Sexy Two-Piece Ladies Dress Set",
  'multit

In [11]:
len(df_allmatch)

31846838

In [12]:
df_allmatch = df_allmatch[(df_allmatch.category_truetag.apply(lambda x: len(x) > 0)) & \
    (df_allmatch.multitask_seqclf_v0_1_predicted_category.apply(lambda x: len(x) > 0))]

In [13]:
len(df_allmatch)

31846838

In [14]:
df_allmatch = df_allmatch.sample(frac=1.0, random_state=42)

In [15]:
import numpy as np

In [16]:
valtest_size = 10000

In [17]:
df_allmatch_train = df_allmatch.head(len(df_allmatch) - valtest_size)
df_allmatch_valtest = df_allmatch.tail(valtest_size)
df_allmatch_val, df_allmatch_test = np.array_split(df_allmatch_valtest, 2)

In [22]:
len(df_allmatch_train) + len(df_allmatch_val) + len(df_allmatch_test), len(df_allmatch)

(31846838, 31846838)

In [23]:
len(df_allmatch_train)

31836838

In [20]:
df_allmatch_train_chunk_1, df_allmatch_train_chunk_2, df_allmatch_train_chunk_3 = np.array_split(df_allmatch_train, 3)

In [24]:
len(df_allmatch_train_chunk_1)

10612280

In [25]:
df_allmatch_train_chunk_1[['title', 'category_truetag', 'multitask_seqclf_v0_1_predicted_category']].to_json(
    'wish_tahoe_title_dedup_multitask_v0.1_preds_allmatch_Train_Chunk_1.json', lines=True, orient='records')
df_allmatch_train_chunk_2[['title', 'category_truetag', 'multitask_seqclf_v0_1_predicted_category']].to_json(
    'wish_tahoe_title_dedup_multitask_v0.1_preds_allmatch_Train_Chunk_2.json', lines=True, orient='records')
df_allmatch_train_chunk_3[['title', 'category_truetag', 'multitask_seqclf_v0_1_predicted_category']].to_json(
    'wish_tahoe_title_dedup_multitask_v0.1_preds_allmatch_Train_Chunk_3.json', lines=True, orient='records')
df_allmatch_val[['title', 'category_truetag', 'multitask_seqclf_v0_1_predicted_category']].to_json(
    'wish_tahoe_title_dedup_multitask_v0.1_preds_allmatch_Val.json', lines=True, orient='records')
df_allmatch_test[['title', 'category_truetag', 'multitask_seqclf_v0_1_predicted_category']].to_json(
    'wish_tahoe_title_dedup_multitask_v0.1_preds_allmatch_Test.json', lines=True, orient='records')

In [28]:
!du -sh *.json

1.4M	wish_tahoe_title_dedup_multitask_v0.1_preds_allmatch_Test.json
2.8G	wish_tahoe_title_dedup_multitask_v0.1_preds_allmatch_Train_Chunk_1.json
2.8G	wish_tahoe_title_dedup_multitask_v0.1_preds_allmatch_Train_Chunk_2.json
2.8G	wish_tahoe_title_dedup_multitask_v0.1_preds_allmatch_Train_Chunk_3.json
1.4M	wish_tahoe_title_dedup_multitask_v0.1_preds_allmatch_Val.json


In [29]:
!head -n 1 wish_tahoe_title_dedup_multitask_v0.1_preds_allmatch_Train_Chunk_1.json | jq .

{
  "title": "Diy Forest Mushroom Tales Panel Waterfall Koi Fish Jungle Green Meadow Elf Night View Case Ipad Air 4 Leather Case For Case Ipad 2020 / Air 4 / 2020 Pro 11/12.9 In Ipad 2020 / Air 4 / 2020 Pro 11/12.9 In",
  "category_truetag": [
    "gadgets",
    "ipad cases & covers"
  ],
  "multitask_seqclf_v0_1_predicted_category": [
    "computer & office",
    "tablet accessories",
    "tablets & e-books case"
  ]
}
