In [89]:
import os
from collections import defaultdict
from copy import deepcopy

import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer

In [2]:
# Load the T5 and mT5 tokenizers (in that order) so their handling of sentinel
# tokens can be compared below.
tokenizer_t5, tokenizer_mt5 = (
    AutoTokenizer.from_pretrained(checkpoint)
    for checkpoint in ('t5-base', 'google/mt5-base')
)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [20]:
# Tokens listed under 'additional_special_tokens' in the T5 tokenizer's special-token map;
# they are used further down as the placeholders in the question/answer texts.
t5_special_tokens = tokenizer_t5.special_tokens_map['additional_special_tokens']

In [21]:
# Peek at the first special token to confirm the list starts at '<extra_id_0>'.
t5_special_tokens[0]

'<extra_id_0>'

In [22]:
# Tokenizing all T5 special tokens (space-separated) with mT5 must yield exactly
# as many pieces as there are tokens, i.e. presumably one piece per token.
mt5_sentinel_pieces = tokenizer_mt5.tokenize(' '.join(t5_special_tokens))
assert len(t5_special_tokens) == len(mt5_sentinel_pieces)

In [23]:
# Compare mT5's handling of '<extra_id_99>' and '<extra_id_100>': per the output,
# the former stays one piece while the latter is broken into several sub-pieces.
tokenizer_mt5.tokenize('<extra_id_99>'), tokenizer_mt5.tokenize('<extra_id_100>')

(['▁<extra_id_99>'], ['▁<', 'extra', '_', 'id', '_100', '>'])

In [32]:
# Attribute definition sheet (note the space before '.csv' in the file name).
attribute_csv_path = (
    '/workspaces/multitask-llm-rnd/datasets/data/attribute_extraction_metadata_template/'
    'Initial Attribute Definition for First Release - UPDATED SHEET .csv'
)
df_attribute = pd.read_csv(attribute_csv_path)

In [36]:
# Does any attribute name contain ','? The formatting loop later joins values
# with ',', so presumably the character must not appear in the vocabulary.
df_attribute['attribute_field'].map(lambda field: ',' in field).any()

False

In [67]:
# Does any attribute name contain ':'? The formatting loop later writes
# '<name>: <sentinel>', so presumably ':' must stay unambiguous.
df_attribute['attribute_field'].map(lambda field: ':' in field).any()

False

In [76]:
# Does any attribute name contain ';'? The formatting loop later joins the
# per-attribute questions with ';', so presumably it must not occur in a name.
df_attribute['attribute_field'].map(lambda field: ';' in field).any()

False

In [49]:
# Collect every attribute value from the sheet, lower-cased and stripped.
# NOTE(review): eval() executes whatever expression is stored in each CSV cell,
# so this is only safe if the sheet is trusted. If the cells are plain Python
# literals, ast.literal_eval would be the safer parser — TODO confirm.
vals = []
for raw_values in df_attribute['category_attributevalue'].apply(eval):
    for value in raw_values:
        if not isinstance(value, list):
            vals.append(str(value).lower().strip())
            continue
        # A nested list is only accepted when it is empty.
        assert len(value) == 0

In [50]:
# Number of distinct normalized (lower-cased, stripped) attribute values in `vals`.
len(set(vals))

8267

In [77]:
# For each delimiter used by the serialized format (',', ';', ':' in that order),
# report whether any collected attribute value contains it.
tuple(
    any(delimiter in value for value in vals)
    for delimiter in (',', ';', ':')
)

(False, False, False)

In [88]:
# Labelled attribute-extraction files: two input modalities x three splits,
# listed as test, val, train within each modality.
fnames = [
    '/workspaces/multitask-llm-rnd/datasets/data/wish_attr_extract_label/processed/'
    f'appen_020323_030323_delivered_030623_validated_product_attr_{modality}_{split}.json'
    for modality in ('textandimg', 'textonly')
    for split in ('test', 'val', 'train')
]

In [102]:
def build_t5_denoise_pair(name_value_pairs, sentinel_tokens, rng):
    """Turn (attribute name, value) pairs into a T5 span-denoising question/answer.

    Names and values are stripped and lower-cased, and values sharing a name are
    grouped and de-duplicated. Each attribute gets one sentinel token:

    * question: ``'<name>: <sentinel_k>'`` fragments joined with ``';'``
    * answer:   ``'<sentinel_k><v1>,<v2> <sentinel_k+1>...'`` with a closing
      sentinel after the last attribute

    e.g. ``'alpha size: <extra_id_0>;materials: <extra_id_1>'`` and
    ``'<extra_id_0>l,s <extra_id_1>wicker <extra_id_2>'``.

    Args:
        name_value_pairs: iterable of ``(name, value)`` string pairs.
        sentinel_tokens: ordered sentinel tokens; must be strictly longer than
            the number of distinct attribute names (one extra closes the answer).
        rng: ``numpy.random.Generator`` used to shuffle attribute and value order.

    Returns:
        ``(question_text, answer_text)``; both are empty strings when there are
        no pairs.

    Raises:
        AssertionError: if there are not enough sentinel tokens.
    """
    values_by_name = defaultdict(list)
    for name, value in name_value_pairs:
        values_by_name[name.strip().lower()].append(value.strip().lower())

    names = list(values_by_name)
    assert len(names) < len(sentinel_tokens), (
        f'{len(names)} attributes need {len(names) + 1} sentinel tokens, '
        f'only {len(sentinel_tokens)} available'
    )
    rng.shuffle(names)

    question_parts = []
    answer_parts = []
    for idx, name in enumerate(names):
        question_parts.append(f'{name}: {sentinel_tokens[idx]}')
        if not answer_parts:
            # The answer opens with the first sentinel; every later sentinel is
            # appended as the terminator of the preceding attribute's values.
            answer_parts.append(f' {sentinel_tokens[idx]}')
        # Sort before shuffling: set iteration order depends on string hashing,
        # which would make the result irreproducible even with a seeded rng.
        values = sorted(set(values_by_name[name]))
        rng.shuffle(values)
        answer_parts.append(','.join(values))
        answer_parts.append(f' {sentinel_tokens[idx + 1]}')

    return ';'.join(question_parts).strip(), ''.join(answer_parts).strip()


# Seed a dedicated generator inside this cell so re-running it regenerates
# exactly the same files.
T5_DENOISE_SEED = 42
denoise_rng = np.random.default_rng(T5_DENOISE_SEED)

for fname in tqdm(fnames):
    df = pd.read_json(fname, lines=True)
    recs = []
    # NOTE: the loop variable is deliberately named `i`; a later cell inspects
    # the last processed record through it.
    for i in df.to_dict('records'):
        # Normalized pairs first, then custom ones, merged under the same names.
        all_pairs = [
            *i['attr_name_value_pairs_normalized'],
            *i['attr_name_value_pairs_custom'],
        ]
        question_text, answer_text = build_t5_denoise_pair(
            all_pairs, t5_special_tokens, denoise_rng
        )
        i['attr_name_value_pairs_all_lower_t5_denoise_question'] = question_text
        i['attr_name_value_pairs_all_lower_t5_denoise_answer'] = answer_text
        recs.append(i)
    df = pd.DataFrame(recs)

    out_fname = fname.replace('/processed/', '/processed2/').replace('.json', '_t5denoiseformat.json')
    # to_json does not create missing directories; make sure the target exists.
    out_dir = os.path.dirname(out_fname)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_json(out_fname, lines=True, orient='records')

100%|██████████| 6/6 [00:05<00:00,  1.02it/s]


In [103]:
# Inspect the last record processed by the formatting loop; this relies on the
# loop variable `i` still being bound in the notebook namespace.
i

{'label_ordering': 37361,
 'sample_method': 'only_text',
 'pid': '61a15b8d4bdcdf1db5fbd67f',
 'category': 'Home & Garden > Home Storage & Organization > Storage Baskets',
 'title': 'Santa,Elk,Penguin Styles Quality Hand-knitted Merry Xmas Wicker for Candy,Fruit Christmas Storage Basket Candy Box Christmas Present Table Decor',
 'description': 'Size: S:17*12.5 cm;  L: 20*18.5 cm\r\nMaterial:  Wicker + cloth\r\nItem Name: Christmas Storage Basket\r\nPackaging Included: 1 * Christmas Storage Basket\nStyle: Santa, Elk, Snowman, Penguin, Gingerbread Man\r\nNote:\r\nPlease allow a little differences due to manual measurement.\r\nDue to the difference between different monitors,the picture may not reflect the actual color of the item.\r\nThank you!',
 'main_img_url': nan,
 'rater_output_processed': 'Home & Garden > Home Storage & Organization > Storage Baskets > Materials > Wicker\nHome & Garden > Home Storage & Organization > Storage Baskets > Alpha Size > L\nHome & Garden > Home Storage & O

In [104]:
# Tokenize a sample question with T5: per the output, each sentinel is emitted as a
# bare '<extra_id_N>' piece and a separate '▁' piece appears before the ';'.
tokenizer_t5.tokenize('alpha size: <extra_id_0>;materials: <extra_id_1>')

['▁al',
 'pha',
 '▁size',
 ':',
 '<extra_id_0>',
 '▁',
 ';',
 'material',
 's',
 ':',
 '<extra_id_1>']

In [105]:
# Same sample question through mT5: per the output, sentinels come out as
# '▁<extra_id_N>' and no standalone '▁' piece precedes the ';'.
tokenizer_mt5.tokenize('alpha size: <extra_id_0>;materials: <extra_id_1>')

['▁alpha',
 '▁size',
 ':',
 '▁<extra_id_0>',
 ';',
 'material',
 's',
 ':',
 '▁<extra_id_1>']

In [106]:
# NOTE(review): identical to the earlier T5 tokenization check (same input, same
# output) — looks like a leftover re-run that could be dropped.
tokenizer_t5.tokenize('alpha size: <extra_id_0>;materials: <extra_id_1>')

['▁al',
 'pha',
 '▁size',
 ':',
 '<extra_id_0>',
 '▁',
 ';',
 'material',
 's',
 ':',
 '<extra_id_1>']

In [107]:
# Tokenize a sample answer string with mT5 to confirm the sentinels survive as single
# '▁<extra_id_N>' pieces when placed directly next to value text.
tokenizer_mt5.tokenize('<extra_id_0>l,s <extra_id_1>wicker <extra_id_2>')

['▁<extra_id_0>',
 'l',
 ',',
 's',
 '▁<extra_id_1>',
 'wick',
 'er',
 '▁<extra_id_2>']