In [1]:
import torch
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
from random import sample, shuffle, randrange
from datetime import datetime
from pybay.bert import EBertModel, EBertForSequenceClassification, EBertTokenizer, EBertForMaskedLM
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer
from datasets import Dataset, load_metric, concatenate_datasets, DownloadConfig,load_dataset
from data import preprocess as prep
from experiments import breadcrumb as bc, box_experiments
from data.box_embedding import taxonomy_sampling
from torch.utils.data import RandomSampler, DataLoader, Subset
tokenizer = AutoTokenizer.from_pretrained('/data/ebay/data/jingcshi/bert-base-cased/', model_max_length=512)
clsmodel = AutoModelForSequenceClassification.from_pretrained('/data/ebay/data/jingcshi/bert-base-cased/').to('cuda:0')
maskmodel = AutoModelForMaskedLM.from_pretrained('/data/ebay/data/jingcshi/bert-base-cased/').to('cuda:0')

AllenNLP not available. Registrable won't work.
Some weights of the model checkpoint at /data/ebay/data/jingcshi/bert-base-cased/ were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertFor

In [24]:
cattree = prep.load_ebay_taxonomy({'data':{'raw_taxonomy_path': './data/2021-10-12', 'breadcrumb_link': 'id'}},pandas=False)
classlist = prep.load_ebay_taxonomy({'data':{'raw_taxonomy_path': './data/2021-10-12', 'breadcrumb_link': 'symbol'}},pandas=True)
catid_table = {}
breadcrumb_table = {}
for i,row in classlist.iterrows():
    catid_table[row['catid']] = i
    breadcrumb_table[row['catid']] = row['breadcrumb']

In [25]:
#raw_data = taxonomy_sampling.informed_sampling(cattree, breadcrumb_table, transitive=False, relax=2, save_dir='./data/box_embedding/taxonomy5.csv')
raw_data = pd.read_csv('./data/box_embedding/taxonomy5.csv',dtype={'cat1':str,'cat2':str,'label':np.int64}).dropna()

In [26]:
def breadcrumb_prep_m(bc):
    segs = bc.replace(', ',' and ').replace('&','and').split(' > ')
    shuffle(segs)
    text = 'A category of products defined by: ' + segs[0]
    if len(segs) > 1:
        for s in segs[1:]:
            text += ', '
            text += s
    return text + '.'

In [54]:
def breadcrumb_prep_k(breadcrumb):
    text = breadcrumb[0]
    return text[:14] + 'commercial ' + text[14:]

In [56]:
data_mb = pd.read_csv('./data/train/fine_tune_data_modified_breadcrumbs_and.csv')
data_kw = pd.read_csv('./data/train/fine_tune_data_keywords.csv')
data_kw['cat1'] = data_kw.apply(lambda row: breadcrumb_prep_k([row['cat1']]), axis=1)
data_kw['cat2'] = data_kw.apply(lambda row: breadcrumb_prep_k([row['cat2']]), axis=1)
#data_kw.to_csv('./data/train/fine_tune_data_altkeywords.csv',index=False)
#data_mb.to_csv('/data/ebay/notebooks/jingcshi/Curation/data/train/fine_tune_data_modified_breadcrumbs_and.csv',index=False)

In [57]:
training_mb = Dataset.from_pandas(data_mb).shuffle(seed=1729)
training_kw = Dataset.from_pandas(data_kw).shuffle(seed=1729)

In [58]:
def tokenize(tokenizer, raw, return_tensors=None):
    return tokenizer(raw["cat1"], raw["cat2"], padding='max_length', truncation=True, return_tensors=return_tensors, max_length=96)

In [59]:
BATCH_SIZE = 16
LEARNING_RATE = 5e-6
NUM_EPOCHS = 5
#tokenized_mb = training_mb.map(lambda x: tokenize(tokenizer, x), batched=True, batch_size=BATCH_SIZE).remove_columns(['cat1','cat2'])
tokenized_kw = training_kw.map(lambda x: tokenize(tokenizer, x), batched=True, batch_size=BATCH_SIZE).remove_columns(['cat1','cat2'])
#eval_mb = tokenized_mb.train_test_split(test_size=0.05)['test']
eval_kw = tokenized_kw.train_test_split(test_size=0.05)['test']

  0%|          | 0/10534 [00:00<?, ?ba/s]

In [61]:
dlcfg = DownloadConfig(proxies={'http':'http://httpproxy.vip.ebay.com:80','https':'http://httpproxy.vip.ebay.com:80'})
metric = load_metric("f1",download_config=dlcfg)

In [62]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [63]:
#training_args_mb = TrainingArguments("./utils/bert/ebert_trainer_mb", per_device_train_batch_size = BATCH_SIZE, num_train_epochs = NUM_EPOCHS, evaluation_strategy="epoch", learning_rate = LEARNING_RATE)
#trainer_mb = Trainer(model=clsmodel, args=training_args_mb, train_dataset=tokenized_mb, eval_dataset=eval_mb, compute_metrics=compute_metrics)
training_args_kw = TrainingArguments("./utils/bert/ebert_trainer_kw", per_device_train_batch_size = BATCH_SIZE, num_train_epochs = NUM_EPOCHS, evaluation_strategy="epoch", learning_rate = LEARNING_RATE)
trainer_kw = Trainer(model=clsmodel, args=training_args_kw, train_dataset=tokenized_kw, eval_dataset=eval_kw, compute_metrics=compute_metrics)

In [64]:
trainer_kw.train()

***** Running training *****
  Num examples = 168540
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 52670


Epoch,Training Loss,Validation Loss,F1
1,0.0004,0.0031,0.99763
2,0.0028,0.000735,0.999407
3,0.0007,0.000626,0.999407
4,0.0011,0.000135,0.999407
5,0.0007,1e-06,1.0


Saving model checkpoint to ./utils/bert/ebert_trainer_kw/checkpoint-500
Configuration saved in ./utils/bert/ebert_trainer_kw/checkpoint-500/config.json
Model weights saved in ./utils/bert/ebert_trainer_kw/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./utils/bert/ebert_trainer_kw/checkpoint-1000
Configuration saved in ./utils/bert/ebert_trainer_kw/checkpoint-1000/config.json
Model weights saved in ./utils/bert/ebert_trainer_kw/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./utils/bert/ebert_trainer_kw/checkpoint-1500
Configuration saved in ./utils/bert/ebert_trainer_kw/checkpoint-1500/config.json
Model weights saved in ./utils/bert/ebert_trainer_kw/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./utils/bert/ebert_trainer_kw/checkpoint-2000
Configuration saved in ./utils/bert/ebert_trainer_kw/checkpoint-2000/config.json
Model weights saved in ./utils/bert/ebert_trainer_kw/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./utils/bert

TrainOutput(global_step=52670, training_loss=0.002353950238801767, metrics={'train_runtime': 6303.1334, 'train_samples_per_second': 133.695, 'train_steps_per_second': 8.356, 'total_flos': 4.1573191190976e+16, 'train_loss': 0.002353950238801767, 'epoch': 5.0})

In [73]:
now = datetime.now().strftime('%Y%m%d-%H%M')
clsmodel.save_pretrained(f'/data/ebay/notebooks/jingcshi/Curation/utils/bert/{now}')

Configuration saved in /data/ebay/notebooks/jingcshi/Curation/utils/bert/20220708-1709/config.json
Model weights saved in /data/ebay/notebooks/jingcshi/Curation/utils/bert/20220708-1709/pytorch_model.bin


In [66]:
input_mb = tokenize(tokenizer,training_mb[157042],return_tensors='pt').to('cuda:0')
torch.softmax(clsmodel(**input_mb).logits.detach().cpu(),dim=1).numpy()

array([[2.8772837e-07, 9.9999976e-01]], dtype=float32)

In [65]:
training_kw[157042]

{'cat1': 'A category of commercial products defined by: business, industrial, cable, conduit, equipment, electrical, block.',
 'cat2': 'A category of commercial products defined by: block, electrical, business, industrial, cable, conduit, equipment.',
 'label': 1}

In [67]:
testdata = pd.read_csv('./data/test/subclass_golden_set.csv').dropna().astype({'cat1':str,'cat2':str})
validcats = list(catid_table.keys())
for i,row in testdata.iterrows():
    if str(row['cat1']) not in validcats or str(row['cat2']) not in validcats:
        testdata = testdata.drop([i])

In [68]:
THRESHOLD = 0.5
#clsmodel = EBertForSequenceClassification.from_pretrained('/data/ebay/notebooks/jingcshi/Curation/utils/bert/20220706-1751/').to('cuda:0')
confusion = {'y': np.array([],dtype=np.int32), 'y_pred': np.array([],dtype=np.int32)}
for _, row in testdata.iterrows():
    confusion['y'] = np.append(confusion['y'], row['label_loose'])
    inputs = tokenizer(breadcrumb_prep_k(breadcrumb_table[row['cat1']]),breadcrumb_prep_k(breadcrumb_table[row['cat2']]), padding='max_length', truncation=True, return_tensors='pt', max_length=96).to('cuda:0')                                                                                 
    pred = (torch.softmax(clsmodel(**inputs).logits.detach(),dim=1).cpu()[:,1] >= THRESHOLD).view(-1)
    confusion['y_pred'] = np.append(confusion['y_pred'], pred)
testresult = testdata
testresult['prediction'] = confusion['y_pred']
test_cm = box_experiments.build_confusion_matrix(confusion)
print(f'Test results:')
display(test_cm)
display(testresult)

Test results:


Unnamed: 0_level_0,"F1=0.7136, precision=0.5547, recall=1.0000","F1=0.7136, precision=0.5547, recall=1.0000"
Unnamed: 0_level_1,Predict subsumption,Predict negative
Label subsumption,71,0
Label negative,57,0


Unnamed: 0,cat1,cat2,label_loose,label_strict,prediction
0,13756,261979,0,0,1
1,13756,261981,0,0,1
2,13756,261983,0,0,1
3,13756,261984,0,0,1
4,261979,261983,0,0,1
...,...,...,...,...,...
131,22966,118985,0,0,1
132,261658,32884,1,0,1
133,118985,33034,1,1,1
134,262366,36028,1,0,1


In [28]:
raw_data

Unnamed: 0,cat1,cat2,label
0,175708,14969,1
1,41404,180008,1
2,119120,185209,1
3,261780,261833,1
4,799,13623,1
...,...,...,...
168535,136,16091,0
168536,32810,48094,0
168537,37964,19177,0
168538,3123,258749,0


In [60]:
data_kw

Unnamed: 0,cat1,cat2,label
0,A category of commercial products defined by: ...,A category of commercial products defined by: ...,1
1,A category of commercial products defined by: ...,A category of commercial products defined by: ...,1
2,A category of commercial products defined by: ...,A category of commercial products defined by: ...,1
3,A category of commercial products defined by: ...,A category of commercial products defined by: ...,1
4,A category of commercial products defined by: ...,A category of commercial products defined by: ...,1
...,...,...,...
168535,A category of commercial products defined by: ...,A category of commercial products defined by: ...,0
168536,A category of commercial products defined by: ...,A category of commercial products defined by: ...,0
168537,A category of commercial products defined by: ...,A category of commercial products defined by: ...,0
168538,A category of commercial products defined by: ...,A category of commercial products defined by: ...,0
