In [7]:
from openai.embeddings_utils import get_embedding
import pandas as pd
from tqdm import tqdm
import numpy as np
from joblib import dump, load
import json

In [8]:
# The embedding download itself is implemented in write_appen_product_embedding_to_file.py.

In [5]:
%%timeit
# Time a single get_embedding call on a short string.
get_embedding('text')

263 ms ± 37.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Train a simple model

In [9]:
# Load the JSON-lines embedding file and discard rows without a product_id,
# since product_id is the key used to join against the labelled splits.
df_emb = (
    pd.read_json('appen_tolabel_product_attr_leftover.json', lines=True)
    .dropna(subset=['product_id'])
)

In [10]:
# Every remaining embedding row must have its own product_id.
assert df_emb['product_id'].is_unique

In [11]:
# The three labelled splits live next to each other and differ only in the
# split suffix, so the location is spelled out once here instead of being
# repeated per split.
split_path_template = (
    '/workspaces/multitask-llm-rnd/datasets/data/wish_attr_extract_label/processed/'
    'appen_020323_030323_delivered_030623_validated_product_attr_textonly_{split}.json'
)
# Each split is a JSON-lines file; read them in train / val / test order.
df_train, df_val, df_test = (
    pd.read_json(split_path_template.format(split=split_name), lines=True)
    for split_name in ('train', 'val', 'test')
)

In [12]:
# Within each labelled split, a product id (pid) may appear only once.
assert all(split_frame['pid'].is_unique for split_frame in (df_train, df_val, df_test))

In [13]:
# Peek at the first two embedding rows to check the column layout.
df_emb.head(n=2)

Unnamed: 0,openai_embedding,text,product_id,label_ordering
0,"[0.011446514166891, 0.005374236498028, 0.00731...",Product Title: Rattan Basket Pet Dome and Anim...,611bbb365b0bd8698b670d9d,0.0
1,"[0.0015560899628320002, -0.004062985070049, -0...",Product Title: 10Pcs 7Pcs 2Pcs 1Pcs Tempered G...,61a9c0160dcaf9e1da138df1,1.0


In [14]:
# Peek at the first two labelled training rows to check the column layout.
df_train.head(n=2)

Unnamed: 0,label_ordering,sample_method,pid,category,title,description,main_img_url,rater_output_processed,attr_name_value_pairs_normalized,attr_name_value_pairs_custom,attr_name_value_pairs_normalized_text
0,33187,only_text,618294e8f002fe5ad205758d,Jewelry & Accessories > Fine Jewelry > Bracele...,Amber Bracelet Ladies White Nectar Round Beads...,Style: bracelet/bracelet\nAmber Classification...,,Jewelry & Accessories > Fine Jewelry > Bracele...,"[[Department, Women], [Primary Color, White]]",[],Department|Women\nPrimary Color|White
1,4341,only_text,616eedbf2d6ce57507cf27c3,Sports > Fitness & Body Building > Yoga > Yoga...,YIJIN73 Yoga Pilates Skinny Workout Pants for ...,Size: Please See The Third Picture.\nMaterials...,,Sports > Fitness & Body Building > Yoga > Yoga...,"[[Department, Women], [Materials, Polyester], ...",[],Department|Women\nMaterials|Polyester\nSport o...


In [15]:
# Row counts of the train / val / test splits before joining embeddings.
tuple(len(split_frame) for split_frame in (df_train, df_val, df_test))

(19424, 6475, 6475)

In [16]:
# Project the embedding table down to the join key and the vector once, with
# the key renamed to match the labelled splits, instead of rebuilding the same
# frame for every merge.
emb_by_pid = df_emb[['product_id', 'openai_embedding']].rename(columns={'product_id': 'pid'})

# Inner join: labelled rows without an embedding are dropped from the split.
# validate='one_to_one' makes pandas raise if pid is duplicated on either
# side, so a bad input cannot silently multiply rows.
df_train_emb = df_train.merge(emb_by_pid, how='inner', on='pid', validate='one_to_one')
df_val_emb = df_val.merge(emb_by_pid, how='inner', on='pid', validate='one_to_one')
df_test_emb = df_test.merge(emb_by_pid, how='inner', on='pid', validate='one_to_one')

In [17]:
# Row counts after the inner join; compare with the pre-join counts to see
# how many labelled rows had no embedding.
tuple(len(split_frame) for split_frame in (df_train_emb, df_val_emb, df_test_emb))

(19422, 6473, 6474)

In [18]:
# Turn each split's column of per-row embedding lists into a numpy array
# (presumably 2-D: one row per product, one column per embedding dimension).
X_train, X_val, X_test = (
    np.array(split_frame['openai_embedding'].to_list())
    for split_frame in (df_train_emb, df_val_emb, df_test_emb)
)

In [19]:
# Build the label vocabulary: every non-empty line of the template file is one
# label string, mapped to a 0-based integer id in order of first appearance.
label2id = {}
with open('../../data/attribute_extraction_metadata_template/25L2_unfreetext_attribute_name_value_pairs_02232023.txt', 'r') as f:
    for raw_line in f:
        # Only the newline is removed; any other whitespace is kept so the
        # label text stays exactly as written in the file.
        label = raw_line.replace('\n', '')
        if label:
            # setdefault keeps the id from the first occurrence. Plain
            # assignment would re-number a repeated label to len(label2id),
            # leaving a gap in the ids and producing an id that is out of
            # range for arrays sized by len(label2id).
            label2id.setdefault(label, len(label2id))

In [20]:
len(label2id)

6875

In [21]:
# Reverse lookup: integer label id -> label string.
id2label = {label_id: label for label, label_id in label2id.items()}

In [22]:
# Allocate one all-zero target matrix per split: a row for every sample in the
# matching feature matrix and a column for every label in the vocabulary.
y_train, y_val, y_test = (
    np.zeros((len(features), len(label2id)))
    for features in (X_train, X_val, X_test)
)

In [23]:
def _fill_multi_hot(label_texts, target, label_index):
    """Mark, in place, the labels present in each row's newline-separated text.

    Args:
        label_texts: sequence of strings, one per row of ``target``; each
            string holds that row's labels separated by '\n'.
        target: 2-D array indexed as ``target[row, label_id]``; matching
            cells are set to 1, all other cells are left untouched.
        label_index: mapping from label string to column index.

    Labels that are not keys of ``label_index`` are skipped.
    """
    for row, text in enumerate(label_texts):
        for pair in text.split('\n'):
            col = label_index.get(pair)
            if col is not None:
                target[row, col] = 1


# The same encoding is applied to every split, so it is done by one helper
# instead of three copies of the loop.
for split_frame, target in (
    (df_train_emb, y_train),
    (df_val_emb, y_val),
    (df_test_emb, y_test),
):
    _fill_multi_hot(
        split_frame['attr_name_value_pairs_normalized_text'].to_list(),
        target,
        label2id,
    )

In [24]:
# Average number of positive labels per sample in each split.
tuple(labels.sum(axis=1).mean() for labels in (y_train, y_val, y_test))

(4.370353207702605, 4.358566352541326, 4.363453815261044)

In [25]:
# Largest number of positive labels carried by any single test sample.
y_test.sum(axis=1).max()

42.0

In [32]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, label_ranking_average_precision_score

In [None]:
# One-vs-rest wrapper around a linear-kernel SVC, fitted on the embedding
# features against the multi-hot label matrix.
# probability=True is needed for predict_proba later; its internal calibration
# shuffles the data, so random_state is pinned to make a re-fit reproducible.
clf = OneVsRestClassifier(
    estimator=SVC(kernel="linear", probability=True, random_state=0),
    n_jobs=-1,  # use all available cores
    verbose=1,
)
clf.fit(X_train, y_train) # 3.5 hour

In [28]:
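# dump(clf, 'simple_models/product_attrkv_clf_oaiemb_svc_v2.joblib')  # run once after training to persist the model; left commented out, presumably so a re-run does not overwrite the saved file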

['simple_models/product_attrkv_clf_oaiemb_svc_v2.joblib']

# Evaluate the simple model

In [29]:
# Restore the classifier saved on disk instead of re-fitting it.
# NOTE: joblib files are pickles; only load model files from a trusted source.
model_path = 'simple_models/product_attrkv_clf_oaiemb_svc_v2.joblib'
clf = load(model_path)

In [31]:
# Per-label scores for every test sample; indexed later as probas[row, label_id].
probas = clf.predict_proba(X_test)

In [33]:
# Threshold-free ranking metric: how highly the true labels are ranked by the
# predicted scores, averaged over test samples.
label_ranking_average_precision_score(y_true=y_test, y_score=probas) # svc

0.644603061560645

In [34]:
# Binarise the scores at 0.5 and compute per-label precision / recall / F1.
# zero_division=0 reports 0 for labels with no true or predicted positives.
predicted_at_50 = probas > .5
report = classification_report(y_test, predicted_at_50, zero_division=0)
# The report has one row per label, so only its last 500 characters (the final
# labels plus the averaged rows) are printed.
print(report[-500:])

00         0
        6870       0.00      0.00      0.00         0
        6871       0.00      0.00      0.00         0
        6872       0.00      0.00      0.00         0
        6873       0.00      0.00      0.00         0
        6874       0.41      0.21      0.28        52

   micro avg       0.70      0.35      0.47     28249
   macro avg       0.03      0.02      0.02     28249
weighted avg       0.56      0.35      0.41     28249
 samples avg       0.52      0.31      0.35     28249



In [37]:
# Same report with a lower decision threshold of 0.2.
predicted_at_20 = probas > .2
report = classification_report(y_test, predicted_at_20, zero_division=0)
# Print only the last 500 characters: the final labels plus the averaged rows.
print(report[-500:])

00         0
        6870       0.00      0.00      0.00         0
        6871       0.00      0.00      0.00         0
        6872       0.00      0.00      0.00         0
        6873       0.00      0.00      0.00         0
        6874       0.44      0.63      0.52        52

   micro avg       0.57      0.50      0.53     28249
   macro avg       0.03      0.02      0.02     28249
weighted avg       0.49      0.50      0.48     28249
 samples avg       0.49      0.43      0.42     28249



In [44]:
# Same report with a stricter decision threshold of 0.8.
predicted_at_80 = probas > .8
report = classification_report(y_test, predicted_at_80, zero_division=0)
# Print only the last 500 characters: the final labels plus the averaged rows.
print(report[-500:])

00         0
        6870       0.00      0.00      0.00         0
        6871       0.00      0.00      0.00         0
        6872       0.00      0.00      0.00         0
        6873       0.00      0.00      0.00         0
        6874       0.50      0.08      0.13        52

   micro avg       0.77      0.22      0.34     28249
   macro avg       0.02      0.01      0.01     28249
weighted avg       0.57      0.22      0.30     28249
 samples avg       0.44      0.20      0.25     28249



In [38]:
# For every test sample, take the 100 label ids with the highest score
# (argsort on the negated scores gives descending order) and pair each label
# string with its score.
top_k_indices = (-probas).argsort(1)[:, :100]
res = [
    [(id2label[label_id], probas[row, label_id]) for label_id in ranked_ids]
    for row, ranked_ids in enumerate(top_k_indices)
]

In [39]:
# Attach each test row's ranked (label, score) list as a new column; res is
# in the same row order as X_test, which was built from df_test_emb.
df_test_emb['top_preds'] = res

In [46]:
# Draw one random test product and show its text next to the ranked
# predictions for a manual sanity check (unseeded, so each run differs).
(
    df_test_emb[['title', 'description', 'top_preds']]
    .sample(n=1)
    .to_dict(orient='records')
) # svc

[{'title': 'Box Dab Rigs Concentrate Pipe With Dome',
  'description': 'welcome to my store!\r\n\r\nMateria: Glass\nShap: Straight Type \nJoint:14mm\nHight:200mm\nWeight:0.23kg\nExcellent design and brilliant quality,there is no doubt that it will give you an unique smoking experience.\nIf there is any problems when you recieve it,please contact with us within 24 hours\n\r\n\r\n  thank you!',
  'top_preds': [('Materials|Glass', 0.18996216448015324),
   ('Number of Hoses|1 Hose', 0.16296538916690603),
   ('Primary Color|Black', 0.06860892822182454),
   ('Primary Color|Multicolor', 0.06436482589514758),
   ('Primary Color|Clear', 0.04926281868251228),
   ('Primary Color|Blue', 0.04073675273612922),
   ('Primary Color|White', 0.03891680603524074),
   ('Materials|Metal', 0.02981260846144161),
   ('Materials|Gold', 0.028718347329157227),
   ('Alpha Size|One Size', 0.02223039981231095),
   ('Age Range Description|Adult', 0.0197758049594304),
   ('Primary Color|Gold', 0.01868982778285254),
  