In [1]:
from openai.embeddings_utils import get_embedding
import pandas as pd
from tqdm import tqdm
import numpy as np

In [7]:
# Name of the OpenAI embedding model; passed as `engine=` to every get_embedding call.
engine = 'text-embedding-ada-002'

In [9]:
# Smoke test: embed a short string and display the length of the returned vector.
len(get_embedding('text', engine=engine))

1536

In [11]:
# Load the train / validation / test splits of the validated query-attribute
# data. Each file is parsed as one JSON object per line (lines=True).
df_train = pd.read_json(
    path_or_buf='/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_delivered_022123_validated_query_attr_train.json',
    lines=True,
)
df_val = pd.read_json(
    path_or_buf='/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_delivered_022123_validated_query_attr_val.json',
    lines=True,
)
df_test = pd.read_json(
    path_or_buf='/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_delivered_022123_validated_query_attr_test.json',
    lines=True,
)

In [14]:
# Request one embedding per test query, store it on the record under
# 'openai_embedding', then rebuild a DataFrame from the enriched records.
recs = df_test.to_dict('records')
for record in tqdm(recs):
    record['openai_embedding'] = get_embedding(record['query'], engine=engine)

df_test_emb = pd.DataFrame(recs)

100%|██████████| 8840/8840 [18:39<00:00,  7.89it/s]  


In [16]:
# Request one embedding per validation query, store it on the record under
# 'openai_embedding', then rebuild a DataFrame from the enriched records.
recs = df_val.to_dict('records')
for record in tqdm(recs):
    record['openai_embedding'] = get_embedding(record['query'], engine=engine)

df_val_emb = pd.DataFrame(recs)

100%|██████████| 8839/8839 [16:27<00:00,  8.95it/s]  


In [17]:
# Request one embedding per training query, store it on the record under
# 'openai_embedding', then rebuild a DataFrame from the enriched records.
recs = df_train.to_dict('records')
for record in tqdm(recs):
    record['openai_embedding'] = get_embedding(record['query'], engine=engine)

df_train_emb = pd.DataFrame(recs)

100%|██████████| 26518/26518 [1:04:01<00:00,  6.90it/s]  


In [18]:
# Persist the embedded splits as JSON Lines (one record per line) so the
# embeddings can be reloaded without calling the API again.
df_train_emb.to_json(
    path_or_buf='/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_delivered_022123_validated_query_attr_train_oaiemb.json',
    orient='records',
    lines=True,
)
df_val_emb.to_json(
    path_or_buf='/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_delivered_022123_validated_query_attr_val_oaiemb.json',
    orient='records',
    lines=True,
)
df_test_emb.to_json(
    path_or_buf='/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_delivered_022123_validated_query_attr_test_oaiemb.json',
    orient='records',
    lines=True,
)

In [19]:
# Row counts of the embedded train / validation / test frames.
tuple(len(frame) for frame in (df_train_emb, df_val_emb, df_test_emb))

(26518, 8839, 8840)

# Train a simple model on the OpenAI embeddings

In [2]:
# Reload the embedded splits from disk (one JSON object per line).
df_train_emb = pd.read_json(
    path_or_buf='/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_delivered_022123_validated_query_attr_train_oaiemb.json',
    lines=True,
)
df_val_emb = pd.read_json(
    path_or_buf='/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_delivered_022123_validated_query_attr_val_oaiemb.json',
    lines=True,
)
df_test_emb = pd.read_json(
    path_or_buf='/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_delivered_022123_validated_query_attr_test_oaiemb.json',
    lines=True,
)

In [3]:
# Turn each frame's 'openai_embedding' column (one list per row) into a
# NumPy array with one row per record.
train_vectors = df_train_emb['openai_embedding'].tolist()
val_vectors = df_val_emb['openai_embedding'].tolist()
test_vectors = df_test_emb['openai_embedding'].tolist()

X_train = np.array(train_vectors)
X_val = np.array(val_vectors)
X_test = np.array(test_vectors)

In [4]:
# Shapes of the three feature matrices.
tuple(features.shape for features in (X_train, X_val, X_test))

((26518, 1536), (8839, 1536), (8840, 1536))

In [5]:
# Build the label vocabulary: every distinct non-empty line of the metadata
# file becomes a label, numbered in order of first appearance.
label2id = {}
with open('../../data/attribute_extraction_metadata_template/25L2_unfreetext_attribute_name_value_pairs_02232023.txt', 'r') as f:
    for line in f:
        label = line.rstrip('\n')
        # Skip blank lines and labels already seen. Re-assigning a repeated
        # label to len(label2id) would hand it the same id that the next new
        # label receives, so two labels would silently share one column.
        if label and label not in label2id:
            label2id[label] = len(label2id)

In [6]:
# Number of distinct labels in the vocabulary.
len(label2id)

6875

In [30]:
# Inverse mapping: integer id -> label string.
id2label = {idx: label for label, idx in label2id.items()}

In [7]:
# Allocate all-zero target matrices: one row per example, one column per label.
n_labels = len(label2id)
y_train = np.zeros((X_train.shape[0], n_labels))
y_val = np.zeros((X_val.shape[0], n_labels))
y_test = np.zeros((X_test.shape[0], n_labels))

In [8]:
def _fill_multi_hot(label_texts, label_index, target):
    """Mark the known labels of every example in a multi-hot matrix, in place.

    Args:
        label_texts: iterable of strings, one per example; each string holds
            that example's labels separated by newline characters.
        label_index: mapping from label string to column index.
        target: 2-D array indexed as ``target[row, column]``; row ``r`` is
            updated for the ``r``-th entry of ``label_texts``.

    Labels that are not keys of ``label_index`` are skipped.
    """
    for row, text in enumerate(label_texts):
        for label in text.split('\n'):
            if label in label_index:
                target[row, label_index[label]] = 1


# The same encoding is applied to all three splits; only the source frame and
# the destination matrix differ.
LABEL_COLUMN = 'attr_name_value_pairs_normalized_text'
_fill_multi_hot(df_train_emb[LABEL_COLUMN].to_list(), label2id, y_train)
_fill_multi_hot(df_val_emb[LABEL_COLUMN].to_list(), label2id, y_val)
_fill_multi_hot(df_test_emb[LABEL_COLUMN].to_list(), label2id, y_test)

In [9]:
# Average of the per-row label sums for each split.
tuple(targets.sum(axis=1).mean() for targets in (y_train, y_val, y_test))

(0.6673580209668905, 0.6575404457517818, 0.6733031674208145)

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [51]:
# One binary linear SVC per label.
# Alternative tried earlier: OneVsRestClassifier(RandomForestClassifier()).
#
# probability=True makes SVC calibrate probabilities with an internal,
# randomly shuffled cross-validation, so random_state is fixed to keep
# predict_proba reproducible between runs.
# n_jobs=-1 fits the independent per-label classifiers in parallel on all
# available cores instead of one after another.
clf = OneVsRestClassifier(
    SVC(kernel="linear", probability=True, random_state=0),
    n_jobs=-1,
)

In [52]:
# Fit on the training split; the original run took about 3 hours (author's note).
clf.fit(X=X_train, y=y_train)



In [81]:
from joblib import dump, load
# Serialize the fitted classifier to the working directory.
dump(value=clf, filename='query_attrkv_clf_oaiemb_svc.joblib')

['query_attrkv_clf_oaiemb_svc.joblib']

In [82]:
# Read the serialized classifier back to check that it round-trips.
clf_loaded = load(filename='query_attrkv_clf_oaiemb_svc.joblib')

In [83]:
# Test-set predictions from the reloaded classifier.
preds_loaded = clf_loaded.predict(X=X_test)

In [53]:
# Test-set predictions from the in-memory classifier.
preds = clf.predict(X=X_test)


In [86]:
# The reloaded classifier must reproduce the in-memory predictions exactly.
assert np.array_equal(preds_loaded, preds)

In [54]:
# Sum of all entries of preds (for a 0/1 indicator matrix, the total number of predicted labels).
preds.sum()

1430

In [57]:
# Average of the per-row prediction sums over the test set.
preds.sum(axis=1).mean()

0.16176470588235295

In [55]:
# Per-label probability estimates for every test example.
probas = clf.predict_proba(X=X_test)

In [58]:
# Column indices of each row ordered from lowest to highest probability.
np.argsort(probas, axis=1)

array([[   0, 3649, 6006, ..., 1159,  960, 4414],
       [   0, 3526, 3525, ..., 6337, 2494, 1993],
       [   0, 4028, 4027, ..., 6582, 1176, 1151],
       ...,
       [   0, 4157, 4156, ..., 3806, 3661, 1151],
       [   0, 3769, 3768, ..., 6302, 1161, 1827],
       [   0, 3616, 3615, ..., 6356, 4412, 1169]])

In [59]:
from sklearn.metrics import label_ranking_average_precision_score

In [60]:
# Label ranking average precision of the probability scores against the true labels.
label_ranking_average_precision_score(y_true=y_test, y_score=probas)

0.8031911161314953

In [63]:
# Build the text report of precision / recall / F1, with undefined ratios
# reported as 0. Only the last 500 characters are printed, which here covers
# the final rows and the averaged summary.
report = classification_report(y_true=y_test, y_pred=preds, zero_division=0)
report_tail = report[-500:]
print(report_tail)

00         0
        6870       0.00      0.00      0.00         0
        6871       0.00      0.00      0.00         0
        6872       0.00      0.00      0.00         0
        6873       0.00      0.00      0.00         0
        6874       0.00      0.00      0.00        16

   micro avg       0.80      0.19      0.31      5952
   macro avg       0.00      0.00      0.00      5952
weighted avg       0.26      0.19      0.21      5952
 samples avg       0.13      0.11      0.12      5952



In [73]:
# For every test example keep the ten labels with the highest predicted
# probability, as (label, probability) pairs in descending probability order.
top10_columns = (-probas).argsort(1)[:, :10]
res = [
    [(id2label[col], probas[row, col]) for col in cols]
    for row, cols in enumerate(top10_columns)
]

In [76]:
# Attach the per-example top predictions held in `res` as a new column.
# This modifies df_test_emb in place.
df_test_emb['top_preds'] = res

In [80]:
# Show five test rows with their gold attributes and top predictions.
# random_state is fixed so the same rows are displayed on every run.
df_test_emb[['query', 'attr_name_value_pairs_normalized', 'top_preds']].sample(5, random_state=42).to_dict('records')

[{'query': 'realme 5i back cover',
  'attr_name_value_pairs_normalized': [],
  'top_preds': [('Included Components|Protective Case', 0.010202074901125586),
   ('Theme|Wedding and Engagement', 0.008218037599417272),
   ('Department|Women', 0.005537317179324901),
   ('Installation or Mount Compatibility|Wall Mount', 0.0037468586823402),
   ('Theme|Baby', 0.003500092559363201),
   ('Department|Men', 0.003439599094986939),
   ('Primary Color|Black', 0.003186536085666012),
   ('Brand|Motorola', 0.0028616101572652387),
   ('Brand|LG', 0.002737893766047473),
   ('Item Features|Smartphone', 0.0025427083453395233)]},
 {'query': 'pokémon phone cases',
  'attr_name_value_pairs_normalized': [['Theme', 'Cartoon']],
  'top_preds': [('Theme|Anime', 0.173562671074284),
   ('Theme|Cartoon', 0.0184959805216781),
   ('Pattern|Cartoon', 0.013892497492176473),
   ('Theme|Animal', 0.01377347542460196),
   ('Theme|Dragons', 0.008758548779421171),
   ('Theme|Sport', 0.00566508060635851),
   ('Item Features|Sm