In [12]:
import pandas as pd
import json
from tqdm import tqdm
import numpy as np
from joblib import dump, load

In [13]:
from openai.embeddings_utils import get_embedding
# Embedding model name passed to every get_embedding call in this notebook.
engine = 'text-embedding-ada-002'
# Sanity check: embed a dummy string and display the vector length
# (recorded output: 1536, the same width as the feature matrices built later).
len(get_embedding('text', engine=engine))

1536

In [14]:
# Splits from the earlier delivery that already carry an `openai_embedding`
# column; they are stacked (train, val, test order) so those vectors can be reused.
_exist_paths = (
    '/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_delivered_022123_validated_query_attr_train_oaiemb.json',
    '/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_delivered_022123_validated_query_attr_val_oaiemb.json',
    '/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_delivered_022123_validated_query_attr_test_oaiemb.json',
)
df_exist = pd.concat([pd.read_json(path, lines=True) for path in _exist_paths])

In [15]:
# New train/val/test splits (JSON-lines) that still need embeddings attached.
_split_paths = (
    '/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_030323_delivered_030623_validated_query_attr_train.json',
    '/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_030323_delivered_030623_validated_query_attr_val.json',
    '/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_030323_delivered_030623_validated_query_attr_test.json',
)
df_train, df_val, df_test = (
    pd.read_json(path, lines=True) for path in _split_paths
)

In [16]:
# Every distinct query string that appears in any of the three new splits.
all_queries = set().union(df_train['query'], df_val['query'], df_test['query'])

In [17]:
# Queries that have a stored embedding but do not occur in the new splits.
# The recorded output is an empty set, i.e. the old data is a subset of the new.
set(df_exist['query']) - all_queries

set()

In [18]:
# Distinct-query counts: (already embedded, total in the new splits).
len(set(df_exist['query'])), len(all_queries)

(44197, 123653)

In [19]:
# Stream the leftover JSON-lines file and keep only the records whose query
# belongs to the new splits. The list is filled incrementally, so whatever
# was collected so far is still available if the loop is interrupted.
recs_leftover = []
with open('appen_tolabel_query_attr_leftover.json', 'r') as leftover_file:
    for raw_line in tqdm(leftover_file):
        record = json.loads(raw_line)
        if record['query'] not in all_queries:
            continue
        recs_leftover.append(record)

1204it [00:01, 1105.98it/s]


KeyboardInterrupt: 

In [None]:
# Turn the filtered leftover records (a list of dicts) into a DataFrame.
# The next cell selects its 'query' and 'openai_embedding' columns.
df_leftover = pd.DataFrame(recs_leftover)

In [None]:
# Lookup table query -> embedding, built from both embedding sources.
# When a query appears in both, the first occurrence (from df_exist) is kept.
_emb_columns = ['query', 'openai_embedding']
_stacked_embs = pd.concat([df_exist[_emb_columns], df_leftover[_emb_columns]])
df_embs = _stacked_embs.drop_duplicates('query')

In [None]:
# Fraction of distinct queries that already have a stored embedding
# (recorded output: about 0.99988, so only a handful need a fresh API call).
len(df_embs) / len(all_queries)

0.9998786927935432

In [None]:
# Left-join the embedding lookup onto each split; queries without a stored
# embedding come out with a missing `openai_embedding` value.
df_train_emb, df_val_emb, df_test_emb = (
    split_frame.merge(df_embs, on='query', how='left')
    for split_frame in (df_train, df_val, df_test)
)

In [None]:
# Back-fill, in place, the rows whose embedding is still missing after the
# merge by requesting a fresh embedding for the row's query text.
for _frame in (df_train_emb, df_val_emb, df_test_emb):
    _needs_embedding = _frame['openai_embedding'].isna()
    _frame.loc[_needs_embedding, 'openai_embedding'] = _frame.loc[_needs_embedding, 'query'].apply(
        lambda text: get_embedding(text, engine=engine))

In [None]:
# Verify the back-fill: show the rows of each split that still lack an
# embedding (the recorded output is three empty DataFrames).
df_train_emb[df_train_emb.openai_embedding.isna()], df_val_emb[df_val_emb.openai_embedding.isna()], df_test_emb[df_test_emb.openai_embedding.isna()]

(Empty DataFrame
 Columns: [label_ordering, sample_method, query, category, Rater_Answer, attr_name_value_pairs_normalized, attr_name_value_pairs_custom, attr_name_value_pairs_normalized_text, openai_embedding]
 Index: [],
 Empty DataFrame
 Columns: [label_ordering, sample_method, query, category, Rater_Answer, attr_name_value_pairs_normalized, attr_name_value_pairs_custom, attr_name_value_pairs_normalized_text, openai_embedding]
 Index: [],
 Empty DataFrame
 Columns: [label_ordering, sample_method, query, category, Rater_Answer, attr_name_value_pairs_normalized, attr_name_value_pairs_custom, attr_name_value_pairs_normalized_text, openai_embedding]
 Index: [])

In [41]:
# Persist each embedding-augmented split as JSON-lines (one record per line);
# the "train simple model" section reads these files back.
_emb_outputs = (
    (df_train_emb, '/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_030323_delivered_030623_validated_query_attr_train_oaiemb.json'),
    (df_val_emb, '/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_030323_delivered_030623_validated_query_attr_val_oaiemb.json'),
    (df_test_emb, '/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_030323_delivered_030623_validated_query_attr_test_oaiemb.json'),
)
for _frame, _out_path in _emb_outputs:
    _frame.to_json(_out_path, lines=True, orient='records')

# Train a simple model (one-vs-rest linear SVC on the OpenAI embeddings)

In [20]:
# Reload the embedding-augmented splits from disk so the modelling section can
# start from the saved files instead of re-running the embedding steps.
_emb_paths = (
    '/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_030323_delivered_030623_validated_query_attr_train_oaiemb.json',
    '/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_030323_delivered_030623_validated_query_attr_val_oaiemb.json',
    '/workspaces/multitask-llm-rnd/datasets/data/query_attr_extract_label/processed2/appen_020323_030323_delivered_030623_validated_query_attr_test_oaiemb.json',
)
df_train_emb, df_val_emb, df_test_emb = (
    pd.read_json(path, lines=True) for path in _emb_paths
)

In [21]:
# Stack each split's per-row embedding lists into a 2-D feature matrix.
X_train, X_val, X_test = (
    np.array(frame['openai_embedding'].to_list())
    for frame in (df_train_emb, df_val_emb, df_test_emb)
)

In [22]:
# (rows, embedding width) per split; the recorded width is 1536 for all three.
X_train.shape, X_val.shape, X_test.shape

((74191, 1536), (24731, 1536), (24731, 1536))

In [23]:
# Build the label vocabulary: one "name|value" label per non-empty line of the
# metadata file, mapped to a dense integer id in order of first appearance.
label2id = {}
with open('../../data/attribute_extraction_metadata_template/25L2_unfreetext_attribute_name_value_pairs_02232023.txt', 'r') as f:
    for raw_line in f:
        label = raw_line.replace('\n', '')
        # Skip blank lines, and skip labels already seen: re-assigning a
        # repeated label would hand it the id `len(label2id)` without growing
        # the dict, so the next new label would receive the same id and the
        # inverse id -> label map would silently lose an entry.
        if label and label not in label2id:
            label2id[label] = len(label2id)

In [24]:
# Size of the label vocabulary (recorded output: 6875).
len(label2id)

6875

In [25]:
# Inverse mapping (integer id -> label string), used to decode predictions.
id2label = {label_id: label for label, label_id in label2id.items()}

In [26]:
# Dense all-zero multi-hot targets: one row per example, one column per label.
_n_labels = len(label2id)
y_train, y_val, y_test = (
    np.zeros((len(features), _n_labels))
    for features in (X_train, X_val, X_test)
)

In [27]:
# Fill the multi-hot targets in place. Each row's label text holds one label
# per line; labels missing from the vocabulary are ignored.
_split_targets = (
    (df_train_emb, y_train),
    (df_val_emb, y_val),
    (df_test_emb, y_test),
)
for _frame, _target in _split_targets:
    _label_texts = _frame['attr_name_value_pairs_normalized_text'].to_list()
    for _row, _text in enumerate(_label_texts):
        for _label in _text.split('\n'):
            if _label in label2id:
                _target[_row, label2id[_label]] = 1

In [28]:
# Average number of in-vocabulary labels per example in each split
# (recorded output: roughly 0.45 for all three).
y_train.sum(1).mean(), y_val.sum(1).mean(), y_test.sum(1).mean()

(0.4466849078729226, 0.45072985322065423, 0.4504872427317941)

In [29]:
# Largest number of labels on a single test example (recorded output: 9.0).
y_test.sum(1).max()

9.0

In [30]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import (classification_report, accuracy_score, 
    label_ranking_average_precision_score)

In [None]:
# One-vs-rest linear SVC: one binary classifier per label column of y_train,
# trained on the embedding features with all available cores (n_jobs=-1).
# probability=True is needed for the predict_proba calls in the evaluation
# cells. random_state pins the data shuffling SVC performs for its internal
# probability calibration, so a refit reproduces the same probabilities.
clf = OneVsRestClassifier(
    estimator=SVC(kernel="linear", probability=True, random_state=0),
    n_jobs=-1,
    verbose=1,
)
clf.fit(X_train, y_train) # 8 hours

In [62]:
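# Saving the fitted model is disabled here (the recorded output below is from
# an earlier run); uncomment to (re)write the joblib artifact.
# dump(clf, 'simple_models/query_attrkv_clf_oaiemb_svc_v2.joblib') 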

['simple_models/query_attrkv_clf_oaiemb_svc_v2.joblib']

In [31]:
# Per-label probabilities for every test example.
# NOTE(review): the visible fit cell has no execution count in this run, so
# `clf` presumably came from a load step or an earlier session — confirm
# before relying on Restart & Run All.
probas = clf.predict_proba(X_test)

In [32]:
# Label-ranking average precision over all test rows, including rows that
# have no positive label (recorded output: about 0.862).
label_ranking_average_precision_score(y_test, probas)

0.8620175384326433

In [33]:
# Boolean mask selecting the test rows that carry at least one positive label.
non_empty_idx = y_test.sum(1) > 0

In [34]:
# Same ranking metric restricted to rows with at least one positive label
# (recorded output: about 0.649, noticeably lower than the all-rows score).
label_ranking_average_precision_score(y_test[non_empty_idx], probas[non_empty_idx])

0.6488893654673905

In [65]:
# Hard multi-label predictions for the test set using the classifier's own
# decision rule (contrast with the explicit probability cut-off used later).
preds = clf.predict(X_test)

In [66]:
# Per-label precision/recall/F1 for the hard predictions; labels with no
# predicted or true samples score 0 instead of raising a warning.
report = classification_report(y_test, preds, zero_division=0)
# The full report has one row per label, so only its last 500 characters
# (the final label rows plus the averaged metrics) are printed.
print(report[-500:])

00         0
        6870       0.00      0.00      0.00         0
        6871       0.00      0.00      0.00         0
        6872       0.00      0.00      0.00         1
        6873       0.00      0.00      0.00         0
        6874       0.00      0.00      0.00        29

   micro avg       0.67      0.24      0.35     11141
   macro avg       0.00      0.00      0.00     11141
weighted avg       0.25      0.24      0.23     11141
 samples avg       0.10      0.09      0.09     11141



In [71]:
# Same report, but predicting a label whenever its probability exceeds a
# fixed 0.2 cut-off instead of using clf.predict. Rebinds `report`.
report = classification_report(y_test, probas > 0.2, zero_division=0)
# Print only the tail: the final label rows plus the averaged metrics.
print(report[-500:])

00         0
        6870       0.00      0.00      0.00         0
        6871       0.00      0.00      0.00         0
        6872       0.00      0.00      0.00         1
        6873       0.00      0.00      0.00         0
        6874       0.48      0.34      0.40        29

   micro avg       0.56      0.44      0.49     11141
   macro avg       0.02      0.02      0.02     11141
weighted avg       0.43      0.44      0.42     11141
 samples avg       0.17      0.17      0.17     11141



In [35]:
# For every test row, take the ten highest-probability label columns
# (descending) and decode them into (label, probability) pairs.
_top10_columns = (-probas).argsort(1)[:, :10]
res = [
    [(id2label[col], probas[row, col]) for col in columns]
    for row, columns in enumerate(_top10_columns)
]

In [74]:
# Attach the per-row top-10 (label, probability) lists to the test frame.
# NOTE(review): `res` is rebound further down for a single ad-hoc query, so
# this cell must run before that one or the lengths will not match.
df_test_emb['top_preds'] = res

In [75]:
# Show five random test rows with their gold attributes and top predictions.
# No random_state is passed, so the displayed sample differs between runs.
df_test_emb[['query', 'attr_name_value_pairs_normalized', 'top_preds']].sample(5).to_dict('records')

[{'query': 'copri scarpe pioggia',
  'attr_name_value_pairs_normalized': [],
  'top_preds': [('Department|Women', 0.018660112003396904),
   ('Primary Color|Multicolor', 0.011761106948106627),
   ('Included Components|Protective Case', 0.008807595703816501),
   ('Sport or Activity Type|Fishing', 0.008757518099708777),
   ('Department|Dogs', 0.008660792034879027),
   ('Water Resistance Level|Waterproof', 0.0074308194002256946),
   ('Primary Color|Black', 0.0054832199945372626),
   ('Shoe Type|Ankle Boot', 0.005282173029696973),
   ('Theme|Animals', 0.005254524551960455),
   ('Primary Color|White', 0.003893629332394848)]},
 {'query': 'scarpe firmate uomo',
  'attr_name_value_pairs_normalized': [],
  'top_preds': [('Department|Men', 0.703696447666589),
   ('Department|Women', 0.05870169287225553),
   ('Primary Color|Black', 0.01563355135644077),
   ('Heel Type|Flat', 0.012238306557035289),
   ('Shoe Height|High Top', 0.010150855588023299),
   ('Materials|Leather', 0.009008516066426072),
  

In [39]:
# Embed one ad-hoc query and wrap it in a list so the result is a single-row
# 2-D array, the input shape predict_proba is called with below.
emb_i = np.array([get_embedding('gold and pink phone charm', engine=engine)])

In [40]:
# Score the ad-hoc query and decode its ten most probable labels.
# NOTE: this rebinds `probas` and `res`, replacing the test-set values computed
# earlier; re-run the test-set cells before reusing them for evaluation.
probas = clf.predict_proba(emb_i)
_top10_columns = (-probas).argsort(1)[:, :10]
res = [
    [(id2label[col], probas[row, col]) for col in columns]
    for row, columns in enumerate(_top10_columns)
]

In [41]:
# Display the top-10 (label, probability) pairs for the ad-hoc query.
res

[[('Primary Color|Pink', 0.8706981997222492),
  ('Primary Color|Gold', 0.39582361860211335),
  ('Materials|Gold', 0.0554972406148032),
  ('Theme|Flowers', 0.01637651667555662),
  ('Theme|Cartoon', 0.013148701149193396),
  ('Materials|Rose Gold', 0.013014805449244358),
  ('Department|Women', 0.01136299659795209),
  ('Primary Color|Multicolor', 0.008781224025904304),
  ('Materials|Glass', 0.004119989731698643),
  ('Theme|Princess', 0.0038509526053711767)]]