In [1]:
from openai.embeddings_utils import get_embedding
import pandas as pd
from tqdm import tqdm
import numpy as np
from joblib import dump, load
import json

In [2]:
# The embedding download is done in write_appen_product_embedding_to_file.py

# Train a simple model

In [3]:
# Read the line-delimited JSON of product embeddings, then keep only the
# records that actually carry a product_id.
emb_records = pd.read_json('appen_tolabel_product_attr_leftover.json', lines=True)
df_emb = emb_records.dropna(subset=['product_id'])

In [4]:
# Sanity check: every row of df_emb has a distinct product_id, so the later
# merges on this key cannot duplicate rows of the labelled splits.
assert len(set(df_emb['product_id'])) == len(df_emb)

In [5]:
# Line-delimited JSON file for each labelled split (train / val / test).
split_paths = {
    'train': '/workspaces/multitask-llm-rnd/datasets/data/wish_attr_extract_label/processed/appen_020323_delivered_022123_validated_product_attr_textonly_train.json',
    'val': '/workspaces/multitask-llm-rnd/datasets/data/wish_attr_extract_label/processed/appen_020323_delivered_022123_validated_product_attr_textonly_val.json',
    'test': '/workspaces/multitask-llm-rnd/datasets/data/wish_attr_extract_label/processed/appen_020323_delivered_022123_validated_product_attr_textonly_test.json',
}
# Load every split with the same reader settings, then bind the usual names.
split_frames = {
    split_name: pd.read_json(split_path, lines=True)
    for split_name, split_path in split_paths.items()
}
df_train = split_frames['train']
df_val = split_frames['val']
df_test = split_frames['test']

In [6]:
# Each split must contain exactly one row per pid; the same check is applied
# to train, val and test in that order.
for split_frame in (df_train, df_val, df_test):
    assert len(split_frame) == len(set(split_frame['pid']))

In [7]:
# Preview the first two embedding records to check the available columns.
df_emb.head(2)

Unnamed: 0,openai_embedding,text,product_id,label_ordering
0,"[0.011446514166891, 0.005374236498028, 0.00731...",Product Title: Rattan Basket Pet Dome and Anim...,611bbb365b0bd8698b670d9d,0.0
1,"[0.0015560899628320002, -0.004062985070049, -0...",Product Title: 10Pcs 7Pcs 2Pcs 1Pcs Tempered G...,61a9c0160dcaf9e1da138df1,1.0


In [8]:
# Preview the first two labelled training records to check the available columns.
df_train.head(2)

Unnamed: 0,label_ordering,sample_method,pid,category,title,description,main_img_url,Final_Answer,attr_name_value_pairs_normalized,attr_name_value_pairs_custom,attr_name_value_pairs_normalized_text
0,18180,only_text,60e6d0ef0fa75280dc24db5f,Shoes > Women's Shoes > Heels > Middle Heels,Sandals New Female Velcro Open Toe Net Red Thi...,Closed way: magic sticker heel high: heel head...,,,[],[],
1,11745,only_text,6188cc8b03d753eb729ad93b,Home & Garden > Festive & Party Supplies > Chr...,1 Set LOVE Letter Balloon Elastic Decorative ...,Specifications: \nA pleasure ambience usually ...,,Home & Garden > Festive & Party Supplies > Chr...,"[[Materials, Aluminum], [Primary Color, Silver]]","[[Number of Ornaments, 1]]",Materials|Aluminum\nPrimary Color|Silver


In [9]:
# Row counts of the (train, val, test) splits before embeddings are joined in.
len(df_train), len(df_val), len(df_test)

(7507, 2502, 2503)

In [11]:
# Embedding lookup keyed by 'pid' so it lines up with the labelled splits.
emb_by_pid = df_emb[['product_id', 'openai_embedding']].rename(columns={'product_id': 'pid'})

# Inner-join each split with its embeddings; rows whose pid has no embedding
# are dropped by the inner join.
df_train_emb, df_val_emb, df_test_emb = (
    split_frame.merge(emb_by_pid, how='inner', on='pid')
    for split_frame in (df_train, df_val, df_test)
)

In [12]:
# Row counts after the inner join with the embeddings; any shortfall against
# the raw split sizes is rows whose pid had no embedding record.
len(df_train_emb), len(df_val_emb), len(df_test_emb)

(7505, 2501, 2503)

In [14]:
# Turn each split's column of per-row embedding lists into one NumPy array
# (presumably 2-D, one row per product -- assumes all embeddings share a length).
X_train, X_val, X_test = (
    np.array(emb_frame['openai_embedding'].to_list())
    for emb_frame in (df_train_emb, df_val_emb, df_test_emb)
)

In [15]:
# Build the label vocabulary: every non-empty line of the template file is one
# label, and labels are numbered 0, 1, 2, ... in order of first appearance.
label2id = {}
with open('../../data/attribute_extraction_metadata_template/25L2_unfreetext_attribute_name_value_pairs_02232023.txt', 'r') as f:
    for line in f:
        label = line.rstrip('\n')
        if label:
            # setdefault keeps the id given at a label's first occurrence.
            # Plain assignment (`label2id[label] = len(label2id)`) would, for a
            # repeated line, overwrite the label's id with a value that the next
            # new label also receives -- two labels would then share one target
            # column and an earlier id would be left unused.
            label2id.setdefault(label, len(label2id))

In [16]:
# Number of distinct labels read from the template file; this is the width of
# the multi-hot target matrices built below.
len(label2id)

6875

In [17]:
# Reverse lookup (column index -> label string) for decoding predictions.
id2label = {label_id: label for label, label_id in label2id.items()}

In [18]:
# Allocate all-zero multi-hot targets: one row per sample in the matching X
# array and one column per label in the vocabulary.
n_labels = len(label2id)
y_train, y_val, y_test = (
    np.zeros((len(features), n_labels))
    for features in (X_train, X_val, X_test)
)

In [19]:
def _mark_labels(y, label_texts):
    """Set y[row, col] = 1 for every known label listed in label_texts[row].

    Parameters:
        y: 2-D array with one row per entry of `label_texts` and one column per
            label in `label2id`; modified in place.
        label_texts: sequence whose entries are newline-separated label strings.

    Labels that are not keys of `label2id` are ignored. Entries that are not
    strings (e.g. a missing value) are treated as having no labels, so the
    corresponding row of `y` is left untouched instead of raising on `.split`.
    """
    for row, text in enumerate(label_texts):
        if not isinstance(text, str):
            continue
        for label in text.split('\n'):
            col = label2id.get(label)
            if col is not None:
                y[row, col] = 1


# Same encoding for every split; previously this loop was copy-pasted three times.
_mark_labels(y_train, df_train_emb['attr_name_value_pairs_normalized_text'].to_list())
_mark_labels(y_val, df_val_emb['attr_name_value_pairs_normalized_text'].to_list())
_mark_labels(y_test, df_test_emb['attr_name_value_pairs_normalized_text'].to_list())

In [20]:
# Average number of positive labels per sample in (train, val, test); similar
# values across splits suggest the label density is comparable.
y_train.sum(1).mean(), y_val.sum(1).mean(), y_test.sum(1).mean()

(3.1199200532978013, 3.143142742902839, 3.157011586096684)

In [21]:
# Largest number of positive labels carried by any single test sample.
y_test.sum(1).max()

35.0

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Fit a one-vs-rest ensemble of linear-kernel SVCs on the embedding features
# against the multi-hot targets, then persist the fitted model with joblib so
# the evaluation cells can reload it instead of refitting.
# probability=True is what makes predict_proba available for the ranking
# evaluation further down.
# NOTE(review): no random_state is passed to SVC; I believe the probability
# calibration involves an internal randomized cross-validation, so a refit may
# not reproduce the saved model exactly -- confirm against the sklearn docs.
# The trailing comments record the wall-clock cost observed by the author.
clf = OneVsRestClassifier(SVC(kernel="linear", probability=True))
clf.fit(X_train, y_train) # 3 hours
dump(clf, 'product_attrkv_clf_oaiemb_svc.joblib') # 1hr

# Evaluate the simple model

In [24]:
# Reload the classifier persisted by the training cell, so evaluation can run
# without repeating the multi-hour fit.
# NOTE: joblib files are pickle-based; only load model files from a trusted source.
clf_loaded = load('product_attrkv_clf_oaiemb_svc.joblib')

In [25]:
# Hard 0/1 label decisions for every test sample (one column per label).
preds = clf_loaded.predict(X_test)

In [26]:
# Total number of positive label decisions across the whole test set.
preds.sum()

282

In [27]:
# Average number of labels predicted per test sample; compare with the average
# number of true labels per sample (y_test.sum(1).mean()) to see how
# conservative the hard decisions are.
preds.sum(1).mean()

0.11266480223731522

In [28]:
# Per-label probability estimates for every test sample; used below for the
# ranking metric and for listing the top-scoring labels per product.
probas = clf_loaded.predict_proba(X_test)

In [29]:
from sklearn.metrics import label_ranking_average_precision_score

In [30]:
# Ranking quality of the probability scores against the true multi-hot labels
# (uses the scores, not the thresholded predictions).
# NOTE(review): I believe sklearn scores a sample with no true labels as 1.0,
# which would inflate this number if such test rows exist -- confirm.
label_ranking_average_precision_score(y_test, probas) # svc

0.6344327129287932

In [31]:
# Per-label precision/recall/F1 on the hard predictions; zero_division=0 reports
# 0 (without warnings) for labels that have no predicted or no true samples.
report = classification_report(y_test, preds, zero_division=0)
# Print only the last 500 characters: the full text has one row per label, and
# the tail is where the aggregate averages appear.
print(report[-500:]) # svc

00         0
        6870       0.00      0.00      0.00         0
        6871       0.00      0.00      0.00         0
        6872       0.00      0.00      0.00         0
        6873       0.00      0.00      0.00         0
        6874       0.00      0.00      0.00        10

   micro avg       0.77      0.03      0.05      7902
   macro avg       0.00      0.00      0.00      7902
weighted avg       0.13      0.03      0.04      7902
 samples avg       0.05      0.02      0.02      7902



In [33]:
# For every test row, take the 100 label columns with the highest predicted
# probability (sorting on the negated scores gives descending order) and pair
# each label string with its probability.
top_label_ids = (-probas).argsort(1)[:, :100]
res = [
    [(id2label[label_id], probas[row, label_id]) for label_id in label_ids]
    for row, label_ids in enumerate(top_label_ids)
]

In [34]:
# Attach each row's ranked (label, probability) list to the test frame so the
# predictions can be inspected next to the product text. This adds a column to
# df_test_emb in place; res is in the same row order as X_test, which was built
# from df_test_emb.
df_test_emb['top_preds'] = res

In [38]:
# Spot-check one randomly chosen test product: its title, description and
# ranked label predictions.
# NOTE(review): sample() is called without random_state, so a different row is
# shown on every run -- pass a seed if the displayed example must be reproducible.
df_test_emb[['title', 'description', 'top_preds']].sample(1).to_dict('records') # svc

[{'title': 'Construction Truck Birthday Party Decoration Excavator Balloon Truck Pull Flag Children Birthday Party Decoration Balloon Decoration Happy Birthday Banner Pull Flag',
  'description': 'EASY TO ASSEMBLE: the banners and balloons come with tools and balloons glue, very easy to assemble, it would be a happy family craft for fun to assemble with your kids and decorate the party\nDESIGN: The highly detailed design of our construction party happy birthday banner, vehicle banner, traffic signs cutouts, balloons and cake toppers will make your child feel the atmosphere of a busy construction zone right inside the party. Our construction party supplies are guaranteed to create unforgettable memories for your child, family and guests\nHIGH QUALITY: The balloons are crafted of durable and strong latex, the cupcake toppers, banners and traffic signs cutouts are made of sturdy heavy duty card stock paper and can easily be reused for multiple parties\nKITS AND PERFECT DECOR: To throw a c