In [1]:
import ast
from collections import defaultdict
from functools import reduce
from itertools import chain

import dvc.api
import numpy as np
import pandas as pd
from tqdm import tqdm

# get original data

In [29]:
# Input batches sent for labeling, resolved through DVC from the repository below.
dvc_repo = 'git@github.com:ContextLogic/multitask-llm-rnd.git'
input_batch_paths = [
    'datasets/data/wish_attr_extract_label/appen/input_batch_processed/appen_product_attribution_batch1.csv',
    'datasets/data/wish_attr_extract_label/appen/input_batch_processed/appen_product_attribution_batch2_highimpression.csv',
    # Not loaded (was commented out in the original cell):
    # 'datasets/data/wish_attr_extract_label/appen/input_batch_processed/appen_product_attribution_batch2.csv',
]

# Read every batch and stack them in list order; the index is not reset.
df_original = pd.concat(
    [pd.read_csv(dvc.api.get_url(batch_path, repo=dvc_repo)) for batch_path in input_batch_paths]
)

In [31]:
# Number of rows across the concatenated input batches.
len(df_original)

62142

In [32]:
# Keep only the rows that have a label_ordering value.
has_label_ordering = df_original['label_ordering'].notna()
df_original = df_original[has_label_ordering]

In [33]:
# Row count after removing rows whose label_ordering is missing.
len(df_original)

62137

In [34]:
# Cast label_ordering to int now that rows without a value have been removed.
df_original = df_original.astype({'label_ordering': int})

# get attributes

In [8]:
df_attributes = pd.read_csv('../attribute_extraction_metadata_template/attribute_definition_top25L2_filtered_augmented_redacted_addedsinglemulti_01182023.csv')

# Collapse the definition file to one row per attribute_field whose
# category_attributevalue holds every value listed for that field across all rows.
# Each cell is parsed with ast.literal_eval rather than eval, so text read from the
# CSV can only be interpreted as a Python literal and is never executed as code.
# chain.from_iterable flattens the parsed sequences in linear time (repeated `+`
# re-copies the accumulated list for every row).
# NOTE(review): assumes every cell is the repr of a list — confirm against the CSV.
df_attributes_group = df_attributes.groupby('attribute_field').agg({
    'category_attributevalue': lambda serialized: list(
        chain.from_iterable(ast.literal_eval(cell) for cell in serialized)
    )
}).reset_index()

In [9]:
# (number of attribute fields, total number of attribute values summed over all fields)
len(df_attributes_group), df_attributes_group.category_attributevalue.apply(len).sum()

(230, 516378)

In [10]:
# Distinct attribute_field names; the label validation loop checks attribute names against this set.
attributes = set(df_attributes_group['attribute_field'])

In [11]:
# Distinct values of the category column in the attribute definition file.
category_paths = set(df_attributes['category'])

In [12]:
# Add every ancestor prefix of each ' > '-separated category path
# (first segment, first two segments, ..., the full path) to category_paths.
more_paths = []
for full_path in category_paths:
    levels = full_path.split(' > ')
    more_paths.extend(
        ' > '.join(levels[:depth]) for depth in range(1, len(levels) + 1)
    )
# Updated only after the loop so the set is not modified while being iterated.
category_paths.update(more_paths)

# existing appen data

In [35]:
# Labeled result files, in the order they are later concatenated (df1 .. df5).
labeled_result_paths = [
    'appen/output_batch_correct_v3/product_attribution_till_030923_valid_units.csv',
    'appen/output_batch_correct_v4/f2072154_f2075355_post_03.13.23_cleaned_invalid.csv',
    'appen/output_batch_correct_v4/f2072154_f2075355_post_03.13.23_cleaned_valid-empty units.csv',
    'appen/output_batch_correct_v4/Product Attribution_03.10.23_Batch2_highimpression.csv',
    'appen/output_batch_correct_v4/product_attribution_03.03.23_invalid_units_03.02.23_Rework.csv',
]
df1, df2, df3, df4, df5 = (pd.read_csv(result_path) for result_path in labeled_result_paths)

In [36]:
# Row counts of the five labeled result files.
len(df1), len(df2), len(df3), len(df4), len(df5)

(38076, 866, 19678, 10808, 419)

In [37]:
# Stack all labeled result files in list order; the index is not reset, so index labels may repeat.
df = pd.concat([df1, df2, df3, df4, df5])

In [38]:
# Keep one row per label_ordering: the first occurrence in concatenation order,
# so a row from an earlier file wins over a row from a later file.
df_uniq = df.drop_duplicates('label_ordering', keep='first')

In [39]:
# Rows before vs. after de-duplicating on label_ordering.
len(df), len(df_uniq)

(69847, 54314)

In [50]:
# Attach the title from the original input data as title_original2, matched on
# label_ordering; the left join keeps every labeled row even when nothing matches.
original_titles = df_original[['label_ordering', 'title']].rename(
    columns={'title': 'title_original2'}
)
df_uniq_merge = df_uniq.merge(original_titles, on='label_ordering', how='left')

In [51]:
# Row count after the left merge; a value above len(df_uniq) would mean the join duplicated rows.
len(df_uniq_merge)

54314

In [54]:
# Rows with no title_original2 after the merge (no matching label_ordering in
# df_original, or a missing title there).
df_uniq_merge[df_uniq_merge.title_original2.isna()]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,rater1_output,rater2_output,rater3_output,rater1_no_attributes,rater1_unlisted_value,rater2_no_attributes,rater2_unlisted_value,rater3_no_attributes,rater3_unlisted_value,title_original,final_output,title_original2


In [55]:
# Rows whose labeled title differs from the original input title once surrounding
# whitespace is trimmed. The vectorized `.str.strip()` leaves a missing (NaN) title
# as NaN instead of raising AttributeError the way `x.strip()` does on a float NaN.
# Because NaN != NaN, rows with a missing title on either side are listed as mismatches.
labeled_title = df_uniq_merge['title'].str.strip()
input_title = df_uniq_merge['title_original2'].str.strip()
df_uniq_merge[labeled_title != input_title]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,rater1_output,rater2_output,rater3_output,rater1_no_attributes,rater1_unlisted_value,rater2_no_attributes,rater2_unlisted_value,rater3_no_attributes,rater3_unlisted_value,title_original,final_output,title_original2
46846,162392,text_and_img,5cdd05b52e33a70fe773d613,Lixada Breathable Fishing Life Vest 209lb Bu...,This fishing Life jacket looks more like an ou...,https://canary.contestimg.wish.com/api/webimag...,Sports > Fishing > Fishing Apparel > Fishing V...,Sports > Fishing,Sports > Fishing > Fishing Apparel > Fishing V...,Sports > Fishing > Fishing Apparel > Fishing V...,Sports > Fishing > Fishing Apparel > Fishing V...,,,,True,,,,,​Lixada Breathable Fishing Life Vest 209lb B...


In [57]:
# Where title_original is missing, take the title from the original input data,
# then remove the helper column that carried it.
missing_title_original = df_uniq_merge.title_original.isna()
df_uniq_merge.loc[missing_title_original, 'title_original'] = (
    df_uniq_merge.loc[missing_title_original, 'title_original2']
)
del df_uniq_merge['title_original2']

In [61]:
# Rebuild final_output (replacing any existing column of that name) as the three
# rater outputs joined by newlines; a missing rater output contributes an empty string.
rater1_filled, rater2_filled, rater3_filled = (
    df_uniq_merge[rater_column].fillna('')
    for rater_column in ('rater1_output', 'rater2_output', 'rater3_output')
)
df_uniq_merge['final_output'] = rater1_filled + '\n' + rater2_filled + '\n' + rater3_filled

In [62]:
# Partition the de-duplicated records by the validity of their label paths:
#   errors   - some label path has no attribute segment or an unknown attribute name
#   correct  - every label path is valid (this includes records with no labels at all)
#   empty    - records with no label paths
#   nonempty - correct records with at least one label path
correct = []
errors = []
errors_fixed = []
nonempty = []
empty = []
for record in df_uniq_merge.to_dict('records'):
    raw_output = record['final_output']
    # A missing (NaN) final_output means "no labels"; otherwise there is one label path per line.
    if (not isinstance(raw_output, str)) and (np.isnan(raw_output)):
        label_paths = []
    else:
        label_paths = raw_output.split('\n')
    # De-duplicate, sort, and drop blank lines.
    label_paths = [path for path in sorted(set(label_paths)) if len(path) > 0]
    if len(label_paths) == 0:
        empty.append(record)
    for path in label_paths:
        segments = path.split(' > ')
        # The attribute name is the second-to-last segment, so at least two segments are required.
        if len(segments) < 2:
            record['explanation'] = f"{path} has no attribute name"
            errors.append(record)
            break
        if segments[-2] not in attributes:
            record['explanation'] = f"{path} has invalid attribute name, because {segments[-2]} is not a valid attribute name"
            errors.append(record)
            break
    else:
        # No label path failed validation (also reached when there are no label paths).
        correct.append(record)
        if len(label_paths) > 0:
            nonempty.append(record)
len(df), len(correct), len(errors), len(empty), len(nonempty), len(correct) + len(errors)

(69847, 51025, 3289, 2771, 48254, 54314)

In [63]:
# (share of correct records that are empty, share that are non-empty, errors per row of df)
# NOTE(review): the last ratio divides by len(df), the row count before de-duplication,
# while the errors were counted on the de-duplicated records — confirm this denominator is intended.
len(empty) / len(correct), len(nonempty) / len(correct), len(errors) / len(df)

(0.05430671239588437, 0.9456932876041156, 0.047088636591406934)

In [64]:
# Tabular views of the records that passed label validation and of those that failed it
# (failed records also carry the 'explanation' key set during validation).
df_correct = pd.DataFrame(correct)
df_error = pd.DataFrame(errors)

In [65]:
# Tabular views of the correct records with at least one label path and of the records with none.
df_nonempty = pd.DataFrame(nonempty)
df_empty = pd.DataFrame(empty)

In [66]:
# Sanity check: the empty and non-empty frames together account for every correct record.
assert len(df_correct) == len(df_empty) + len(df_nonempty)

In [68]:
# Whether rater3_no_attributes is truthy for every record that has no label paths.
df_empty.rater3_no_attributes.all()

True

In [70]:
# Sizes of the three frames that are written out next: empty, non-empty (valid), invalid.
len(df_empty), len(df_nonempty), len(df_error)

(2771, 48254, 3289)

In [72]:
# Write each partition of the de-duplicated labels to its own CSV, without the index column.
v5_outputs = {
    'appen/output_batch_correct_v5/product_attribution_till_032423_valid_units.csv': df_nonempty,
    'appen/output_batch_correct_v5/product_attribution_till_032423_empty_units.csv': df_empty,
    'appen/output_batch_correct_v5/product_attribution_till_032423_invalid_units.csv': df_error,
}
for csv_path, partition in v5_outputs.items():
    partition.to_csv(csv_path, index=False)

In [73]:
# Preview the first record without label paths.
df_empty.head(1)

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,rater1_output,rater2_output,rater3_output,rater1_no_attributes,rater1_unlisted_value,rater2_no_attributes,rater2_unlisted_value,rater3_no_attributes,rater3_unlisted_value,title_original,final_output
0,7,only_text,610ca538c193826a71e173cc,Kitchen Stainless Steel Multi-purpose Portable...,Description: \nMaterial:Stainless steel\nColor...,,"Home & Garden > Kitchen,Dining & Bar > Flatwar...","Home & Garden > Kitchen,Dining & Bar",,,,True,,True,,True,,Kitchen Stainless Steel Multi-purpose Portable...,\n\n


In [74]:
# Preview the first valid record that has label paths.
df_nonempty.head(1)

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,rater1_output,rater2_output,rater3_output,rater1_no_attributes,rater1_unlisted_value,rater2_no_attributes,rater2_unlisted_value,rater3_no_attributes,rater3_unlisted_value,title_original,final_output
0,0,only_text,611bbb365b0bd8698b670d9d,"Rattan Basket Pet Dome and Animal Bed, with Me...",Features\n- Rattan wicker brings a classic sty...,,Home & Garden > Pet Products > Cat Supplies > ...,Home & Garden > Pet Products,,Home & Garden > Pet Products > Cat Supplies > ...,Home & Garden > Pet Products > Cat Supplies > ...,,,,,,,"Rattan Basket Pet Dome and Animal Bed, with Me...",\nHome & Garden > Pet Products > Cat Supplies ...


In [75]:
# Preview the first invalid record, including its 'explanation' column.
df_error.head(1)

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,rater1_output,rater2_output,rater3_output,rater1_no_attributes,rater1_unlisted_value,rater2_no_attributes,rater2_unlisted_value,rater3_no_attributes,rater3_unlisted_value,title_original,final_output,explanation
0,19,text_and_img,615c3319a6d67def92936931,Retro New Fashion Thick Chain Women Mini Flap ...,&quot;Features\n\nColor: as shown\nSize: The u...,https://canary.contestimg.wish.com/api/webimag...,Luggage & Bags > Women's Bags > Crossbody Bags,Luggage & Bags > Women's Bags,Luggage & Bags > Women's Bags > Crossbody Bags...,Luggage & Bags > Women's Bags\nLuggage & Bags ...,Luggage & Bags > Women's Bags > Crossbody Bags...,,,,True,,,Retro New Fashion Thick Chain Women Mini Flap ...,Luggage & Bags > Women's Bags > Crossbody Bags...,Luggage & Bags > Women's Bags has invalid attr...


In [78]:
# label_ordering values present in the original input but absent from the labeled,
# de-duplicated data, and the original rows that carry them.
labeled_orderings = set(df_uniq_merge.label_ordering)
leftover = set(df_original.label_ordering) - labeled_orderings
df_leftover = df_original[df_original.label_ordering.isin(leftover)]

In [81]:
# Number of original input rows that have no labeled counterpart.
len(df_leftover)

7823

In [80]:
# Save the original input rows without a labeled counterpart, without the index column.
df_leftover.to_csv('appen/output_batch_correct_v5/product_attribution_missed_032423.csv', index=False)