In [1]:
import ast
from collections import defaultdict
from functools import reduce

import dvc.api
import numpy as np
import pandas as pd
from tqdm import tqdm

# get original data

In [22]:
# Each input batch is resolved to a URL through dvc.api.get_url against the same
# repository, read as CSV, and the three frames are stacked in the listed order.
dvc_repo = 'git@github.com:ContextLogic/multitask-llm-rnd.git'
input_batch_paths = [
    'datasets/data/wish_attr_extract_label/appen/input_batch_processed/appen_product_attribution_batch1.csv',
    'datasets/data/wish_attr_extract_label/appen/input_batch_processed/appen_product_attribution_batch2_highimpression.csv',
    'datasets/data/wish_attr_extract_label/appen/input_batch_processed/appen_product_attribution_batch2.csv',
]
input_batches = [
    pd.read_csv(dvc.api.get_url(batch_path, repo=dvc_repo))
    for batch_path in input_batch_paths
]
df_original = pd.concat(input_batches)

In [23]:
# Row count of the stacked input batches, before any filtering.
df_original.shape[0]

113438

In [24]:
# label_ordering is the key used for the later merge; rows without it are dropped.
df_original = df_original.dropna(axis='index', subset=['label_ordering'])

In [25]:
# Row count after dropping rows with a missing label_ordering.
df_original.shape[0]

113433

In [27]:
# Cast the key to int now that the missing values have been removed.
label_ordering_as_int = df_original['label_ordering'].astype(int)
df_original['label_ordering'] = label_ordering_as_int

# get attributes

In [4]:
df_attributes = pd.read_csv('../attribute_extraction_metadata_template/attribute_definition_top25L2_filtered_augmented_redacted_addedsinglemulti_01182023.csv')


def _flatten_attribute_values(serialized_lists):
    """Parse every serialized list in a group and concatenate them into one flat list.

    Args:
        serialized_lists: iterable of strings, each holding a Python list literal
            (assumes the 'category_attributevalue' cells are list literals — TODO confirm).

    Returns:
        A single list with the elements of all parsed lists, in input order.

    Raises:
        ValueError / SyntaxError: if a cell is not a valid Python literal.
    """
    # ast.literal_eval only accepts literals, so a malformed or malicious CSV cell raises
    # instead of being executed as code (which the previous eval() would have done).
    # The nested comprehension is linear, unlike repeated list `+` through reduce().
    return [value for raw in serialized_lists for value in ast.literal_eval(raw)]


# One row per attribute_field, holding every category/attribute value listed for it.
df_attributes_group = df_attributes.groupby('attribute_field').agg({
    'category_attributevalue': _flatten_attribute_values
}).reset_index()

In [5]:
# (number of attribute fields, total number of values across all fields)
n_attribute_fields = len(df_attributes_group)
n_attribute_values = df_attributes_group['category_attributevalue'].apply(len).sum()
n_attribute_fields, n_attribute_values

(230, 516378)

In [6]:
# Distinct attribute names; used below to validate the annotated labels.
attributes = {field for field in df_attributes_group['attribute_field']}

In [7]:
# Distinct category paths as they appear in the attribute definition file.
category_paths = {path for path in df_attributes['category']}

In [8]:
# Add every ancestor prefix of each ' > '-separated path, e.g. 'a > b > c'
# contributes 'a', 'a > b' and 'a > b > c'.
more_paths = []
for full_path in category_paths:
    segments = full_path.split(' > ')
    for depth in range(1, len(segments) + 1):
        more_paths.append(' > '.join(segments[:depth]))
category_paths.update(more_paths)

# existing appen data

In [46]:
# Load the three Appen output files (first delivery, pending, rework).
df1, df2, df3 = (
    pd.read_csv(output_path)
    for output_path in (
        'appen/output_batch_correct_v2/product_attribution_02.03.23.csv',
        'appen/output_batch_correct_v2/product_attribution_03.03.23_pending.csv',
        'appen/output_batch_correct_v2/product_attribution_03.03.23_rework.csv',
    )
)

In [47]:
# Map the first file's headers onto the snake_case names that df2 and df3 use,
# so the three frames can be concatenated column-for-column.
batch1_column_renames = {
    'Final Answer': 'rater3_output',
    'L2_Category': 'l2_category',
    'Label_Ordering': 'label_ordering',
    'Main_Image_Url': 'main_image_url',
    'Product_Category': 'product_category',
    'Product_Description': 'product_description',
    'Product_Id': 'product_id',
    'Rater1_Answer': 'rater1_output',
    'Rater2_Answer': 'rater2_output',
    'Sample_Method': 'sample_method',
    'Title': 'title',
}
df1 = df1.rename(columns=batch1_column_renames)

In [48]:
# Column names of each frame, to compare the three schemas side by side.
tuple(set(frame.columns) for frame in (df1, df2, df3))

({'l2_category',
  'label_ordering',
  'main_image_url',
  'product_category',
  'product_description',
  'product_id',
  'rater1_output',
  'rater2_output',
  'rater3_output',
  'sample_method',
  'title'},
 {'l2_category',
  'label_ordering',
  'main_image_url',
  'product_category',
  'product_description',
  'product_id',
  'rater1_no_attributes',
  'rater1_output',
  'rater1_unlisted_value',
  'rater2_no_attributes',
  'rater2_output',
  'rater2_unlisted_value',
  'rater3_no_attributes',
  'rater3_output',
  'rater3_unlisted_value',
  'sample_method',
  'title'},
 {'l2_category',
  'label_ordering',
  'main_image_url',
  'product_category',
  'product_description',
  'product_id',
  'rater1_no_attributes',
  'rater1_output',
  'rater1_unlisted_value',
  'rater2_no_attributes',
  'rater2_output',
  'rater2_unlisted_value',
  'rater3_no_attributes',
  'rater3_output',
  'rater3_unlisted_value',
  'sample_method',
  'title'})

In [49]:
# Stack the three deliveries; the order matters for the keep='last' de-duplication below.
appen_frames = [df1, df2, df3]
df = pd.concat(appen_frames)

In [50]:
# One row per label_ordering; when a unit appears more than once, the last
# occurrence in df's row order is the one kept.
df_uniq = df.drop_duplicates(subset=['label_ordering'], keep='last')

In [51]:
# (rows before de-duplication, rows after)
df.shape[0], df_uniq.shape[0]

(52160, 51295)

In [52]:
# Attach the title from the original input as 'title_original' so it can be
# compared with the title that came back from annotation.
original_titles = df_original[['label_ordering', 'title']].rename(columns={'title': 'title_original'})
df_uniq_merge = pd.merge(df_uniq, original_titles, how='left', on='label_ordering')

In [53]:
# Should equal len(df_uniq) if the left merge did not duplicate any row.
df_uniq_merge.shape[0]

51295

In [54]:
# Units whose label_ordering found no match in the original input.
has_no_original_title = df_uniq_merge['title_original'].isna()
df_uniq_merge[has_no_original_title]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,rater1_output,rater2_output,rater3_output,rater1_no_attributes,rater1_unlisted_value,rater2_no_attributes,rater2_unlisted_value,rater3_no_attributes,rater3_unlisted_value,title_original


In [55]:
# Units whose annotated title differs from the original one, ignoring
# leading/trailing whitespace.
annotated_title = df_uniq_merge['title'].apply(lambda text: text.strip())
original_title = df_uniq_merge['title_original'].apply(lambda text: text.strip())
df_uniq_merge[annotated_title != original_title]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,rater1_output,rater2_output,rater3_output,rater1_no_attributes,rater1_unlisted_value,rater2_no_attributes,rater2_unlisted_value,rater3_no_attributes,rater3_unlisted_value,title_original


In [56]:
# Join the three raters' outputs into one newline-separated string per unit;
# a missing rater output contributes an empty line.
rater1_text, rater2_text, rater3_text = (
    df_uniq_merge[column].fillna('')
    for column in ('rater1_output', 'rater2_output', 'rater3_output')
)
df_uniq_merge['final_output'] = rater1_text + '\n' + rater2_text + '\n' + rater3_text

In [83]:
def _first_label_error(labels):
    """Return the explanation for the first malformed label, or None if all are valid.

    A label is malformed when it has fewer than two ' > '-separated parts, or when
    its second-to-last part is not one of the known names in `attributes`.
    """
    for label in labels:
        parts = label.split(' > ')
        if len(parts) < 2:
            return f"{label} has no attribute name"
        attribute_name = parts[-2]
        if attribute_name not in attributes:
            return f"{label} has invalid attribute name, because {attribute_name} is not a valid attribute name"
    return None


# Sort every unit into: errors (at least one malformed label) or correct, and
# additionally into empty / nonempty depending on whether it carries any label.
correct, errors, nonempty, empty = [], [], [], []
for record in df_uniq_merge.to_dict('records'):
    raw_output = record['final_output']
    if (not isinstance(raw_output, str)) and np.isnan(raw_output):
        labels = []
    else:
        labels = raw_output.split('\n')
    # De-duplicate, sort, and discard the blank lines left by missing rater outputs.
    labels = [label for label in sorted(set(labels)) if len(label) > 0]
    if not labels:
        empty.append(record)
    explanation = _first_label_error(labels)
    if explanation is not None:
        record['explanation'] = explanation
        errors.append(record)
        continue
    correct.append(record)
    if labels:
        nonempty.append(record)
len(df), len(correct), len(errors), len(empty), len(nonempty), len(correct) + len(errors)

(52160, 43774, 7521, 5698, 38076, 51295)

In [85]:
# Shares of empty and non-empty units among the correct ones, then the error share.
# NOTE(review): the error share divides by len(df) (rows before de-duplication), while
# errors were collected from the de-duplicated df_uniq_merge — confirm this is intended.
n_correct = len(correct)
len(empty) / n_correct, len(nonempty) / n_correct, len(errors) / len(df)

(0.13016859322885732, 0.8698314067711427, 0.1441909509202454)

In [86]:
# Turn the record lists back into frames.
df_correct, df_error = (pd.DataFrame(records) for records in (correct, errors))

In [87]:
# Split of the correct units by whether they carry at least one label.
df_nonempty, df_empty = (pd.DataFrame(records) for records in (nonempty, empty))

In [88]:
# Every correct unit must be either empty or non-empty, never both.
assert len(df_empty) + len(df_nonempty) == len(df_correct)

In [92]:
# Remove the concatenated final_output column before export. Unlike `del`, drop with
# errors='ignore' does not raise KeyError when this cell is re-run or the frame has
# no such column (e.g. no empty units were found).
df_empty = df_empty.drop(columns=['final_output'], errors='ignore')

In [95]:
# Remove the concatenated final_output column before export. Unlike `del`, drop with
# errors='ignore' does not raise KeyError when this cell is re-run or the frame has
# no such column (e.g. no invalid units were found).
df_error = df_error.drop(columns=['final_output'], errors='ignore')

In [96]:
# (empty units, valid non-empty units, invalid units)
tuple(len(frame) for frame in (df_empty, df_nonempty, df_error))

(5698, 38076, 7521)

In [100]:
# Every empty unit gets the same explanation text in the exported file.
empty_units_explanation = 'Obvious attribute name value pairs are being missed. Such recall problem during annotation would corrupt models trained on it. Thus leading to invalid model.'
df_empty['explanation'] = empty_units_explanation

In [102]:
# Write the three unit groups to their own CSV files, without the index column.
for export_frame, export_path in (
    (df_nonempty, 'appen/output_batch_correct_v3/product_attribution_till_030923_valid_units.csv'),
    (df_empty, 'appen/output_batch_correct_v3/product_attribution_till_030923_empty_units.csv'),
    (df_error, 'appen/output_batch_correct_v3/product_attribution_till_030923_invalid_units.csv'),
):
    export_frame.to_csv(export_path, index=False)

In [103]:
# Preview the first exported empty unit.
df_empty.iloc[:1]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,rater1_output,rater2_output,rater3_output,rater1_no_attributes,rater1_unlisted_value,rater2_no_attributes,rater2_unlisted_value,rater3_no_attributes,rater3_unlisted_value,title_original,explanation
0,7,only_text,610ca538c193826a71e173cc,Kitchen Stainless Steel Multi-purpose Portable...,Description: \nMaterial:Stainless steel\nColor...,,"Home & Garden > Kitchen,Dining & Bar > Flatwar...","Home & Garden > Kitchen,Dining & Bar",,,,,,,,,,Kitchen Stainless Steel Multi-purpose Portable...,Obvious attribute name value pairs are being m...


In [104]:
# Preview the first exported valid unit.
df_nonempty.iloc[:1]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,rater1_output,rater2_output,rater3_output,rater1_no_attributes,rater1_unlisted_value,rater2_no_attributes,rater2_unlisted_value,rater3_no_attributes,rater3_unlisted_value,title_original,final_output
0,0,only_text,611bbb365b0bd8698b670d9d,"Rattan Basket Pet Dome and Animal Bed, with Me...",Features\n- Rattan wicker brings a classic sty...,,Home & Garden > Pet Products > Cat Supplies > ...,Home & Garden > Pet Products,,Home & Garden > Pet Products > Cat Supplies > ...,Home & Garden > Pet Products > Cat Supplies > ...,,,,,,,"Rattan Basket Pet Dome and Animal Bed, with Me...",\nHome & Garden > Pet Products > Cat Supplies ...


In [105]:
# Preview the first exported invalid unit.
df_error.iloc[:1]

Unnamed: 0,label_ordering,sample_method,product_id,title,product_description,main_image_url,product_category,l2_category,rater1_output,rater2_output,rater3_output,rater1_no_attributes,rater1_unlisted_value,rater2_no_attributes,rater2_unlisted_value,rater3_no_attributes,rater3_unlisted_value,title_original,explanation
0,26,only_text,6008655ca011de66ae27e889,New Fashion Hannah Rose Dinner Cloth Napkin 20...,"Materials:100% Polyester,Colorful and Comforta...",,Home & Garden > Home Textile > Table & Sofa Li...,Home & Garden > Home Textile,Home & Garden > Home Textile > Table & Sofa Li...,Home & Garden > Home Textile > Table & Sofa Li...,Home & Garden > Home Textile > Table & Sofa Li...,,,,,,,New Fashion Hannah Rose Dinner Cloth Napkin 20...,Home & Garden > Home Textile > Table & Sofa Li...
