In [97]:
import pandas as pd
from collections import defaultdict
import dvc.api
from tqdm import tqdm
import numpy as np
from functools import reduce

# get original data

In [98]:
# Stack the two product_attribution_missed CSV exports (v5 and v6 output batches)
# into one frame. Row indices from each file are kept, so index labels may repeat.
missed_unit_csvs = [
    'appen/output_batch_correct_v5/product_attribution_missed_032423.csv',
    'appen/output_batch_correct_v6/product_attribution_missed_just_032723.csv',
]
df_missed = pd.concat([pd.read_csv(csv_path) for csv_path in missed_unit_csvs])

In [99]:
# Resolve each processed input batch to a URL through DVC, read it, and stack
# the three batches into one frame (per-file row indices are kept).
dvc_repo = 'git@github.com:ContextLogic/multitask-llm-rnd.git'
input_batch_paths = [
    'datasets/data/wish_attr_extract_label/appen/input_batch_processed/appen_product_attribution_batch1.csv',
    'datasets/data/wish_attr_extract_label/appen/input_batch_processed/appen_product_attribution_batch2_highimpression.csv',
    'datasets/data/wish_attr_extract_label/appen/input_batch_processed/appen_product_attribution_batch2.csv',
]
df_original = pd.concat([
    pd.read_csv(dvc.api.get_url(batch_path, repo=dvc_repo))
    for batch_path in input_batch_paths
])

In [100]:
# Row count of the combined missed-unit frame before any filtering.
df_missed.shape[0]

7861

In [101]:
# Row count of the combined original batches before any filtering.
df_original.shape[0]

113438

In [102]:
# Keep only the rows that carry a label_ordering value.
df_missed = df_missed.loc[df_missed['label_ordering'].notna()]

In [103]:
# Keep only the rows that carry a label_ordering value.
df_original = df_original.loc[df_original['label_ordering'].notna()]

In [104]:
# Row count of the missed-unit frame after dropping rows without label_ordering.
df_missed.shape[0]

7861

In [105]:
# Row count of the original batches after dropping rows without label_ordering.
df_original.shape[0]

113433

In [106]:
# With the missing values dropped, cast the join key to int in both frames
# so the set comparisons and the merge below compare like with like.
for frame in (df_original, df_missed):
    frame['label_ordering'] = frame['label_ordering'].astype(int)

In [107]:
# label_ordering values present in the missed units but absent from the original batches.
set(df_missed['label_ordering']).difference(df_original['label_ordering'])

set()

# get attributes

In [108]:
df_attributes = pd.read_csv('../attribute_extraction_metadata_template/attribute_definition_top25L2_filtered_augmented_redacted_addedsinglemulti_01182023.csv')


def _concat_value_lists(serialized_lists):
    """Parse every serialized list in a group and return their items as one flat list.

    The items are collected in a single pass instead of repeatedly adding
    lists together, which re-copies the accumulated list on every step.
    """
    # NOTE(review): eval() executes whatever expression is stored in the CSV cell.
    # That is only acceptable while the CSV is a trusted, internally generated file;
    # if every cell is a plain Python literal, ast.literal_eval is the safe replacement.
    # Assumes each cell evaluates to a list -- TODO confirm.
    return [value for serialized in serialized_lists for value in eval(serialized)]


# One row per attribute_field, with the values of all its rows merged into one list.
df_attributes_group = (
    df_attributes
    .groupby('attribute_field')
    .agg({'category_attributevalue': _concat_value_lists})
    .reset_index()
)

In [109]:
# Number of attribute fields, and the total number of values across their merged lists.
(
    df_attributes_group.shape[0],
    df_attributes_group['category_attributevalue'].map(len).sum(),
)

(230, 516378)

In [110]:
# Set of attribute names, used for membership checks during validation.
attributes = set(df_attributes_group['attribute_field'].tolist())

In [111]:
# Distinct category paths that appear in the attribute definition file.
category_paths = set(df_attributes['category'].tolist())

In [112]:
# For every known path "A > B > C", also register its prefixes "A" and "A > B"
# (and the full path itself), then fold them back into category_paths.
more_paths = [
    ' > '.join(segments[:depth])
    for segments in (path.split(' > ') for path in category_paths)
    for depth in range(1, len(segments) + 1)
]
category_paths.update(more_paths)

# Existing Appen data

In [13]:
# df1 = pd.read_csv('appen/output_batch_correct_v3/product_attribution_till_030923_valid_units.csv')
# df2 = pd.read_csv('appen/output_batch_correct_v4/f2072154_f2075355_post_03.13.23_cleaned_invalid.csv')
# df3 = pd.read_csv('appen/output_batch_correct_v4/f2072154_f2075355_post_03.13.23_cleaned_valid-empty units.csv')
# df4 = pd.read_csv('appen/output_batch_correct_v4/Product Attribution_03.10.23_Batch2_highimpression.csv')
# df5 = pd.read_csv('appen/output_batch_correct_v4/product_attribution_03.03.23_invalid_units_03.02.23_Rework.csv')

In [14]:
# len(df1), len(df2), len(df3), len(df4), len(df5)

(38076, 866, 19678, 10808, 419)

In [15]:
# df = pd.concat([df1, df2, df3, df4, df5])

In [113]:
# Load the v7 output batch that the rest of the notebook validates.
appen_results_csv = 'appen/output_batch_correct_v7/7k_Appen.csv'
df = pd.read_csv(appen_results_csv)

In [114]:
# Keep the first row seen for each label_ordering and discard later repeats.
df_uniq = df.loc[~df.duplicated(subset='label_ordering', keep='first')]

In [115]:
# Row counts before and after de-duplication; equal counts mean there were no repeats.
(df.shape[0], df_uniq.shape[0])

(7789, 7789)

In [116]:
# Attach the title/description from the original input batches (suffixed
# "_original") so they can be compared with what the labeled file contains.
original_text = df_original[['label_ordering', 'title', 'product_description']].rename(columns={
    'title': 'title_original',
    'product_description': 'product_description_original',
})
df_uniq_merge = pd.merge(df_uniq, original_text, how='left', on='label_ordering')

In [117]:
# Row count after the left merge; a larger count than df_uniq would mean duplicated keys on the right.
df_uniq_merge.shape[0]

7789

In [118]:
# Rows whose labeled title differs from the title in the original input batch.
df_uniq_merge.loc[df_uniq_merge['title'].ne(df_uniq_merge['title_original'])]

Unnamed: 0,category_path,product_id,title,product_description,main_image_url,sample_method,label_ordering,rater1_no_attributes,rater1_unlisted_value,rater1_output,rater2_no_attributes,rater2_unlisted_value,rater2_output,rater3_no_attributes,rater3_unlisted_value,rater3_output,title_original,product_description_original


In [119]:
# Rows with no original title after the left merge.
df_uniq_merge.loc[df_uniq_merge['title_original'].isna()]

Unnamed: 0,category_path,product_id,title,product_description,main_image_url,sample_method,label_ordering,rater1_no_attributes,rater1_unlisted_value,rater1_output,rater2_no_attributes,rater2_unlisted_value,rater2_output,rater3_no_attributes,rater3_unlisted_value,rater3_output,title_original,product_description_original


In [120]:
# Rows with no original product description after the left merge.
df_uniq_merge.loc[df_uniq_merge['product_description_original'].isna()]

Unnamed: 0,category_path,product_id,title,product_description,main_image_url,sample_method,label_ordering,rater1_no_attributes,rater1_unlisted_value,rater1_output,rater2_no_attributes,rater2_unlisted_value,rater2_output,rater3_no_attributes,rater3_unlisted_value,rater3_output,title_original,product_description_original
4044,"Home & Garden > Kitchen,Dining & Bar > Teaware...",6120aa90d631eeaff38fd328,3 Style New Concept Training Hidden Happiness Cup,,https://canary.contestimg.wish.com/api/webimag...,text_and_img,35818,True,0,0,True,0,0,True,0,0,3 Style New Concept Training Hidden Happiness Cup,


In [121]:
# Join the three raters' outputs into one newline-separated string per row;
# a missing rater output contributes an empty line.
rater_outputs = [
    df_uniq_merge[column].fillna('')
    for column in ('rater1_output', 'rater2_output', 'rater3_output')
]
df_uniq_merge['final_output'] = rater_outputs[0] + '\n' + rater_outputs[1] + '\n' + rater_outputs[2]

In [122]:
def _parse_output_lines(raw_output):
    """Return the distinct non-empty lines of a newline-joined rater output, sorted.

    Anything that is not a string (for example a NaN cell) is treated as
    having no lines.
    """
    if not isinstance(raw_output, str):
        return []
    return sorted({line for line in raw_output.split('\n') if len(line) > 0})


def _explain_invalid_line(line, valid_attributes):
    """Return the reason `line` is invalid, or None when it passes the check.

    The line is split on ' > '. It must have at least two segments, and the
    second-to-last segment (the attribute name) must be in `valid_attributes`.
    """
    segments = line.split(' > ')
    if len(segments) < 2:
        return f"{line} has no attribute name"
    attribute_name = segments[-2]
    if attribute_name not in valid_attributes:
        return f"{line} has invalid attribute name, because {attribute_name} is not a valid attribute name"
    return None


# Classify every record in one pass:
#   empty        - no output lines at all (these also count as correct)
#   correct      - every line passes the attribute-name check
#   nonempty     - correct records that have at least one line
#   errors       - at least one line fails; 'explanation' describes the first
#                  failing line in sorted order
#   errors_fixed - the same records as `errors`, with 'final_output_corrected'
#                  holding only the lines that pass
# NOTE(review): a rater output that is a bare placeholder such as "0" has no
# ' > ' separator, so it is reported as "has no attribute name" instead of being
# counted as empty -- confirm this is the intended treatment.
correct = []
errors = []
errors_fixed = []
nonempty = []
empty = []
for record in df_uniq_merge.to_dict('records'):
    lines = _parse_output_lines(record['final_output'])
    if not lines:
        empty.append(record)

    checked = [(line, _explain_invalid_line(line, attributes)) for line in lines]
    first_problem = next((problem for _, problem in checked if problem is not None), None)

    if first_problem is None:
        correct.append(record)
        if lines:
            nonempty.append(record)
        continue

    record['explanation'] = first_problem
    record['final_output_corrected'] = '\n'.join(
        line for line, problem in checked if problem is None
    )
    errors.append(record)
    errors_fixed.append(record)

len(df), len(correct), len(errors), len(errors_fixed), len(empty), len(nonempty), len(correct) + len(errors)

(7789, 0, 7789, 7789, 0, 0, 7789)

In [123]:
# Build a frame from the corrected error records (one row per record dict).
df_error_fixed = pd.DataFrame(errors_fixed)

In [124]:
# Spot-check two corrected records. The fixed seed keeps the displayed sample
# the same on every Restart & Run All.
df_error_fixed[['title', 'final_output_corrected']].sample(n=2, random_state=0).to_dict('records')

[{'title': 'Apple AirPods AirPods Wireless Earphone Bluetooth Mic MMEF2J/A',
  'final_output_corrected': 'Consumer Electronics > Earphones & Headphones > Earphones > Compatible Devices > Cellphones\nConsumer Electronics > Earphones & Headphones > Earphones > Item Condition > New\nConsumer Electronics > Earphones & Headphones > Earphones > Noise Control > None\nConsumer Electronics > Earphones & Headphones > Earphones > Primary Color > White\nConsumer Electronics > Earphones & Headphones > Earphones > Wireless Communication Types > Bluetooth'},
 {'title': 'Skull and Roses Hippie Spare Tire Cover fit to exact tire size Jeep Camper RV Motor home Trailer/Option for backup camera in menu',
  'final_output_corrected': 'Home & Garden > Home Textile > Table & Sofa Linens > Chair Covers > Materials > Polyester\nHome & Garden > Home Textile > Table & Sofa Linens > Chair Covers > Size > 14 inch 23inch 27inch\nHome & Garden > Home Textile > Table & Sofa Linens > Chair Covers > Size > 16 inch 30inc

In [125]:
# How many missed units have no row in the corrected output.
len(set(df_missed['label_ordering']).difference(df_error_fixed['label_ordering']))

72

In [126]:
# Persist the corrected records without the frame index.
fixed_units_csv = 'appen/output_batch_correct_v7/product_attribution_missed_032423_just_032723_invalid_units_fixed.csv'
df_error_fixed.to_csv(fixed_units_csv, index=False)

In [127]:
# label_ordering values of missed units that have no row in the corrected output.
leftover = set(df_missed['label_ordering']) - set(df_error_fixed['label_ordering'])
# Vectorized membership test instead of a per-row Python lambda.
df_leftover = df_original[df_original['label_ordering'].isin(leftover)]

In [128]:
# Number of original rows that still need to be sent out.
df_leftover.shape[0]

72

In [129]:
# Persist the leftover original rows without the frame index.
leftover_units_csv = 'appen/output_batch_correct_v7/product_attribution_missed_033023.csv'
df_leftover.to_csv(leftover_units_csv, index=False)