In [43]:
import pandas as pd
from collections import defaultdict
import dvc.api
from tqdm import tqdm
import numpy as np
from functools import reduce

# get original data

In [9]:
# Original query-attribution batches, resolved through DVC from the
# multitask-llm-rnd repository and stacked into a single frame.
original_repo = 'git@github.com:ContextLogic/multitask-llm-rnd.git'
original_batch_paths = (
    'modelling/notebooks/query_attr_extract_appen_label/appen_query_attribution_batch1.csv',
    'modelling/notebooks/query_attr_extract_appen_label/appen_query_attribution_batch2.csv',
    'modelling/notebooks/query_attr_extract_appen_label/appen_query_attribution_batch3_top74419.csv',
)
df_original = pd.concat([
    pd.read_csv(dvc.api.get_url(batch_path, repo=original_repo))
    for batch_path in original_batch_paths
])

In [3]:
# Total number of rows across the concatenated original batches.
len(df_original)

324419

In [4]:
# Number of distinct label_ordering values; matching the row count means
# label_ordering can serve as a unique key for the merges below.
len(set(df_original['label_ordering']))

324419

In [39]:
# Peek at the first two rows to see the available columns.
df_original.head(2)

Unnamed: 0,query,sample_method,label_ordering,top_query_classification_taxonomy
0,dog cat eater,uniform,0,Home & Garden > Pet Products > Cat Supplies > ...
1,fish holder tool,head,18,Sports > Fishing > Fishing Tools


# get attributes

In [10]:
# Attribute definitions: one row per (category, attribute_field) with the
# allowed values stored as a stringified Python literal.
df_attributes = pd.read_csv('../attribute_extraction_metadata_template/attribute_definition_top25L2_filtered_augmented_redacted_addedsinglemulti_01182023.csv')

# Collect, per attribute_field, every value listed across all categories.
# The values are flattened in a single pass; repeatedly concatenating lists
# with reduce(x + y) copies the growing list on every step (quadratic).
# NOTE(security): eval() executes whatever is stored in the CSV cell. That is
# acceptable only while this file is a trusted, project-owned artifact; switch
# to ast.literal_eval if the file can ever come from an external source.
# Assumes every cell evaluates to a list — TODO confirm against the CSV.
df_attributes_group = df_attributes.groupby('attribute_field').agg({
    'category_attributevalue': lambda cells: [value for cell in cells for value in eval(cell)]
}).reset_index()

In [11]:
# (number of attribute fields, total number of aggregated values across all fields)
len(df_attributes_group), df_attributes_group.category_attributevalue.apply(len).sum()

(230, 516378)

In [12]:
# Set of valid attribute names; used for membership checks when validating rater output.
attributes = set(df_attributes_group['attribute_field'])

In [13]:
# Distinct category paths that appear in the attribute definitions.
category_paths = set(df_attributes['category'])

In [14]:
# Add every ancestor prefix of each category path, e.g. "A > B > C"
# contributes "A", "A > B" and "A > B > C".
more_paths = [
    ' > '.join(levels[:depth])
    for levels in (path.split(' > ') for path in category_paths)
    for depth in range(1, len(levels) + 1)
]
category_paths.update(more_paths)

# existing valid data

In [32]:
# Previously validated units (the file name suggests a cut-off of 03/09/23).
df1 = pd.read_csv('appen/output_batch_correct_v3/query_attribution_till_030923_valid_units.csv')

  df1 = pd.read_csv('appen/output_batch_correct_v3/query_attribution_till_030923_valid_units.csv')


In [33]:
# Remove the index column that was persisted into the CSV.
# drop(..., errors='ignore') keeps this cell re-runnable: a plain `del`
# raises KeyError the second time the cell is executed.
df1 = df1.drop(columns=['Unnamed: 0'], errors='ignore')

In [34]:
# Batches delivered in the v4 output folder, read in the same order as before.
df2, df3, df4, df5 = (
    pd.read_csv(batch_csv)
    for batch_csv in (
        'appen/output_batch_correct_v4/Rework_processed_96,900 units.csv',
        'appen/output_batch_correct_v4/New Volume_processed_64,136 units.csv',
        'appen/output_batch_correct_v4/Query Attribution_03.17.23_10k.csv',
        'appen/output_batch_correct_v4/Query Attribution_03.24.23_3k.csv',
    )
)

  df2 = pd.read_csv('appen/output_batch_correct_v4/Rework_processed_96,900 units.csv')


In [35]:
# Stack all labelled batches into one frame (original per-file indexes are kept).
df = pd.concat([df1, df2, df3, df4, df5])

In [36]:
# Number of labelled rows across all batches.
len(df)

324009

In [37]:
# Distinct label_ordering values; equal to len(df) when no unit is duplicated across batches.
len(set(df['label_ordering']))

324009

In [56]:
# Join the labelled units with the original data on label_ordering.
# The original columns are suffixed so they can be compared with the
# labelled ones; `query` becomes `query_original2`, a distinct name from
# the `query_original` column referenced on the merged frame below.
original_column_names = {
    'query': 'query_original2',
    'sample_method': 'sample_method_original',
    'top_query_classification_taxonomy': 'top_query_classification_taxonomy_original',
}
df_merge = df.merge(df_original.rename(columns=original_column_names), on='label_ordering')

In [57]:
# Where the labelled file carries no query_original, fall back to the
# query text taken from the original dataset.
query_original_missing = df_merge['query_original'].isna()
df_merge.loc[query_original_missing, 'query_original'] = df_merge.loc[query_original_missing, 'query_original2']

In [58]:
# Sanity check: the labelled sample_method agrees with the original on every row.
df_merge['sample_method'].eq(df_merge['sample_method_original']).all()

True

In [59]:
# Show two random rows whose whitespace-stripped query differs from the
# whitespace-stripped original query.
query_differs = df_merge['query'].str.strip() != df_merge['query_original'].str.strip()
df_merge[query_differs].sample(2)

Unnamed: 0,label_ordering,sample_method,l2_category,query_category,rater_output,query,translated_query,no_attributes,unlisted_value,query_original,query_original2,sample_method_original,top_query_classification_taxonomy_original
308738,6435025,uniform,Home & Garden > Home Textile,Home & Garden > Home Textile > Bedding > Blankets,,makita 18 v skins,,True,,makita 18 v skins￼,makita 18 v skins￼,uniform,Home & Garden > Home Textile > Bedding > Blankets
168379,928985,head,"Home & Garden > Arts, Crafts & Sewing","Home & Garden > Arts, Crafts & Sewing > Needle...",,diamond artdallas cowboys,,True,,diamond artdallas cowboys￼,diamond artdallas cowboys￼,head,"Home & Garden > Arts, Crafts & Sewing > Needle..."


In [60]:
# Sanity check: the labelled query_category agrees with the original taxonomy on every row.
df_merge['query_category'].eq(df_merge['top_query_classification_taxonomy_original']).all()

True

In [61]:
# Drop the helper columns that were only needed for the consistency checks.
# A single drop(..., errors='ignore') keeps the cell re-runnable; the
# previous `del` loop raised KeyError when executed a second time.
df_merge = df_merge.drop(
    columns=['query_original', 'query_original2', 'sample_method_original', 'top_query_classification_taxonomy_original'],
    errors='ignore',
)

In [63]:
# label_ordering values present in the original data but absent from the merged, labelled data.
leftover_ids = set(df_original['label_ordering']).difference(df_merge['label_ordering'])

In [65]:
# Original rows that have not been labelled yet.
# Series.isin performs the membership test in one vectorised call instead of
# invoking a Python lambda once per row.
df_leftover = df_original[df_original['label_ordering'].isin(leftover_ids)]

In [86]:
# Number of original units still missing a label.
len(df_leftover)

410

In [69]:
# Persist the unlabelled units (without the index) to the v5 output folder.
df_leftover.to_csv('appen/output_batch_correct_v5/query_attribution_missed_032423.csv', index=False)

In [72]:
# Validate the rater output of every merged unit.
#   correct - records whose every output line passes validation (includes empty ones)
#   errors  - records with at least one invalid line; the first problem found
#             is stored under the 'explanation' key
#   empty   - records with no rater output at all
# A line is split on ' > ' and its second-to-last segment is treated as the
# attribute name, which must be a member of `attributes`.
# NOTE: the same loop is repeated in later cells of this notebook; consider
# extracting it into a shared helper.
correct = []
errors = []
empty = []
for record in df_merge.to_dict('records'):
    raw_output = record['rater_output']
    if isinstance(raw_output, str):
        output_lines = raw_output.split('\n')
    elif pd.isna(raw_output):
        # pd.isna also covers None/pd.NA, where np.isnan raises TypeError.
        output_lines = []
    else:
        # A non-string, non-missing value (e.g. a number parsed by read_csv)
        # is validated as text instead of crashing on a missing .split.
        output_lines = str(raw_output).split('\n')
    if not output_lines:
        empty.append(record)
    explanation = None
    for line in output_lines:
        segments = line.split(' > ')
        if len(segments) < 2:
            explanation = f"{line} has no attribute name"
            break
        if segments[-2] not in attributes:
            explanation = f"{line} has invalid attribute name, because {segments[-2]} is not a valid attribute name"
            break
    if explanation is None:
        correct.append(record)
    else:
        record['explanation'] = explanation
        errors.append(record)
len(df), len(correct), len(errors), len(empty), len(correct) + len(errors)

(324009, 324009, 0, 129964, 324009)

In [73]:
# Fraction of merged units that have no rater output.
len(empty) / len(df_merge)

0.40111231478137954

In [84]:
# Row count of the merged frame that is about to be written out.
len(df_merge)

324009

In [85]:
# Persist the merged units (without the index) to the v5 output folder.
df_merge.to_csv('appen/output_batch_correct_v5/query_attribution_till_032423_valid_units.csv', index=False)

# fix missing

In [2]:
# Reload the units that were still unlabelled, plus the newly delivered file.
# NOTE: this rebinds `df`, which earlier held the concatenated labelled batches.
df_missed = pd.read_csv('appen/output_batch_correct_v5/query_attribution_missed_032423.csv')
df = pd.read_csv('appen/output_batch_correct_v5/Appen_69units.csv')

In [7]:
# Number of units that were still unlabelled.
len(df_missed)

410

In [6]:
# label_ordering values in the delivery that are not among the missed units;
# an empty set means every delivered unit was expected.
set(df.label_ordering) - set(df_missed.label_ordering)

set()

In [15]:
# Left-join the delivered units with the original data on label_ordering,
# suffixing the original columns so both versions can be compared.
original_column_names = {
    'query': 'query_original',
    'sample_method': 'sample_method_original',
    'top_query_classification_taxonomy': 'top_query_classification_taxonomy_original',
}
df_merge = df.merge(df_original.rename(columns=original_column_names), on='label_ordering', how='left')

In [20]:
# Rows whose delivered query text differs from the original query text.
df_merge[df_merge['query'] != df_merge['query_original']]

Unnamed: 0,query,sample_method,label_ordering,top_query_classification_taxonomy,no_attributes,unlisted_value,rater_output,query_original,sample_method_original,top_query_classification_taxonomy_original
48,visÃ£o noturna pard nv007s,uniform,3133903,Sports > Hunting > Hunting Optics > Night Visions,,,Sports > Hunting > Hunting Optics > Night Visions,visão noturna pard nv007s,uniform,Sports > Hunting > Hunting Optics > Night Visions


In [21]:
# Replace the delivered query text with the original text for every row,
# discarding the delivered version (including the mis-encoded row shown above).
df_merge['query'] = df_merge['query_original']

In [22]:
# Re-check: no row should differ from the original query any more.
df_merge[df_merge['query'] != df_merge['query_original']]

Unnamed: 0,query,sample_method,label_ordering,top_query_classification_taxonomy,no_attributes,unlisted_value,rater_output,query_original,sample_method_original,top_query_classification_taxonomy_original


In [18]:
# True if any delivered unit has no query_original after the left merge
# (an unmatched label_ordering leaves the original columns as NaN).
df_merge['query_original'].isna().any()

False

In [23]:
# Validate the rater output of every delivered unit.
#   correct - records whose every output line passes validation (includes empty ones)
#   errors  - records with at least one invalid line; the first problem found
#             is stored under the 'explanation' key
#   empty   - records with no rater output at all
# A line is split on ' > ' and its second-to-last segment is treated as the
# attribute name, which must be a member of `attributes`.
correct = []
errors = []
empty = []
for record in df_merge.to_dict('records'):
    raw_output = record['rater_output']
    if isinstance(raw_output, str):
        output_lines = raw_output.split('\n')
    elif pd.isna(raw_output):
        # pd.isna also covers None/pd.NA, where np.isnan raises TypeError.
        output_lines = []
    else:
        # A non-string, non-missing value (e.g. a number parsed by read_csv)
        # is validated as text instead of crashing on a missing .split.
        output_lines = str(raw_output).split('\n')
    if not output_lines:
        empty.append(record)
    explanation = None
    for line in output_lines:
        segments = line.split(' > ')
        if len(segments) < 2:
            explanation = f"{line} has no attribute name"
            break
        if segments[-2] not in attributes:
            explanation = f"{line} has invalid attribute name, because {segments[-2]} is not a valid attribute name"
            break
    if explanation is None:
        correct.append(record)
    else:
        record['explanation'] = explanation
        errors.append(record)
len(df), len(correct), len(errors), len(empty), len(correct) + len(errors)

(69, 0, 69, 0, 69)

In [27]:
# Persist the invalid records, including their 'explanation' field.
pd.DataFrame(errors).to_csv('appen/output_batch_correct_v5/Appen_69units_invalid.csv', index=False)

# rework missing

In [44]:
# Reload the missed units together with the reworked delivery.
# NOTE: this rebinds `df` to the rework file.
df_missed = pd.read_csv('appen/output_batch_correct_v5/query_attribution_missed_032423.csv')
df = pd.read_csv('appen/output_batch_correct_v5/Query Att_69units_rework.csv')

In [45]:
# Left-join the reworked units with the original data on label_ordering,
# suffixing the original columns so both versions can be compared.
original_column_names = {
    'query': 'query_original',
    'sample_method': 'sample_method_original',
    'top_query_classification_taxonomy': 'top_query_classification_taxonomy_original',
}
df_merge = df.merge(df_original.rename(columns=original_column_names), on='label_ordering', how='left')

In [46]:
# Rows whose reworked query text differs from the original query text.
df_merge[df_merge['query'] != df_merge['query_original']]

Unnamed: 0,label_ordering,sample_method,query,l2_category,query_category,no_attributes,unlisted_value,rater_output,query_original,sample_method_original,top_query_classification_taxonomy_original


In [47]:
# True if any reworked unit has no query_original after the left merge
# (an unmatched label_ordering leaves the original columns as NaN).
df_merge['query_original'].isna().any()

False

In [48]:
# (distinct query strings, row count); equal values mean no query is duplicated.
len(set(df['query'])), len(df)

(69, 69)

In [49]:
# Validate the rater output of every reworked unit.
#   correct - records whose every output line passes validation (includes empty ones)
#   errors  - records with at least one invalid line; the first problem found
#             is stored under the 'explanation' key
#   empty   - records with no rater output at all
# A line is split on ' > ' and its second-to-last segment is treated as the
# attribute name, which must be a member of `attributes`.
correct = []
errors = []
empty = []
for record in df_merge.to_dict('records'):
    raw_output = record['rater_output']
    if isinstance(raw_output, str):
        output_lines = raw_output.split('\n')
    elif pd.isna(raw_output):
        # pd.isna also covers None/pd.NA, where np.isnan raises TypeError.
        output_lines = []
    else:
        # A non-string, non-missing value (e.g. a number parsed by read_csv)
        # is validated as text instead of crashing on a missing .split.
        output_lines = str(raw_output).split('\n')
    if not output_lines:
        empty.append(record)
    explanation = None
    for line in output_lines:
        segments = line.split(' > ')
        if len(segments) < 2:
            explanation = f"{line} has no attribute name"
            break
        if segments[-2] not in attributes:
            explanation = f"{line} has invalid attribute name, because {segments[-2]} is not a valid attribute name"
            break
    if explanation is None:
        correct.append(record)
    else:
        record['explanation'] = explanation
        errors.append(record)
len(df), len(correct), len(errors), len(empty), len(correct) + len(errors)

(69, 69, 0, 24, 69)

In [50]:
# Persist the reworked units that passed validation.
pd.DataFrame(correct).to_csv('appen/output_batch_correct_v5/Appen_69units_rework_valid.csv', index=False)