In [1]:
import pandas as pd
from collections import defaultdict
import dvc.api
from tqdm import tqdm
import numpy as np
from functools import reduce

# get original data

In [2]:
# Load the two Appen query-attribution batches and stack them, batch1 first.
# Each CSV location is obtained from dvc.api.get_url for the given repo-relative path.
df_original = pd.concat([
    pd.read_csv(dvc.api.get_url(batch_path, repo='git@github.com:ContextLogic/multitask-llm-rnd.git'))
    for batch_path in (
        'modelling/notebooks/query_attr_extract_appen_label/appen_query_attribution_batch1.csv',
        'modelling/notebooks/query_attr_extract_appen_label/appen_query_attribution_batch2.csv',
    )
])

# get attributes

In [3]:
# Attribute definition table. This notebook reads its `attribute_field`,
# `category_attributevalue` and `category` columns.
df_attributes = pd.read_csv('../attribute_extraction_metadata_template/attribute_definition_top25L2_filtered_augmented_redacted_addedsinglemulti_01182023.csv')


def _merge_value_lists(cells):
    """Parse every cell of one group and concatenate the results into a single list.

    Parameters
    ----------
    cells : iterable of str
        The `category_attributevalue` cells belonging to one `attribute_field`.
        Assumes each cell is the repr of a list -- TODO confirm against the CSV.

    Returns
    -------
    list
        All parsed values, in row order.
    """
    merged = []
    for cell in cells:
        # NOTE(review): eval() executes whatever expression the CSV cell contains.
        # If the cells are plain literals, ast.literal_eval would be a safer parser.
        # extend() keeps the concatenation linear; repeated `x + y` copies the
        # accumulated list on every step.
        merged.extend(eval(cell))
    return merged


# One row per attribute_field, holding the concatenated value list of all its rows.
df_attributes_group = (
    df_attributes
    .groupby('attribute_field')
    .agg({'category_attributevalue': _merge_value_lists})
    .reset_index()
)

In [4]:
# (number of attribute fields, total number of values summed over all fields)
df_attributes_group.shape[0], df_attributes_group['category_attributevalue'].map(len).sum()

(230, 516378)

In [5]:
# Set of every attribute field name, used for membership checks on rater output.
attributes = set(df_attributes_group['attribute_field'].tolist())

In [6]:
# Set of every distinct value in the `category` column of the attribute table.
category_paths = set(df_attributes['category'].tolist())

In [7]:
# Expand every category path into all of its ' > '-separated prefixes
# (e.g. "a > b > c" contributes "a", "a > b" and "a > b > c") and add them
# to the set of known category paths.
more_paths = []
for full_path in category_paths:
    levels = full_path.split(' > ')
    for depth in range(1, len(levels) + 1):
        more_paths.append(' > '.join(levels[:depth]))
category_paths.update(more_paths)

# existing appen data

In [8]:
# Read the five Appen output files, in the order they are later concatenated.
df1, df2, df3, df4, df5 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'appen/output_batch_correct_v2/query_attribution_02.03.23.csv',
        'appen/output_batch_correct_v2/query_attribution_03.03.23_rework.csv',
        'appen/output_batch_correct_v2/query_attribution_03.03.23_pending.csv',
        'appen/output_batch_correct_v2/query_attribution_03.10.23_rework.csv',
        'appen/output_batch_correct_v2/query_attribution_03.10.23_new_125k.csv',
    )
)

  df5 = pd.read_csv('appen/output_batch_correct_v2/query_attribution_03.10.23_new_125k.csv')


In [16]:
# df1 uses capitalised headers; map them onto the lower-case names used by the
# other files. Note that `Rater_Answer` becomes `rater_output`.
df1 = df1.rename(
    {
        'L2_Category': 'l2_category',
        'Label_Ordering': 'label_ordering',
        'Query': 'query',
        'Query_Category': 'query_category',
        'Rater_Answer': 'rater_output',
        'Sample_Method': 'sample_method',
    },
    axis='columns',
)

In [15]:
# Column-name set of each input frame, shown side by side to compare schemas.
tuple(set(frame.columns) for frame in (df1, df2, df3, df4, df5))

({'L2_Category',
  'Label_Ordering',
  'Query',
  'Query_Category',
  'Rater_Answer',
  'Sample_Method'},
 {'l2_category',
  'label_ordering',
  'no_attributes',
  'query',
  'query_category',
  'rater_output',
  'sample_method',
  'translated_query',
  'unlisted_value'},
 {'l2_category',
  'label_ordering',
  'no_attributes',
  'query',
  'query_category',
  'rater_output',
  'sample_method',
  'unlisted_value'},
 {'l2_category',
  'label_ordering',
  'no_attributes',
  'query',
  'query_category',
  'rater_output',
  'sample_method',
  'unlisted_value'},
 {'l2_category',
  'label_ordering',
  'no_attributes',
  'query',
  'query_category',
  'rater_output',
  'sample_method',
  'unlisted_value'})

In [19]:
# Stack all Appen files row-wise. The order matters: the later de-duplication
# keeps the last occurrence of each label_ordering.
df = pd.concat(objs=(df1, df2, df3, df4, df5), axis=0)

In [21]:
# Keep one row per label_ordering: the last one in concatenation order.
df_uniq = df[~df.duplicated(subset='label_ordering', keep='last')]

In [22]:
# Row counts before and after de-duplication.
df.shape[0], df_uniq.shape[0]

(281563, 249723)

In [26]:
# Attach the query text from the original data as `query_original`, matching
# rows on label_ordering (inner join).
df_uniq_merge = pd.merge(
    df_uniq,
    df_original[['label_ordering', 'query']].rename(columns={'query': 'query_original'}),
    how='inner',
    on='label_ordering',
)

In [27]:
# Row count after the merge.
df_uniq_merge.shape[0]

249723

In [31]:
# Wherever the Appen file's query text differs from the original data, overwrite
# it with the original query.
query_mismatch = df_uniq_merge['query'].ne(df_uniq_merge['query_original'])
df_uniq_merge.loc[query_mismatch, 'query'] = df_uniq_merge.loc[query_mismatch, 'query_original']

In [33]:
def _split_rater_output(rater_output):
    """Split one raw `rater_output` cell into its newline-separated lines.

    Strings are split on the newline character. Missing values (NaN, None,
    pd.NA) yield an empty list. Any other scalar is converted with str() first,
    so an unexpected numeric cell is validated like text instead of crashing.
    """
    if isinstance(rater_output, str):
        return rater_output.split('\n')
    # pd.isna also covers None and pd.NA, for which np.isnan raises TypeError.
    if pd.isna(rater_output):
        return []
    return str(rater_output).split('\n')


def _explain_invalid_line(line, valid_attributes):
    """Return the reason `line` is invalid, or None when it passes both checks.

    A line is split on ' > '. It needs at least two parts, and its
    second-to-last part must be a member of `valid_attributes`.
    """
    parts = line.split(' > ')
    if len(parts) < 2:
        return f"{line} has no attribute name"
    if parts[-2] not in valid_attributes:
        return f"{line} has invalid attribute name, because {parts[-2]} is not a valid attribute name"
    return None


# Sort every merged record into `correct` or `errors`. A record is an error as
# soon as one of its lines fails validation; the reason for that first failing
# line is stored under 'explanation'. Records with no rater output are
# collected in `empty` AND also counted as `correct`.
correct = []
errors = []
empty = []
for record in df_uniq_merge.to_dict('records'):
    lines = _split_rater_output(record['rater_output'])
    if not lines:
        empty.append(record)
    for line in lines:
        explanation = _explain_invalid_line(line, attributes)
        if explanation is not None:
            record['explanation'] = explanation
            errors.append(record)
            break
    else:
        # Reached only when no line failed (including the no-lines case).
        correct.append(record)
len(df), len(correct), len(errors), len(empty), len(correct) + len(errors)

(281563, 149423, 100300, 39951, 249723)

In [34]:
# Ratio of records with no rater output to records classified as valid.
# Records with no output are appended to both `empty` and `correct`, so this is
# the share of "valid" records that carry no annotation at all.
len(empty) / len(correct)

0.2673684774097696

In [35]:
# Turn the two record lists into frames. Only error records carry an
# 'explanation' key.
df_correct, df_error = (pd.DataFrame(rows) for rows in (correct, errors))

In [38]:
# Row counts of the valid and invalid frames.
df_correct.shape[0], df_error.shape[0]

(149423, 100300)

In [37]:
# Write the valid and invalid units to the v3 output directory. The row index is
# written as the first column (pandas default, spelled out here).
df_correct.to_csv(path_or_buf='appen/output_batch_correct_v3/query_attribution_till_030923_valid_units.csv', index=True)
df_error.to_csv(path_or_buf='appen/output_batch_correct_v3/query_attribution_till_030923_invalid_units.csv', index=True)