In [4]:
import pandas as pd 
from copy import deepcopy
from thefuzz import process as fuzz_process
from tqdm import tqdm

In [5]:
v121_tax = pd.read_json('../../../data/taxonomy/wish_newtax.json', lines=True)

In [6]:
paths = v121_tax[v121_tax.category_path.apply(len) > 0].category_path.tolist()

In [7]:
paths.append("No Categories Match")

In [8]:
paths_set = set(paths)

In [9]:
def transform_nans(x):
    if pd.isna(x):
        return ""
    else:
        return str(x)

def transform_unnamed(l):
    l = deepcopy(l)
    lout = []
    for ind, i in enumerate(l):
        if i.startswith('Unnamed:'):
            lout.append(l[ind-1])
            l[ind] = l[ind-1]
        else:
            lout.append(i)
    return lout
    
def recreate_col_names(df):
    col_names = []
    for n1, n2 in zip(transform_unnamed(df.columns.tolist()), [transform_nans(i) for i in df.loc[0].tolist()]):
        if len(n1) > 0 and len(n2) > 0:
            if n1.endswith(n2):
                n3 = n1 
            elif n2.startswith(n1):
                n3 = n2
            else:
                n3 = n1 + " " + n2
        elif len(n1) > 0 and len(n2) == 0:
            n3 = n1
        elif len(n1) == 0 and len(n2) > 0:
            n3 = n2
        else:
            raise Exception('empty columne names')
        col_names.append(n3)
    return col_names

def rename_recreated_cols(df):
    df.columns = recreate_col_names(df)
    df = df.loc[1:]
    return df

def match_full_path(df, manual_correction=None):
    recs = []
    for i in tqdm(df.to_dict('records')):
        for col in [ 
            'Most confident taxonomy path Full Path', 
            '2nd Most confident taxonomy path Full Path', 
            '3rd Most confident taxonomy path Full Path'
        ]:
            p = i[col]
            if pd.isna(p):
                i[col] = None
                continue
            if p.startswith('Retrieving data'):
                print(p)
                i[col] = None
                continue
            p = p.strip()
            if manual_correction is not None and p in manual_correction:
                print(f"manually correct {p} into {manual_correction[p]}")
                p = manual_correction[p]
                i[col] = p
            if p not in paths_set:
                match = fuzz_process.extractOne(query=p, choices=paths)
                print(col, p, match)
                if match[1] <= 90:
                    print('discard')
                    i[col] = None
                else:
                    print('keep')
                    i[col] = match[0]
        recs.append(i)
    return pd.DataFrame(recs)

def collect_all_paths(df):
    recs = []
    for i in tqdm(df.to_dict('records')):
        paths_collector = []

        if i['Most confident taxonomy path Full Path'] == "No Categories Match":
            assert pd.isna(i['2nd Most confident taxonomy path Full Path']) & \
                pd.isna(i['3rd Most confident taxonomy path Full Path'])
        
        for col in [ 
            'Most confident taxonomy path Full Path', 
            '2nd Most confident taxonomy path Full Path', 
            '3rd Most confident taxonomy path Full Path'
        ]:
            if not pd.isna(i[col]):
                paths_collector.append(i[col])
        
        assert len(paths_collector) > 0
        while len(paths_collector) != 3:
            paths_collector.append(None)
        i['All SortedByConfidenceHighestFirst taxonomy path Full Paths'] = paths_collector
        recs.append(i)
    return pd.DataFrame(recs)


# day 1

In [10]:
df_day1_labeler1 = pd.read_excel('day1/Labeller 1 - 30 Nov & 1 Dec 2022.xlsx')

In [11]:
df_day1_labeler2 = pd.read_excel('day1/Labeller 2 - 30 Nov & 1 Dec 2022.xlsx', sheet_name='Query (2)')

In [12]:
df_day1_labeler1 = rename_recreated_cols(df_day1_labeler1)
df_day1_labeler2 = rename_recreated_cols(df_day1_labeler2)

In [13]:
assert tuple(df_day1_labeler1.columns.tolist()) == tuple(df_day1_labeler2.columns.tolist())
assert len(df_day1_labeler1) == len(df_day1_labeler2)

In [14]:
df_day1_labeler1 = match_full_path(df_day1_labeler1)

 17%|█▋        | 134/780 [00:01<00:05, 124.72it/s]

Most confident taxonomy path Full Path toys & hobbies > puzzles & games ('Toys & Hobbies > Puzzles & Games', 100)
keep
Retrieving data. Wait a few seconds and try to cut or copy again.
Retrieving data. Wait a few seconds and try to cut or copy again.


 85%|████████▌ | 664/780 [00:02<00:00, 354.86it/s]

3rd Most confident taxonomy path Full Path Home & Garden > Home Decor > Wall Sticker ('Home & Garden > Home Decor > Wall Stickers', 99)
keep


100%|██████████| 780/780 [00:03<00:00, 250.41it/s]

2nd Most confident taxonomy path Full Path Watches > Men's Watches > Quartz Watch ("Watches > Men's Watches > Quartz Watches", 97)
keep





In [15]:
manual_correction = {
    "Women's Clothing > Outerwear > JacketsWomen's Clothing > Outerwear > Blazers": "Women's Clothing > Outerwear > Blazers",
    "Luggage & Bags > Coin Purses & Holders > Coin PursesLuggage & Bags > Men's Bags > Wallets": "Luggage & Bags > Coin Purses & Holders > Coin Purses"
}

In [16]:
df_day1_labeler2 = match_full_path(df_day1_labeler2, manual_correction=manual_correction)

 17%|█▋        | 134/780 [00:01<00:05, 122.33it/s]

Most confident taxonomy path Full Path toys & hobbies > puzzles & games ('Toys & Hobbies > Puzzles & Games', 100)
keep


 24%|██▍       | 187/780 [00:01<00:05, 106.36it/s]

Most confident taxonomy path Full Path home & garden > bathroom products > bathroom gadgets > toothpaste squeezers ('Home & Garden > Bathroom Products > Bathroom Gadgets > Toothpaste Squeezers', 100)
keep
manually correct Women's Clothing > Outerwear > JacketsWomen's Clothing > Outerwear > Blazers into Women's Clothing > Outerwear > Blazers
manually correct Luggage & Bags > Coin Purses & Holders > Coin PursesLuggage & Bags > Men's Bags > Wallets into Luggage & Bags > Coin Purses & Holders > Coin Purses


100%|██████████| 780/780 [00:02<00:00, 336.65it/s]

2nd Most confident taxonomy path Full Path Mother & Kids > Activity & Gear > Bouncers,Jumperms & Swings ('Mother & Kids > Activity & Gear > Bouncers,Jumpers & Swings', 99)
keep





In [17]:
df_day1_labeler1_short = collect_all_paths(df_day1_labeler1[['Sr No', 'cnt', 'gmv', 'sample_method', 'query', 'translated query (if needed)', 
    'Most confident taxonomy path Full Path', '2nd Most confident taxonomy path Full Path', 
    '3rd Most confident taxonomy path Full Path', 'Names', 'Date of Search'
]])


100%|██████████| 780/780 [00:00<00:00, 226703.42it/s]


In [18]:
df_day1_labeler2_short = collect_all_paths(df_day1_labeler2[['Sr No', 'cnt', 'gmv', 'sample_method', 'query', 'translated query (if needed)', 
    'Most confident taxonomy path Full Path', '2nd Most confident taxonomy path Full Path', 
    '3rd Most confident taxonomy path Full Path', 'Names', 'Date of Search'
]])

100%|██████████| 780/780 [00:00<00:00, 199375.78it/s]


In [19]:
col_names = []
for i in df_day1_labeler1_short.columns:
    if i in [ 
        'Most confident taxonomy path Full Path', 
        '2nd Most confident taxonomy path Full Path', 
        '3rd Most confident taxonomy path Full Path', 
        'Names',
        'Date of Search', 
        'All SortedByConfidenceHighestFirst taxonomy path Full Paths'
    ]:
        col_names.append(f"Labeler1 {i}")
    else:
        col_names.append(i)
df_day1_labeler1_short.columns = col_names

col_names = []
for i in df_day1_labeler2_short.columns:
    if i in [ 
        'Most confident taxonomy path Full Path', 
        '2nd Most confident taxonomy path Full Path', 
        '3rd Most confident taxonomy path Full Path', 
        'Names',
        'Date of Search', 
        'All SortedByConfidenceHighestFirst taxonomy path Full Paths'
    ]:
        col_names.append(f"Labeler2 {i}")
    else:
        col_names.append(i)
df_day1_labeler2_short.columns = col_names

In [20]:
df_day1_merged_short = df_day1_labeler1_short.merge(df_day1_labeler2_short[[ 
    'Sr No',
    'Labeler2 Most confident taxonomy path Full Path', 
    'Labeler2 2nd Most confident taxonomy path Full Path', 
    'Labeler2 3rd Most confident taxonomy path Full Path', 
    'Labeler2 Names',
    'Labeler2 Date of Search', 
    'Labeler2 All SortedByConfidenceHighestFirst taxonomy path Full Paths'
]], on="Sr No", how='inner')

In [21]:
assert len(df_day1_merged_short) == len(df_day1_labeler1_short) == len(df_day1_labeler2_short)

In [22]:
assert (df_day1_merged_short['Labeler1 All SortedByConfidenceHighestFirst taxonomy path Full Paths'].apply(len) == 3).all()
assert (df_day1_merged_short['Labeler2 All SortedByConfidenceHighestFirst taxonomy path Full Paths'].apply(len) == 3).all()

In [23]:
df_day1_merged_short_agree = df_day1_merged_short[
    df_day1_merged_short['Labeler1 All SortedByConfidenceHighestFirst taxonomy path Full Paths'].apply(tuple) == \
        df_day1_merged_short['Labeler2 All SortedByConfidenceHighestFirst taxonomy path Full Paths'].apply(tuple)
]
df_day1_merged_short_disagree = df_day1_merged_short[
    df_day1_merged_short['Labeler1 All SortedByConfidenceHighestFirst taxonomy path Full Paths'].apply(tuple) != \
        df_day1_merged_short['Labeler2 All SortedByConfidenceHighestFirst taxonomy path Full Paths'].apply(tuple)
]

In [24]:
df_day1_merged_short_agree['Tiebreaked All SortedByConfidenceHighestFirst taxonomy path Full Paths'] = df_day1_merged_short_agree[ 
    'Labeler2 All SortedByConfidenceHighestFirst taxonomy path Full Paths'
].tolist()
df_day1_merged_short_disagree['Tiebreaked All SortedByConfidenceHighestFirst taxonomy path Full Paths'] = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_day1_merged_short_agree['Tiebreaked All SortedByConfidenceHighestFirst taxonomy path Full Paths'] = df_day1_merged_short_agree[
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_day1_merged_short_disagree['Tiebreaked All SortedByConfidenceHighestFirst taxonomy path Full Paths'] = None


In [25]:
len(df_day1_merged_short_agree) / len(df_day1_merged_short)

0.0858974358974359

In [26]:
df_day1_merged_short_agree.to_json("day1/agree_11302022_12012022_corrected.json", orient='records', lines=True)

In [27]:
df_day1_merged_short_disagree.to_excel("day1/disagree_11302022_12012022_corrected.xlsx")