In [40]:
import pandas as pd
from collections import Counter
import re
import numpy as np

In [41]:
df_appen = pd.read_csv('appen/Query Classification_12.12.22.csv').sort_values('label_ordering')

In [42]:
df_appen = df_appen.rename(columns={'product_classification': 'query_classifications'})

In [43]:
df_offshore = pd.read_excel('offshore/TieBreaker File - 8 Dec 22.xlsx').sort_values('Sr No')

In [44]:
df_offshore = df_offshore[['Sr No', 'query', 'Tiebreaked All SortedByConfidenceHighestFirst taxonomy path Full Paths']].rename(
    columns={'Sr No': 'label_ordering', 
        'Tiebreaked All SortedByConfidenceHighestFirst taxonomy path Full Paths': 'query_classifications'}
)

In [45]:
df_tax = pd.read_json('../taxonomy/wish_newtax.json', lines=True)

In [46]:
df_tax = df_tax[df_tax.category_path.apply(len) > 0]

In [47]:
class Trie:
    def __init__(self):
        self.trie = {}

    def add_text(self, text):
        node = self.trie
        for ind, i in enumerate(text):
            if i not in node:
                node[i] = {}
            node = node[i]
            if ind == len(text) - 1:
                node['<EOS>'] = 1
    
    def extract_from_text(self, text):
        node = self.trie
        res = []
        cur_res = ''
        for ind, i in enumerate(text):
            if i in node:
                node = node[i]
                cur_res += i
            else:
                try:
                    assert '<EOS>' in node
                except:
                    print('ERROR: ', text)
                    return None
                res.append(cur_res)
                cur_res = ''
                node = self.trie
        if len(cur_res) > 0:
            res.append(cur_res)
        
        return res

In [48]:
trie = Trie()

In [49]:
for i in df_tax.category_path.apply(lambda x: x.strip()).tolist():
    trie.add_text(i)
for i in df_tax.category_path.apply(lambda x: x.strip().lower()).tolist():
    trie.add_text(i)

In [50]:
trie.add_text('nan')
trie.add_text('No Categories Match')

In [51]:
manual_correction = {
    "Beauty & Health > Sexual Wellness > Sex Toys,Beauty & Health > Sexual Wellness > SM Products ,nan": \
        "Beauty & Health > Sexual Wellness > Sex Toys,Beauty & Health > Sexual Wellness > SM Products,nan",
    "Apparel Accessories > Girls' Accessories > Girls' Hair Accessories -,Apparel Accessories > Women's Hair Accessories,sports > fitness & body building > yoga > yoga hair bands": \
        "Apparel Accessories > Girls' Accessories > Girls' Hair Accessories,Apparel Accessories > Women's Hair Accessories,sports > fitness & body building > yoga > yoga hair bands",
    "Women's Clothing > Dresses ,women's clothing > weddings & events > wedding dresses,mother & kids > pregnancy & maternity > dresses": \
        "Women's Clothing > Dresses,women's clothing > weddings & events > wedding dresses,mother & kids > pregnancy & maternity > dresses",
    "Women's Clothing > Dresses, Mother & Kids > Girls' Baby Clothing > Dresses,Mother & Kids > Pregnancy & Maternity > Dresses": \
        "Women's Clothing > Dresses,Mother & Kids > Girls' Baby Clothing > Dresses,Mother & Kids > Pregnancy & Maternity > Dresses", 
    "Home & Garden > Arts, Crafts & Sewing > Apparel Sewing & Fabric > Buttons,Retrieving data. Wait a few seconds and try to cut or copy again.,nan": \
        "Home & Garden > Arts, Crafts & Sewing > Apparel Sewing & Fabric > Buttons,nan,nan"
}

In [52]:
df_offshore['query_classifications'] = df_offshore.query_classifications.apply(lambda x: manual_correction[x] if x in manual_correction else x)

In [53]:
df_offshore['query_classification_lists'] = df_offshore.query_classifications.apply(trie.extract_from_text)

In [54]:
path2id = {}
for i in df_tax.to_dict('records'):
    path2id[i['category_path'].lower().strip()] = i['id']
    path2id[i['category_path'].strip()] = i['id']

In [55]:
id2path = {}
for i in df_tax.to_dict('records'):
    id2path[i['id']] = i['category_path']

In [56]:
path2id['nan'] = -1
path2id['No Categories Match'] = -1

In [57]:
df_offshore['query_classification_ids'] = df_offshore['query_classification_lists'].apply(lambda x: [path2id[i] for i in x])

In [58]:
df_appen['query_classification_lists'] = df_appen['query_classifications'].apply(trie.extract_from_text)
df_appen['query_classification_ids'] = df_appen['query_classification_lists'].apply(lambda x: [path2id[i] for i in x])

In [59]:
df_join = df_offshore[['query', 'query_classification_ids']].rename(columns={'query_classification_ids': 'offshore_query_classification_ids'}).merge( 
    df_appen[['query', 'query_classification_ids']].rename(columns={'query_classification_ids': 'appen_query_classification_ids'}), 
    on='query', how='inner'
)

In [60]:
def convert_id_to_path(li):
    if set(li) == set([-1]):
        return []
    else:
        res = [] 
        for i in li:
            if int(i) in id2path:
                res.append(id2path[int(i)])
        return res

In [61]:
df_join['offshore_query_classification_lists'] = df_join['offshore_query_classification_ids'].apply(convert_id_to_path).apply(tuple)
df_join['appen_query_classification_lists'] = df_join['appen_query_classification_ids'].apply(convert_id_to_path).apply(tuple)

In [62]:
df_join[['query', 'offshore_query_classification_lists', 'appen_query_classification_lists']].to_excel(
    'analysis/appen_label_quality_check_20221212.xlsx'
)

In [63]:
from sklearn.metrics import accuracy_score, classification_report

In [64]:
lab_offshore = np.zeros((len(df_join), max(df_tax['id'])))
lab_appen = np.zeros((len(df_join), max(df_tax['id'])))

In [65]:
accuracy_score(
    y_true=df_join['offshore_query_classification_lists'].apply(lambda x: '|'.join(x)),
    y_pred=df_join['appen_query_classification_lists'].apply(lambda x: '|'.join(x))
)

0.4125560538116592

In [66]:
for ind, i in enumerate(df_join.to_dict('records')):
    for j in i['offshore_query_classification_ids']:
        if j != -1:
            lab_offshore[ind][j] = 1. 
    for j in i['appen_query_classification_ids']:
        if j != -1:
            lab_appen[ind][j] = 1. 
    

In [67]:
df_metrics = pd.DataFrame(classification_report(y_true=lab_offshore, y_pred=lab_appen, zero_division=0, output_dict=True)).T

In [68]:
df_metrics

Unnamed: 0,precision,recall,f1-score,support
0,0.000000,0.000000,0.000000,0.0
1,0.000000,0.000000,0.000000,0.0
2,0.000000,0.000000,0.000000,0.0
3,0.000000,0.000000,0.000000,0.0
4,0.000000,0.000000,0.000000,0.0
...,...,...,...,...
6144,0.000000,0.000000,0.000000,0.0
micro avg,0.748879,0.414392,0.533546,403.0
macro avg,0.022593,0.022023,0.022074,403.0
weighted avg,0.440447,0.414392,0.420377,403.0


In [69]:
df_join.offshore_query_classification_ids.apply(lambda x: len([i for i in x if i != -1])).value_counts()

1    112
3     73
2     36
0      2
Name: offshore_query_classification_ids, dtype: int64

In [70]:
df_join.appen_query_classification_ids.apply(lambda x: len([i for i in x if i != -1])).value_counts()

1    223
Name: appen_query_classification_ids, dtype: int64