In [129]:
import pandas as pd
from collections import Counter
import re
import numpy as np

In [130]:
df_tax = pd.read_json('../taxonomy/wish_newtax.json', lines=True)
df_tax = df_tax[df_tax.category_path.apply(len) > 0]

In [131]:
df_original = pd.read_json("../query/top_used_wish_queries_offshore_sample_100000.json", lines=True)

In [132]:
df_offshore_test = pd.read_json('./processed/Offshore_Labelled_Query_Classification_Test.json', lines=True)

In [133]:
appen_files = [
    'appen/Query Classification_01.06.23.xlsx',
    'appen/Query Classification_12.30.22.xlsx'
]
df_appen = pd.concat([pd.read_excel(i) for i in appen_files])

In [134]:

df_appen = df_appen.merge(df_original[['index', 'query', 'sample_method']].rename(columns={'query': 'orig_query'}), on='index', how='inner')
df_appen.loc[df_appen['query'] != df_appen['orig_query'], "query"] = df_appen.loc[df_appen['query'] != df_appen['orig_query'], "orig_query"]
assert len(df_appen) == len(set(df_appen['query']))


In [135]:
offshore_files = [ 
    'offshore/TieBreaker File - 8 Dec 22.xlsx',
    'offshore/TieBreaker File - 9 Dec 22.xlsx',
    'offshore/TieBreaker File - 12 Dec 22.xlsx',
    'offshore/TieBreaker File - 13 Dec 22.xlsx',
    'offshore/TieBreaker File - 14 Dec 22.xlsx',
    'offshore/TieBreaker File - 15 Dec 22.xlsx',
    'offshore/TieBreaker File - 16 Dec 22.xlsx',
    'offshore/TieBreaker File - 19 Dec 22.xlsx',
    'offshore/TieBreaker File - 20 Dec 22.xlsx',
    'offshore/TieBreaker File - 21 Dec 22.xlsx',
    'offshore/TieBreaker File - 22 Dec 22.xlsx',
    'offshore/TieBreaker File - 23 Dec 22.xlsx',
    'offshore/TieBreaker File - 26 Dec 22.xlsx',
    'offshore/TieBreaker File - 27 _ 28 Dec 22.xlsx',
    'offshore/TieBreaker File - 29 Dec.xlsx',
    'offshore/TieBreaker File - 30 Dec 22 to 1 Jan 23.xlsx',
    'offshore/TieBreaker File - 2 Jan 23.xlsx',
    'offshore/TieBreaker File - 3 _ 4 Jan 23.xlsx',
    'offshore/TieBreaker File - 5 Jan 23.xlsx',
    'offshore/TieBreaker File - 6 _ 7 Jan 23.xlsx'
]
dfs = []
for i in offshore_files:
    tmp = pd.read_excel(i)
    tmp['filename'] = i
    dfs.append(tmp)

df_offshore = pd.concat(dfs)[['index', 'filename', 'Sr No', 'query', 'sample_method', 'gmv', 'cnt', 
    'Tiebreaked All SortedByConfidenceHighestFirst taxonomy path Full Paths']].rename( 
        columns={
            'Sr No': 'label_ordering', 
            'Tiebreaked All SortedByConfidenceHighestFirst taxonomy path Full Paths': 'query_classifications'}
    )


df_offshore['label_ordering'] = df_offshore['label_ordering'].astype(int)
df_offshore['index'] = df_offshore['index'].astype(int)
df_offshore['query'] = df_offshore['query'].astype(str)
df_offshore = df_offshore.merge(df_original[['index', 'query']].rename(columns={'query': 'orig_query'}), on='index', how='inner')
df_offshore.loc[df_offshore['query'] != df_offshore['orig_query'], "query"] = df_offshore.loc[df_offshore['query'] != df_offshore['orig_query'], "orig_query"]
assert len(df_offshore) == len(set(df_offshore['query']))


In [136]:

errors = []
class Trie:
    def __init__(self):
        self.trie = {}

    def add_text(self, text):
        node = self.trie
        for ind, i in enumerate(text):
            if i not in node:
                node[i] = {}
            node = node[i]
            if ind == len(text) - 1:
                node['<EOS>'] = 1
    
    def extract_from_text(self, text):
        node = self.trie
        res = []
        cur_res = ''
        for ind, i in enumerate(text):
            if i in node:
                node = node[i]
                cur_res += i
            else:
                try:
                    assert '<EOS>' in node
                except:
                    print('ERROR: ', text, res)
                    errors.append(text)
                    return None
                res.append(cur_res)
                cur_res = ''
                node = self.trie
        if len(cur_res) > 0:
            res.append(cur_res)
        
        return res
trie = Trie()
for i in df_tax.category_path.apply(lambda x: x.strip()).tolist():
    trie.add_text(i)
for i in df_tax.category_path.apply(lambda x: x.strip().lower()).tolist():
    trie.add_text(i)
trie.add_text('nan')
trie.add_text('No Categories Match')
manual_correction = {
    "Beauty & Health > Sexual Wellness > Sex Toys,Beauty & Health > Sexual Wellness > SM Products ,nan": \
        "Beauty & Health > Sexual Wellness > Sex Toys,Beauty & Health > Sexual Wellness > SM Products,nan",
    "Apparel Accessories > Girls' Accessories > Girls' Hair Accessories -,Apparel Accessories > Women's Hair Accessories,sports > fitness & body building > yoga > yoga hair bands": \
        "Apparel Accessories > Girls' Accessories > Girls' Hair Accessories,Apparel Accessories > Women's Hair Accessories,sports > fitness & body building > yoga > yoga hair bands",
    "Women's Clothing > Dresses ,women's clothing > weddings & events > wedding dresses,mother & kids > pregnancy & maternity > dresses": \
        "Women's Clothing > Dresses,women's clothing > weddings & events > wedding dresses,mother & kids > pregnancy & maternity > dresses",
    "Women's Clothing > Dresses, Mother & Kids > Girls' Baby Clothing > Dresses,Mother & Kids > Pregnancy & Maternity > Dresses": \
        "Women's Clothing > Dresses,Mother & Kids > Girls' Baby Clothing > Dresses,Mother & Kids > Pregnancy & Maternity > Dresses", 
    "Home & Garden > Arts, Crafts & Sewing > Apparel Sewing & Fabric > Buttons,Retrieving data. Wait a few seconds and try to cut or copy again.,nan": \
        "Home & Garden > Arts, Crafts & Sewing > Apparel Sewing & Fabric > Buttons,nan,nan",
    "Men's Clothing > Tops & Tees > T-Shirts,Watches > Men's Watches > Quartz Watch,Home & Garden > Kitchen,Dining & Bar > Coffeeware > Coffee Cups & Mugs": \
         "Men's Clothing > Tops & Tees > T-Shirts,Watches > Men's Watches > Quartz Watches,Home & Garden > Kitchen,Dining & Bar > Coffeeware > Coffee Cups & Mugs",
    "education & office supplies > writing & correction supplies > stencils,Crafts & Sewing > Scrapbooking & Stamping > Cutting Dies,home & garden > arts, crafts & sewing > apparel sewing & fabric > diy craft supplies": \
        "education & office supplies > writing & correction supplies > stencils,Home & Garden > Arts, Crafts & Sewing > Scrapbooking & Stamping > Cutting Dies,home & garden > arts, crafts & sewing > apparel sewing & fabric > diy craft supplies",
    "Home & Garden > Home Decor > Painting & Calligraphy,Home Improvement > Painting Supplies & Wall Treatments > Wallpapers,Home & Garden > Home Decor > Wall Sticker": \
        "Home & Garden > Home Decor > Painting & Calligraphy,Home Improvement > Painting Supplies & Wall Treatments > Wallpapers,Home & Garden > Home Decor > Wall Stickers",
    "Computer & Office > Office Electronics > Printer,Computer & Office > Office Electronics > All in One Printer,Education & Office Supplies > Paper > Printer Paper": \
        "Computer & Office > Office Electronics > Printers,Computer & Office > Office Electronics > All in One Printer,Education & Office Supplies > Paper > Printer Paper",
    "Home & Garden > Home Textile > Bedding > Pillow Cases, Home & Garden > Home Textile > Bedding > Quilts,Home & Garden > Home Textile > Duvet Cover Sets": \
        "Home & Garden > Home Textile > Bedding > Pillow Cases,Home & Garden > Home Textile > Bedding > Quilts,Home & Garden > Home Textile > Duvet Cover Sets",
    "Accessories > Earrings >\xa0Hoop Earrings,nan,nan": "Jewelry & Accessories > Earrings > Hoop Earrings,nan,nan", 
    "Shoes > Men's Shoes > Men's Boots,Shoes > Women's Shoes > Women's Boots,Children's Shoes > Boys > Boots Mother & Kids > Children's Shoes > Boys > Boots": \
        "Shoes > Men's Shoes > Men's Boots,Shoes > Women's Shoes > Women's Boots,Mother & Kids > Children's Shoes > Boys > Boots",
    "Beauty & Health > Sexual Wellness > SM Products > Bondage Gear,Beauty & Health > Sexual Wellness ,nan": \
        "Beauty & Health > Sexual Wellness > SM Products > Bondage Gear,Beauty & Health > Sexual Wellness,nan",
    "No Category Match": "No Categories Match,nan,nan",
    "car assessories interior,Mother & Kids > Car Seats & Accessories > Head & Body Supports,Mother & Kids > Car Seats & Accessories > Rear Facing Mirrors": \
        "Mother & Kids > Car Seats & Accessories > Head & Body Supports,Mother & Kids > Car Seats & Accessories > Rear Facing Mirrors,nan",
    "No Category Match,nan,nan": "No Categories Match,nan,nan",
    "Education & Office Supplies > Books & Magazines ,nan,nan": "Education & Office Supplies > Books & Magazines,nan,nan",
    "Apparel Accessories > Men's Accessories > Men's Masks,Apparel Accessories > Women's Accessories > Women's Masks  ,nan": \
        "Apparel Accessories > Men's Accessories > Men's Masks,Apparel Accessories > Women's Accessories > Women's Masks,nan",
    "Women's Clothing > Dresses ,Women's Clothing > Sweaters > Dresses,nan": \
        "Women's Clothing > Dresses,Women's Clothing > Sweaters > Dresses,nan",
    "Women's Clothing > Tops > Blouses & Shirts - ,nan,nan": "Women's Clothing > Tops > Blouses & Shirts,nan,nan",
    "Apparel Accessories > Girls' Accessories > Girls' Hair Accessories -,nan,nan": \
        "Apparel Accessories > Girls' Accessories > Girls' Hair Accessories,nan,nan",
    "Women's Clothing > Tops > Blouses & Shirts ,Women's Clothing > Bottoms > Pants,Women's Clothing > Dresses": \
        "Women's Clothing > Tops > Blouses & Shirts,Women's Clothing > Bottoms > Pants,Women's Clothing > Dresses",
    "Women's Clothing > Women's Exotic Apparel > Lingerie sets,nan,nan": \
        "Women's Clothing > Women's Exotic Apparel > Lingerie Sets,nan,nan",
    "Home & Garden > Garden Supplies > Pest Control > Foggers & Sprayers,\xa0,": \
        "Home & Garden > Garden Supplies > Pest Control > Foggers & Sprayers,nan,nan", 
    "Home & Garden > Kitchen,Dining & Bar > Kitchen Knives & Accessories > Blocks & Roll Bags, Home & Garden > Kitchen,Dining & Bar > Kitchen Knives & Accessories,nan": \
        "Home & Garden > Kitchen,Dining & Bar > Kitchen Knives & Accessories > Blocks & Roll Bags,Home & Garden > Kitchen,Dining & Bar > Kitchen Knives & Accessories,nan",
    "Automobiles & motorcycles > Auto replacement parts > exterior parts > chromium styling,Automobiles & Motorcycles > Auto Replacement Parts > Air Conditioning & Heat > Car Air Conditioning,nan": \
        "Automobiles & Motorcycles > Auto Replacement Parts > Exterior Parts > Chromium Styling,Automobiles & Motorcycles > Auto Replacement Parts > Air Conditioning & Heat > Car Air Conditioning,nan",
    "home improvement > bathroom fixtures > bathroom sinks, faucets & accessories,Home Decor > Bathroom >\xa0Faucets,nan": \
        "Home Improvement > Bathroom Fixtures > Bathroom Sinks, Faucets & Accessories > Basin Faucets,nan,nan",
    "Home Improvement > Bathroom Fixtures > Bathroom Sinks, Faucets & Accessories,Home Decor > Bathroom > Faucets,nan": \
        "Home Improvement > Bathroom Fixtures > Bathroom Sinks, Faucets & Accessories > Basin Faucets,nan,nan",
    "Women's Clothing > Dresses - ,Women's Clothing > Weddings & Events > Cocktail Dresses,Women's Clothing > Weddings & Events > Evening Dresses": \
        "Women's Clothing > Dresses,Women's Clothing > Weddings & Events > Cocktail Dresses,Women's Clothing > Weddings & Events > Evening Dresses",
    "Consumer Electronics > Smart Electronics > Wearable Devices > Smart watches,nan,nan": \
        "Consumer Electronics > Smart Electronics > Wearable Devices > Smart Watches,nan,nan",
    "Apparel Accessories > Men's Accessories > Men's Masks,Apparel Accessories > Women's Accessories > Women's Masks - ,Sports > Sports Accessories > Sports Face Masks > Cycling Face Mask": \
        "Apparel Accessories > Men's Accessories > Men's Masks,Apparel Accessories > Women's Accessories > Women's Masks,Sports > Sports Accessories > Sports Face Masks > Cycling Face Mask", 
    "No match category,nan,nan": "No Categories Match,nan,nan", 
    "Automotive > Parts & Accessories >\xa0Car Accessories,nan,nan": "Automobiles & Motorcycles > Interior Accessories,Automobiles & Motorcycles > Exterior Accessories,nan",
    "Sports > Fishing > Fishing Tools,sports > Fishing > Fishing Accessories,nan": \
        "Sports > Fishing > Fishing Tools,Sports > Fishing > Fishing Accessories,nan",
    "Hobbies > Outdoor & Camping Accessories >\xa0Sports Equipment,nan,nan": \
        "Sports,nan,nan",
    "Toys & hobbies > diecast & toy vehicles,toys & hobbies > play vehicles & models,toys & hobbies > play vehicles & models > railed, motor cars & bicycles": \
        "Toys & Hobbies > Diecast & Toy Vehicles,Toys & Hobbies > Play Vehicles & Models,Toys & Hobbies > Play Vehicles & Models > Railed, Motor Cars & Bicycles",
    "Hobbies > Outdoor & Camping Accessories > Sports Equipment,nan,nan": "Sports,nan,nan", 
    "Consumer electronics > power source > batteries,nan,nan": "Consumer Electronics > Power Source > Batteries,nan,nan",
    "Beauty & Health > Oral Hygiene > Teeth Whitening,Beauty & Health > oral hygiene > dental supplies > dental basic instrument,Beauty & Health > oral hygiene > dental supplies": \
        "Beauty & Health > Oral Hygiene > Teeth Whitening,Beauty & Health > Oral Hygiene > Dental Supplies > Dental Basic Instrument,Beauty & Health > Oral Hygiene > Dental Supplies",
    "Home & Garden > pet products > pet health care & hygiene > supplements & vitamins,Beauty & Health > Health Care > Personal Health Care,nan": \
        "Home & Garden > Pet Products > Pet Health Care & Hygiene > Supplements & Vitamins,Beauty & Health > Health Care > Personal Health Care,nan",
    "Jewelry & accessories > necklaces & pendants,Jewelry & Accessories > Customized Jewelry > Customized Necklaces,nan": \
        "Jewelry & Accessories > Necklaces & Pendants,Jewelry & Accessories > Customized Jewelry > Customized Necklaces,nan",
    "Beauty & Health > Skin Care Tools > Jade Roller,Beauty & health > skin care tools > facial care tool,Beauty & Health > Skin Care Tools > Face Lift Devices": \
        "Beauty & Health > Skin Care Tools > Jade Roller,Beauty & Health > Skin Care Tools > Facial Care Tool,Beauty & Health > Skin Care Tools > Face Lift Devices", 
    "Women's Clothing > Tops > Tees,Women's Clothing > Tops > Blouses & Shirts ,nan": \
        "Women's Clothing > Tops > Tees,Women's Clothing > Tops > Blouses & Shirts,nan",
    "Beauty & Health > Skin Care > Face > Cleansers,Beauty & Health > Makeup > Makeup Remover,Beauty & Health > Skin Care > Face > Clean": \
        "Beauty & Health > Skin Care > Face > Cleansers,Beauty & Health > Makeup > Makeup Remover,nan",
    "Shoes > women's shoes > women's flats,Shoes > Women's Shoes > Loafers,Shoes > Women's Shoes > Women's Pumps": \
        "Shoes > Women's Shoes > Women's Flats,Shoes > Women's Shoes > Loafers,Shoes > Women's Shoes > Women's Pumps",
    "Apparel Accessories > Women's Hair Accessories,Apparel Accessories > Girls' Accessories > Girls' Hair Accessories -,Mother & Kids > Girls' Baby Clothing > Accessories > Hair Accessories": \
        "Apparel Accessories > Women's Hair Accessories,Apparel Accessories > Girls' Accessories > Girls' Hair Accessories,Mother & Kids > Girls' Baby Clothing > Accessories > Hair Accessories",
    "Beauty & Health > Shaving & Hair Removal > Razor Blade,Beauty & Health > Shaving & Hair Removal > Razor ,nan": \
        "Beauty & Health > Shaving & Hair Removal > Razor Blade,Beauty & Health > Shaving & Hair Removal > Razor,nan",
    "Women's Clothing > Tops > Tees,Women's Clothing > Tops > Blouses & Shirts - ,Women's Clothing > Tops > Tank Tops": \
        "Women's Clothing > Tops > Tees,Women's Clothing > Tops > Blouses & Shirts,Women's Clothing > Tops > Tank Tops",
    "Home & garden > bathroom products > bath mats,Home & Garden > Home Textile > Carpets & Rugs > Mats,nan": \
        "Home & Garden > Bathroom Products > Bath Mats,Home & Garden > Home Textile > Carpets & Rugs > Mats,nan",
    "Tools > garden tools > garden power tools > chainsaws,Tools > Power Tools > Electric Saws,Tools > Power Tools": \
        "Tools > Garden Tools > Garden Power Tools > Chainsaws,Tools > Power Tools > Electric Saws,Tools > Power Tools",
    "Home improvement > home appliances > personal care appliances > electric hair brushes,Beauty & Health > Hair Care & Styling > Styling Tools > Combs,Beauty & Health > Hair Care & Styling > Styling Tools > Styling Accessories": \
        "Home Improvement > Home Appliances > Personal Care Appliances > Electric Hair Brushes,Beauty & Health > Hair Care & Styling > Styling Tools > Combs,Beauty & Health > Hair Care & Styling > Styling Tools > Styling Accessories",
    "Men's Clothing > Men's Exotic Apparel > Teddies & Bodysuits,Men's Clothing > Men's Exotic Apparel > Thongs,nan": \
        "Men's Clothing > Men's Exotic Apparel > Teddies & Bodysuits,Men's Clothing > Men's Exotic Apparel > G-Strings & Thongs,nan",
    "Novelty & Special Use > Costumes & Accessories > Women's Costumes > Sexy Costumes,Women's clothing > women's exotic apparel > lingerie sets,nan": \
        "Novelty & Special Use > Costumes & Accessories > Women's Costumes > Sexy Costumes,Women's Clothing > Women's Exotic Apparel > Lingerie Sets,nan",
    "Consumer Electronics > Camera & Photo > Photo Studio > Follow focus,Men's Clothing > Tops & Tees > T-Shirts,Home & Garden > Home Decor > Painting & Calligraphy": \
        "Consumer Electronics > Camera & Photo > Photo Studio > Follow Focus,Men's Clothing > Tops & Tees > T-Shirts,Home & Garden > Home Decor > Painting & Calligraphy",
    "Mother & Kids > Boys' Baby Clothing > Accessories > Hats & Caps,Mother & Kids > Girls' Baby Clothing > Accessories > Hats & Caps ,nan": \
        "Mother & Kids > Boys' Baby Clothing > Accessories > Hats & Caps,Mother & Kids > Girls' Baby Clothing > Accessories > Hats & Caps,nan",
    "Apparel Accessories > Boys' Accessories > Boys' Hats,Apparel Accessories > Girls' Accessories > Girls' Hats,nan": \
        "Apparel Accessories > Boys' Accessories > Boys' Hats,Apparel Accessories > Girls' Accessories > Girls' Hats,nan",
    "Apparel Accessories > Women's Hats > Women's Cowboy Hats,Apparel Accessories > Girls' Accessories > Girls' Hats,nan": \
        "Apparel Accessories > Women's Hats > Women's Cowboy Hats,Apparel Accessories > Girls' Accessories > Girls' Hats,nan",
    "Sports > Camping & Hiking > Tents & Shelters > Tents,Sports > camping & hiking > tents & shelters > sun shelter,nan": \
        "Sports > Camping & Hiking > Tents & Shelters > Tents,Sports > Camping & Hiking > Tents & Shelters > Sun Shelter,nan",
    "Women's clothing > underwear & sleepwear > women's sleepwear > pajama sets,Novelty & Special Use > Costumes & Accessories > Women's Costumes,Novelty & Special Use > Costumes & Accessories > Men's Costumes": \
        "Women's Clothing > Underwear & Sleepwear > Women's Sleepwear > Pajama Sets,Novelty & Special Use > Costumes & Accessories > Women's Costumes,Novelty & Special Use > Costumes & Accessories > Men's Costumes",
    "Apparel Accessories > Girls' Accessories > Girls' Scarves,Apparel Accessories > Men's Scarves,Apparel Accessories > Women's Scarves -": \
        "Apparel Accessories > Girls' Accessories > Girls' Scarves,Apparel Accessories > Men's Scarves,Apparel Accessories > Women's Scarves",
    "Cellphones & Telecommunications > iphones,nan,nan": "Cellphones & Telecommunications > iPhones,nan,nan", 
    "Apparel Accessories > Garment Fabrics & Accessories > Cotton Fabrics - ,nan,nan": \
        "Apparel Accessories > Garment Fabrics & Accessories > Cotton Fabrics,nan,nan", 
    "Men's Clothing > Hoodies & Sweatshirts,Women's Clothing > Tops > Tees,Women's Clothing > Dresses -": \
        "Men's Clothing > Hoodies & Sweatshirts,Women's Clothing > Tops > Tees,Women's Clothing > Dresses"
}
df_offshore['query_classifications'] = df_offshore.query_classifications.apply(lambda x: x.strip().replace('\n', '').replace('| ',',').replace('|',',')).apply(
    lambda x: manual_correction[x] if x in manual_correction else x)

In [137]:
df_offshore['query_classification_lists'] = df_offshore.query_classifications.apply(trie.extract_from_text)

ERROR:  Apparel Accessories > Girls' Accessories > Girls' Hats,Apparel Accessories > Women's Hats,Apparel Accessories > Girls' Accessories > Girls' Hats, Scarves & Gloves Sets []
ERROR:  Apparel Accessories > Boys' Accessories > Boys' Hats,Apparel Accessories > Girls' Accessories > Girls' Hats,nan []
ERROR:  Apparel Accessories > Boys' Accessories > Boys' Hats,Apparel Accessories > Girls' Accessories > Girls' Hats,Apparel Accessories > Men's Hats []
ERROR:  Apparel Accessories > Boys' Accessories > Boys' Hats,Apparel Accessories > Girls' Accessories > Girls' Hats,nan []
ERROR:  Apparel Accessories > Women's Hats > Women's Cowboy Hats,Apparel Accessories > Girls' Accessories > Girls' Hats,nan ["Apparel Accessories > Women's Hats > Women's Cowboy Hats"]
ERROR:  Apparel Accessories > Boys' Accessories > Boys' Hats,Apparel Accessories > Girls' Accessories > Girls' Hats,Apparel Accessories > Girls' Accessories > Girls' Hats, Scarves & Gloves Sets []


In [138]:
recs = []
for i in df_offshore.to_dict('records'):
    if i['query_classifications'] == "Apparel Accessories > Girls' Accessories > Girls' Hats,Apparel Accessories > Women's Hats,Apparel Accessories > Girls' Accessories > Girls' Hats, Scarves & Gloves Sets":
        i['query_classification_lists'] = [
            "Apparel Accessories > Girls' Accessories > Girls' Hats",
            "Apparel Accessories > Women's Hats",
            "Apparel Accessories > Girls' Accessories > Girls' Hats, Scarves & Gloves Sets"
        ]
    if i['query_classifications'] == "Apparel Accessories > Boys' Accessories > Boys' Hats,Apparel Accessories > Girls' Accessories > Girls' Hats,nan":
        i['query_classification_lists'] = [
            "Apparel Accessories > Boys' Accessories > Boys' Hats",
            "Apparel Accessories > Girls' Accessories > Girls' Hats",
            "nan"
        ]
    if i['query_classifications'] == "Apparel Accessories > Boys' Accessories > Boys' Hats,Apparel Accessories > Girls' Accessories > Girls' Hats,Apparel Accessories > Men's Hats":
        i['query_classification_lists'] = [
            "Apparel Accessories > Boys' Accessories > Boys' Hats",
            "Apparel Accessories > Girls' Accessories > Girls' Hats",
            "Apparel Accessories > Men's Hats"
        ]

    if i['query_classifications'] == "Apparel Accessories > Women's Hats > Women's Cowboy Hats,Apparel Accessories > Girls' Accessories > Girls' Hats,nan":
        i['query_classification_lists'] = [
            "Apparel Accessories > Women's Hats > Women's Cowboy Hats",
            "Apparel Accessories > Girls' Accessories > Girls' Hats",
            "nan"
        ]

    if i['query_classifications'] == "Apparel Accessories > Boys' Accessories > Boys' Hats,Apparel Accessories > Girls' Accessories > Girls' Hats,Apparel Accessories > Girls' Accessories > Girls' Hats, Scarves & Gloves Sets":
        i['query_classification_lists'] = [
            "Apparel Accessories > Boys' Accessories > Boys' Hats",
            "Apparel Accessories > Girls' Accessories > Girls' Hats",
            "Apparel Accessories > Girls' Accessories > Girls' Hats, Scarves & Gloves Sets"
        ]

    recs.append(i)
df_offshore = pd.DataFrame(recs)

In [139]:
path2id = {}
for i in df_tax.to_dict('records'):
    if len(i['category_path']) > 0:
        path2id[i['category_path'].lower().strip()] = i['id']
        path2id[i['category_path'].strip()] = i['id']

id2path = {}
for i in df_tax.to_dict('records'):
    id2path[i['id']] = i['category_path']

path2id['nan'] = -1
path2id['No Categories Match'] = -1
path2id['no_match'] = -1

In [140]:
df_offshore['query_classification_ids'] = df_offshore['query_classification_lists'].apply(lambda x: [path2id[i] for i in x])

In [141]:
appen_manual_correction = {
    "Women's Clothing > Weddings & Events > QuinceaÃ±era Dresses": "Women's Clothing > Weddings & Events > Quincea\u00f1era Dresses",
}

In [142]:
df_appen['query_classification_lists'] = df_appen['Final Answer'].apply(lambda x: x.split('\n'))
df_appen['query_classification_ids'] = df_appen['query_classification_lists'].apply(lambda x: \
    [path2id[appen_manual_correction[i] if i in appen_manual_correction else i] for i in x])

In [143]:
df_join = df_offshore[['query', 'query_classification_ids']].rename(columns={'query_classification_ids': 'offshore_query_classification_ids'}).merge( 
    df_appen[['query', 'query_classification_ids']].rename(columns={'query_classification_ids': 'appen_query_classification_ids'}), 
    on='query', how='inner'
)

In [144]:
len(df_appen), len(df_offshore), len(df_join)

(56188, 24095, 10563)

In [145]:
def convert_id_to_path(li):
    if set(li) == set([-1]):
        return []
    else:
        res = [] 
        for i in li:
            if int(i) in id2path:
                res.append(id2path[int(i)])
        return res

# export data

In [146]:
test_queries = set(df_offshore_test['query'])

In [147]:
len(test_queries)

4300

In [148]:
assert len(test_queries) == len(df_offshore_test)

In [149]:
df_offshore_nottest = df_offshore[df_offshore['query'].apply(lambda x: x not in test_queries)]

In [150]:
len(df_offshore_nottest)

19795

In [151]:
df_offshore_nottest = df_offshore_nottest[['label_ordering', 'query', 'sample_method', 'gmv', 'cnt', 'query_classifications', 'query_classification_lists', 'query_classification_ids']]

In [152]:
df_offshore_val = df_offshore_nottest.sample(5000, random_state=42)
val_queries = set(df_offshore_val['query'])
df_offshore_train = df_offshore_nottest[df_offshore_nottest['query'].apply(lambda x: x not in val_queries)]

In [153]:
df_offshore_val.to_json('./processed/Offshore_Labelled_Query_Classification_Val.json', lines=True, orient='records')

In [154]:
df_offshore_train.to_json('./processed/Offshore_Labelled_Query_Classification_Train_01112023.json', lines=True, orient='records')

In [155]:
train_queries = set(df_offshore_train['query'])

In [156]:
appen_queries = set(df_appen['query'])

In [157]:
len(set(df_offshore['query']).intersection(set(df_offshore_test['query'])))

4300

In [158]:
len(set(df_offshore_test['query']))

4300

In [159]:
len(appen_queries.intersection(set(df_offshore['query'])))

10563

In [160]:
offshore_queries = set(df_offshore['query'])

In [161]:
len(test_queries) + len(val_queries) + len(train_queries) == len(offshore_queries)

True

In [162]:
df_appen_notoverlap = df_appen[df_appen['query'].apply(lambda x: x not in offshore_queries)]

In [163]:
len(df_appen_notoverlap), len(df_appen)

(45625, 56188)

In [182]:
df_appen_notoverlap = df_appen_notoverlap[['label_ordering', 'query', 'sample_method', 'gmv', 'cnt', 'query_classification_lists', 'query_classification_ids']]

In [186]:
df_appen_notoverlap.query_classification_ids.apply(len).value_counts()

1    30413
2    11250
3     3950
4       12
Name: query_classification_ids, dtype: int64

In [188]:
df_appen_notoverlap.to_json('Appen_Labelled_Query_Classification_Train_NoOverlapOffshore_01112023.json', lines=True, orient='records')

# calculate performance

In [165]:
df_join['offshore_query_classification_lists'] = df_join['offshore_query_classification_ids'].apply(convert_id_to_path).apply(tuple)
df_join['appen_query_classification_lists'] = df_join['appen_query_classification_ids'].apply(convert_id_to_path).apply(tuple)

In [166]:
len(df_join)

10563

In [167]:
from sklearn.metrics import accuracy_score, classification_report

In [168]:
accuracy_score(
    y_true=df_join['offshore_query_classification_lists'].apply(lambda x: '|'.join(x)),
    y_pred=df_join['appen_query_classification_lists'].apply(lambda x: '|'.join(x))
)

0.28798636750923035

In [169]:
lab_offshore = np.zeros((len(df_join), len(path2id)))
lab_appen = np.zeros((len(df_join), len(path2id)))

for ind, i in enumerate(df_join.to_dict('records')):
    for j in i['offshore_query_classification_ids']:
        if j != -1:
            lab_offshore[ind][j-1] = 1. 
    for j in i['appen_query_classification_ids']:
        if j != -1:
            lab_appen[ind][j-1] = 1. 
    
df_metrics = pd.DataFrame(classification_report(y_true=lab_offshore, y_pred=lab_appen, zero_division=0, output_dict=True)).T
df_metrics

Unnamed: 0,precision,recall,f1-score,support
0,0.000000,0.000000,0.000000,0.0
1,0.000000,0.000000,0.000000,0.0
2,0.000000,0.000000,0.000000,1.0
3,0.500000,0.200000,0.285714,5.0
4,0.000000,0.000000,0.000000,3.0
...,...,...,...,...
12076,0.000000,0.000000,0.000000,0.0
micro avg,0.444352,0.355157,0.394779,18054.0
macro avg,0.114671,0.093631,0.093654,18054.0
weighted avg,0.542927,0.355157,0.390516,18054.0


In [170]:
df_join.offshore_query_classification_ids.apply(lambda x: len([i for i in x if i != -1])).value_counts()

1    5355
3    3048
2    1786
0     374
Name: offshore_query_classification_ids, dtype: int64

In [171]:
df_join.appen_query_classification_ids.apply(lambda x: len([i for i in x if i != -1])).value_counts()

1    6783
2    2522
3     865
0     391
4       2
Name: appen_query_classification_ids, dtype: int64

In [172]:
df_join[df_join.appen_query_classification_ids.apply(len) == 4].to_dict('records')

[{'query': 'mens gifts for birthday',
  'offshore_query_classification_ids': [5972, 4151, 1549],
  'appen_query_classification_ids': [4166, 40, 4741, 5972],
  'offshore_query_classification_lists': ("Watches > Men's Watches",
   "Luggage & Bags > Men's Bags > Wallets",
   'Consumer Electronics > Camera & Photo > Camera & Photo Accessories > Digital Photo Frames'),
  'appen_query_classification_lists': ("Men's Clothing",
   "Apparel Accessories > Men's Accessories",
   "Shoes > Men's Shoes",
   "Watches > Men's Watches")},
 {'query': 'bad bitch',
  'offshore_query_classification_ids': [2986, 3949, 1861],
  'appen_query_classification_ids': [5787, 6029, 6049, 4111],
  'offshore_query_classification_lists': ('Home & Garden > Kitchen,Dining & Bar > Coffeeware > Coffee Cups & Mugs',
   'Home Improvement > Lights & Lighting > Light Bulbs > Neon Bulbs & Tubes',
   'Education & Office Supplies > Books & Magazines > Life & Lifestyle'),
  'appen_query_classification_lists': ('Toys & Hobbies > Ho

In [173]:
df_all_metrics = []
for constraint in range(1,6):
    tmp = df_join.copy()
    tmp['offshore_query_classification_lists'] = tmp['offshore_query_classification_lists'].apply(
        lambda x: tuple([' > '.join(i.split(' > ')[:constraint]) for i in x]))
    tmp['appen_query_classification_lists'] = tmp['appen_query_classification_lists'].apply(
        lambda x: tuple([' > '.join(i.split(' > ')[:constraint]) for i in x]))
    tmp['offshore_query_classification_ids'] = tmp['offshore_query_classification_lists'].apply(lambda x: [path2id[i] for i in x])
    tmp['appen_query_classification_ids'] = tmp['appen_query_classification_lists'].apply(lambda x: [path2id[i] for i in x])
    
    lab_offshore = np.zeros((len(tmp), max(df_tax['id'])))
    lab_appen = np.zeros((len(tmp), max(df_tax['id'])))
    for ind, i in enumerate(tmp.to_dict('records')):
        for j in i['offshore_query_classification_ids']:
            if j != -1:
                lab_offshore[ind][j-1] = 1. 
        for j in i['appen_query_classification_ids']:
            if j != -1:
                lab_appen[ind][j-1] = 1. 
    
    df_metrics_tmp = pd.DataFrame(classification_report(y_true=lab_offshore, y_pred=lab_appen, zero_division=0, output_dict=True)).T
    df_metrics_tmp['depth_constraint'] = constraint
    df_all_metrics.append(df_metrics_tmp)
df_all_metrics = pd.concat(df_all_metrics)

In [174]:
df_all_metrics[df_all_metrics.index == 'weighted avg']

Unnamed: 0,precision,recall,f1-score,support,depth_constraint
weighted avg,0.763391,0.72841,0.742785,12390.0,1
weighted avg,0.656837,0.580877,0.601527,14349.0,2
weighted avg,0.581831,0.41966,0.452803,16897.0,3
weighted avg,0.543784,0.355792,0.391239,18033.0,4
weighted avg,0.542927,0.355157,0.390516,18054.0,5
