In [1]:
import os
import sys
sys.path.append(os.getcwd() + '/..')
sys.path.append(os.getcwd() + '/../..')
import re
import types
from tqdm.notebook import tqdm
from typing import List, Union, Tuple, Callable
import pandas as pd
import numpy as np
import owlready2 as o2
from utils import breadcrumb as bc

In [2]:
# Read the taxonomy dump. The encoding is pinned (file assumed to be UTF-8,
# optionally with a BOM) so parsing does not depend on the platform default.
with open('./taxonomy-with-ids.en-US.txt', encoding='utf-8-sig') as inf:
    # Keep only candidate record lines: blank lines and '#' comment lines cannot
    # match either regex below and would break the parsing step that follows.
    # Each kept line is normalised to end with exactly one '\n', so the final
    # line of a file without a trailing newline still matches `rePath`.
    f = [
        line.rstrip('\r\n') + '\n'
        for line in inf
        if line.strip() and not line.lstrip().startswith('#')
    ]

# Captures the leading numeric category id of a "<id> - <path>" line.
reID = re.compile(r'^(\d+) -.*$')
# Captures everything between "<id> - " and the terminating newline.
rePath = re.compile(r'^\d+ - (.*)\n$')


In [3]:
# Parse every taxonomy line into parallel columns and build label <-> id lookups.
label_id_map = {}   # last path segment (label) -> category id
id_label_map = {}   # category id -> label
categories = {'catid':[],'strpath':[],'label':[]}
for line in f:
    id_match = reID.match(line)
    # `rePath` requires a trailing newline; the last line of a file may lack one.
    path_match = rePath.match(line if line.endswith('\n') else line + '\n')
    if id_match is None or path_match is None:
        # Not a "<id> - <path>" record (e.g. a blank or comment line): skip it
        # rather than failing with IndexError on an empty match result.
        continue
    catid = int(id_match.group(1))
    strpath = path_match.group(1).split(' > ')
    label = strpath[-1]
    categories['catid'].append(catid)
    categories['strpath'].append(strpath)
    categories['label'].append(label)
    # NOTE: if two categories share the same label, the later line overwrites
    # the earlier id in `label_id_map`.
    label_id_map[label] = catid
    id_label_map[catid] = label


In [4]:
def label_2_catid(label):
    """Return the category id stored for ``label`` in ``label_id_map``.

    Raises ``KeyError`` if the label has not been registered.
    """
    catid = label_id_map[label]
    return catid

def catid_2_label(catid):
    """Return the label stored for ``catid`` in ``id_label_map``.

    Raises ``KeyError`` if the id has not been registered.
    """
    label = id_label_map[catid]
    return label

def strpath_2_idpath(path):
    """Translate a sequence of category labels into the list of their ids.

    Each label is resolved with ``label_2_catid``; order is preserved.
    """
    return [label_2_catid(label) for label in path]

In [5]:
# Resolve every label path to its id path, then assemble the table keyed by catid.
categories['path'] = [strpath_2_idpath(labels) for labels in categories['strpath']]
df = pd.DataFrame(categories).set_index('catid')
df

Unnamed: 0_level_0,strpath,label,path
catid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,[Animals & Pet Supplies],Animals & Pet Supplies,[1]
3237,"[Animals & Pet Supplies, Live Animals]",Live Animals,"[1, 3237]"
2,"[Animals & Pet Supplies, Pet Supplies]",Pet Supplies,"[1, 2]"
3,"[Animals & Pet Supplies, Pet Supplies, Bird Su...",Bird Supplies,"[1, 2, 3]"
7385,"[Animals & Pet Supplies, Pet Supplies, Bird Su...",Bird Cage Accessories,"[1, 2, 3, 7385]"
...,...,...,...
3540,"[Vehicles & Parts, Vehicles, Watercraft]",Watercraft,"[888, 5614, 3540]"
3095,"[Vehicles & Parts, Vehicles, Watercraft, Motor...",Motor Boats,"[888, 5614, 3540, 3095]"
1130,"[Vehicles & Parts, Vehicles, Watercraft, Perso...",Personal Watercraft,"[888, 5614, 3540, 1130]"
3087,"[Vehicles & Parts, Vehicles, Watercraft, Sailb...",Sailboats,"[888, 5614, 3540, 3087]"


In [7]:
# IRI under which the generated taxonomy ontology is registered.
ONTOLOGY_IRI = "http://ebay.com/google_pt_taxonomy/20231030.owl"
ontology = o2.get_ontology(ONTOLOGY_IRI)

def create_new_class(data, onto, catid):
    """Create the class for category ``catid`` inside ``onto`` and return it.

    Parameters
    ----------
    data : pandas.DataFrame
        Table indexed by category id with the columns ``label`` (category
        name), ``strpath`` (list of labels from root to this category) and
        ``path`` (list of ids, position-aligned with ``strpath``).
    onto
        Ontology used both as a context manager and as a name lookup
        (``onto[str(id)]``). The lookup is assumed to return a falsy value
        when no entity with that name exists yet.
    catid
        Index label of the row in ``data`` to materialise.

    Returns
    -------
    The newly created class, named ``str(catid)``, whose ``label``, ``catid``
    and ``path`` attributes are set from the row. ``path`` is rendered as
    ``"Label (id) > Label (id) > ..."``.

    A missing parent class is created first by recursing on the parent id;
    root categories (id path of length 1) are placed directly under
    ``o2.Thing``.
    """
    category = data.loc[catid]
    catid = category.name
    label = category['label']
    path = category['path']
    # Pair each label with the id at the same position of the row's own id
    # path, so the rendering depends only on `data` and not on any
    # notebook-global label lookup.
    strpath = ' > '.join(
        f'{name} ({node_id})' for name, node_id in zip(category['strpath'], path)
    )
    with onto:
        if len(path) == 1:
            parentclass = o2.Thing
        else:
            parent = path[-2]
            parentclass = onto[str(parent)]
            # Only this branch has a `parent` id to create on demand.
            if not parentclass:
                parentclass = create_new_class(data, onto, parent)
        NewClass = types.new_class(str(catid), (parentclass,))
        NewClass.label = str(label)
        NewClass.catid = int(catid)
        NewClass.path = strpath
    return NewClass

# Declare, inside `ontology`, the two annotation properties whose names match
# the attributes create_new_class() assigns on every generated class
# (`NewClass.catid` and `NewClass.path`).
# NOTE(review): the class name `catid` rebinds the module-level `catid`
# variable left behind by the taxonomy-parsing loop.
with ontology:
    class catid(o2.AnnotationProperty):
        pass

    class path(o2.AnnotationProperty):
        pass

# Materialise one class per taxonomy row. Ids that already resolve in the
# ontology are skipped (create_new_class may have created a parent earlier
# through its recursion).
with tqdm(total=len(df)) as progress:
    for cat_id in df.index:
        already_defined = ontology[str(cat_id)]
        if not already_defined:
            create_new_class(df, ontology, cat_id)
        progress.update(1)

  0%|          | 0/5595 [00:00<?, ?it/s]

In [9]:
# Write the populated ontology to disk next to the notebook.
OWL_OUTPUT_PATH = "./google_pt_taxonomy_1030.owl"
ontology.save(file=OWL_OUTPUT_PATH)

In [2]:
# Load the tab-separated structure export, then stack both friendly-name exports.
df = pd.read_csv('./2023_May_SR_Structures.txt', sep='\t')
friendly_name_files = [
    './Category_Friendly_Name_Full_Data_data.csv',
    './Category_Friendly_Name_data (1).csv',
]
cfn = pd.concat([pd.read_csv(csv_path) for csv_path in friendly_name_files])

In [3]:
# Keep rows whose `site` is 0 or 100, reduced to the breadcrumb and indexed by 'Cat ID'.
site_mask = df['site'].isin([0,100])
df_usmo = df.loc[site_mask, ['Cat ID','Breadcrumb']].set_index('Cat ID')

In [4]:
# Reduce the friendly-name table to the name column, indexed by 'Cat ID'.
cfn = cfn.loc[:, ['Cat ID','Category Friendly Name']].set_index('Cat ID')

In [7]:
# Align breadcrumbs and friendly names on 'Cat ID', drop rows missing either
# value, then derive 'Path' from each breadcrumb via bc.normalise_breadcrumb.
data = (
    pd.concat([df_usmo, cfn], axis=1)
    .dropna()
    .assign(
        Path=lambda frame: frame['Breadcrumb'].apply(
            lambda crumb: bc.normalise_breadcrumb(crumb, link='id')
        )
    )
)

In [8]:
# Expose the friendly name under the shorter column name 'Label'.
data = data.rename({'Category Friendly Name': 'Label'}, axis='columns')

In [12]:
# Export everything except the raw breadcrumb column.
data.drop(columns='Breadcrumb').to_csv('./Cat_data.csv')