In [46]:
import ast
import json

import pandas as pd

In [47]:
# Attribute ontology; later cells read its 'wish_L2', 'attribute_name' and
# 'example_attribute_value' columns.
attribute_ontology = pd.read_csv('[ontology] wish_top25L2_attributes - 20221219.csv')
# Sample products; later cells read 'title', 'description', 'category_path' and 'L2_path'.
example_products = pd.read_csv('[data] appen_data_tester_size_250_stratified_by_l2_20221229.csv')

In [48]:
# Distinct L2 category paths that the attribute ontology covers.
l2set = {l2_path for l2_path in attribute_ontology['wish_L2']}

In [49]:
# Taxonomy dump read as JSON Lines (one object per line); later cells use its
# 'id' and 'category_path' fields.
df_tax = pd.read_json('../taxonomy/wish_newtax.json', lines=True)

In [50]:
# Map taxonomy id -> category path, leaving out nodes whose path is empty.
taxid2path = {
    node['id']: node['category_path']
    for node in df_tax.to_dict('records')
    if len(node['category_path']) > 0
}

In [51]:
# Ids of taxonomy nodes that are one of the ontology's L2 categories or live
# underneath one. The match is done on whole path segments (the L2 path itself,
# or the L2 path followed by the ' > ' separator) so that an L2 name which is
# merely a textual prefix of a sibling category's name does not pull that
# sibling in. This agrees with how the L2 is derived from a path further down
# (first two ' > '-separated segments).
l2_prefixes = tuple(f'{l2_path} > ' for l2_path in l2set)
taxpathidset = set(
    df_tax.loc[
        df_tax['category_path'].apply(
            lambda path: path in l2set or path.startswith(l2_prefixes)
        ),
        'id',
    ]
)

In [52]:
# SQL predicate selecting rows whose comma-separated `categories` column
# contains at least one of the selected taxonomy ids as a whole element.
# `(^|,)` / `(,|$)` anchor the id on both sides, which also covers a
# `categories` value that consists of a single id (the previous
# first/middle/last alternation never matched that case). Ids are sorted so
# the generated SQL is stable between runs.
id_alternation = '|'.join(str(num) for num in sorted(taxpathidset))
l2filter = "regexp_like(categories, '(^|,)({})(,|$)')".format(id_alternation)

In [53]:
from tahoe import execute_async
# Fetch up to 100 queries that contain at least two spaces and whose
# `categories` column satisfies the `l2filter` predicate built above.
# NOTE(review): model_version and taxonomy_version are hard-coded -- confirm
# they correspond to the taxonomy file loaded into df_tax.
q = f"""
SELECT query, categories FROM structured_data.query_top3_predictions
WHERE model_version = 1 AND taxonomy_version = 121 AND regexp_count(query, ' ') >= 2
AND {l2filter}
LIMIT 100
"""

In [54]:
# Run the query and wrap the returned rows in a DataFrame.
# assumes execute_async returns rows shaped (query, categories), in that order -- TODO confirm
res = execute_async(q)
example_queries = pd.DataFrame(res, columns=['query', 'categories'])

In [55]:
from langchain.llms import OpenAI
# Completion model shared by every extraction and normalization call below.
llm = OpenAI(model_name='text-davinci-003', temperature=0)

In [56]:
from langchain.chains import LLMChain

In [57]:
from langchain.prompts import PromptTemplate

# Extraction prompt for a search query. Placeholders:
#   {query}           - the raw query text
#   {taxonomy}        - category path the query is assumed to target
#   {attribute_types} - comma-separated attribute names the model may use
template = """Sentence: \"\"\"query: {query}\"\"\"
Instruction: given the above user query on an E-commerce site, which intends to search products that belong to {taxonomy}, please extract entities and their types from the input sentence, all entity types are in options

Options: {attribute_types}

Entities (only for options specified above, formatted as json that can be parsed, ordered in the same way):
"""

prompt_query = PromptTemplate(
    input_variables=["query", "taxonomy", "attribute_types"],
    template=template,
)

# Extraction prompt for a product listing. Placeholders:
#   {product_title}, {product_description} - the listing text
#   {taxonomy}        - category path the product belongs to
#   {attribute_types} - comma-separated attribute names the model may use
# Note: this rebinds `template`; only the PromptTemplate objects are used later.
template = """Sentence: \"\"\"product title: {product_title}
product description: {product_description}
\"\"\"
Instruction: given the above product information which belongs to {taxonomy}, please extract entities and their types from the input sentence, all entity types are in options

Options: {attribute_types}

Entities (only for options specified above, formatted as json that can be parsed, ordered in the same way):
"""

prompt_product = PromptTemplate(
    input_variables=["product_title", "product_description", "taxonomy", "attribute_types"],
    template=template,
)

# Follow-up prompt for the normalization step. Placeholders:
#   {previous_text}      - the extraction prompt followed by the extracted JSON
#   {specification_json} - JSON mapping attribute name -> recommended values
# The instruction text is sent to the model verbatim, so its wording matters:
# the typos "enties" and "list of of" have been corrected.
template = """{previous_text}

Now normalize above extracted entities, given the following specification that contains a list of recommended normalized values for each entity type. If possible, please choose from them, but in rare cases where you have to, you can create new normalized value following similar style and semantics:

Specification:

{specification_json}

Normalized Entities:
"""

prompt_normalize = PromptTemplate(
    input_variables=["previous_text", "specification_json"],
    template=template,
)

# Playground: render the prompts and sanity-check the LLM

In [58]:
# Render the query prompt with a hand-written example to eyeball the final text.
print(prompt_query.format(
    query="sun zero madison room darkening grommet curtain panel, 54 quot;x84 quot;, marine", 
    taxonomy="Home & Garden > Home Textile > Window Treatments > Curtains",
    attribute_types="Care Instructions, Material Type, Season of the Product, Color, Pattern, Size, Style Name, Base Type, Finish Types, Item Firmness Description, Top Style, closure_type, Fabric Warmth Description, Door Orientation, Light Source Type, Target Audience, Back Material Type, Construction Type, Weave Type"
))

Sentence: """query: sun zero madison room darkening grommet curtain panel, 54 quot;x84 quot;, marine"""
Instruction: given the above user query on an E-commerce site, which intends to search products that belong to Home & Garden > Home Textile > Window Treatments > Curtains, please extract entities and their types from the input sentence, all entity types are in options

Options: Care Instructions, Material Type, Season of the Product, Color, Pattern, Size, Style Name, Base Type, Finish Types, Item Firmness Description, Top Style, closure_type, Fabric Warmth Description, Door Orientation, Light Source Type, Target Audience, Back Material Type, Construction Type, Weave Type

Entities (only for options specified above, formatted as json that can be parsed, ordered in the same way):



In [59]:
# Render the product prompt with a hand-written example (raw HTML entities and
# tags are passed through to the prompt unchanged).
print(prompt_product.format(
    product_title="Liqui Moly LIQ-20552 Marine Diesel Additive&#44; 1 ltr,", 
    product_description="<p>Acombination of additives that clean and maintain diesel fuel systems&#46; Prevents corrosion and increases lubrication thus reducing fuel pump and injector wear&#46; Increases Cetane value for improved engine performance, easier starting and lower emissions&#46; Regular use keeps the fuel system in perfect working order&#46; </p><b>Features</b>. Boosts the cetane number. Keeps the fuel system clean. Optimizes engine performance. Keeps injection nozzles clean. Low fuel consumption. High wear resistance. Prevents the build&#45;up of deposits. Increases the lubricating effect. Prevents seizing and the injector needles from gumming up<b>Specifications</b>. <b>Capacity&#58;</b> 1 ltr. <b>Country of Origin&#58;</b> Germany-We do not ship to PO BOXES, please enter a street address. We only ship to the contiguous 48 States.",
    taxonomy="Automobiles & Motorcycles > Auto Replacement Parts > Lubrication System > Oil Additive",
    attribute_types="Additional Features, Light Source Type, Style Name, Shape, Finish Types, Specific Uses For Product, Mount Type, Control Method, Water Resistance Level, Colour Map, Power Source, Included Components, Material Type, Pattern, Handle Lever Placement, System of Measurement, Condition, Hand Orientation"
))

Sentence: """product title: Liqui Moly LIQ-20552 Marine Diesel Additive&#44; 1 ltr,
product description: <p>Acombination of additives that clean and maintain diesel fuel systems&#46; Prevents corrosion and increases lubrication thus reducing fuel pump and injector wear&#46; Increases Cetane value for improved engine performance, easier starting and lower emissions&#46; Regular use keeps the fuel system in perfect working order&#46; </p><b>Features</b>. Boosts the cetane number. Keeps the fuel system clean. Optimizes engine performance. Keeps injection nozzles clean. Low fuel consumption. High wear resistance. Prevents the build&#45;up of deposits. Increases the lubricating effect. Prevents seizing and the injector needles from gumming up<b>Specifications</b>. <b>Capacity&#58;</b> 1 ltr. <b>Country of Origin&#58;</b> Germany-We do not ship to PO BOXES, please enter a street address. We only ship to the contiguous 48 States.
"""
Instruction: given the above product information which belo

In [60]:
# Smoke test: confirm the LLM client is reachable and returns a completion.
text = "1 + 1 ="
print(llm(text))

 2

2 + 2 = 4


In [61]:
# Single-step extraction on the example query: build the prompt, then ask the LLM.
prompt_query_text = prompt_query.format(
    query="sun zero madison room darkening grommet curtain panel, 54 quot;x84 quot;, marine", 
    taxonomy="Home & Garden > Home Textile > Window Treatments > Curtains",
    attribute_types="Care Instructions, Material Type, Season of the Product, Color, Pattern, Size, Style Name, Base Type, Finish Types, Item Firmness Description, Top Style, closure_type, Fabric Warmth Description, Door Orientation, Light Source Type, Target Audience, Back Material Type, Construction Type, Weave Type"
)
# Raw completion text; expected to be a JSON object (parsed in the next cell).
query_attr_extract_json = llm(prompt_query_text)

In [62]:
# Parse the completion; raises json.JSONDecodeError if the model did not return valid JSON.
json.loads(query_attr_extract_json)

{'Size': '54 quot;x84 quot;',
 'Color': 'Marine',
 'Style Name': 'Madison Room Darkening Grommet Curtain Panel'}

In [63]:
# Single-step extraction on the example product: build the prompt, then ask the LLM.
prompt_product_text = prompt_product.format(
    product_title="Liqui Moly LIQ-20552 Marine Diesel Additive&#44; 1 ltr,", 
    product_description="<p>Acombination of additives that clean and maintain diesel fuel systems&#46; Prevents corrosion and increases lubrication thus reducing fuel pump and injector wear&#46; Increases Cetane value for improved engine performance, easier starting and lower emissions&#46; Regular use keeps the fuel system in perfect working order&#46; </p><b>Features</b>. Boosts the cetane number. Keeps the fuel system clean. Optimizes engine performance. Keeps injection nozzles clean. Low fuel consumption. High wear resistance. Prevents the build&#45;up of deposits. Increases the lubricating effect. Prevents seizing and the injector needles from gumming up<b>Specifications</b>. <b>Capacity&#58;</b> 1 ltr. <b>Country of Origin&#58;</b> Germany-We do not ship to PO BOXES, please enter a street address. We only ship to the contiguous 48 States.",
    taxonomy="Automobiles & Motorcycles > Auto Replacement Parts > Lubrication System > Oil Additive",
    attribute_types="Additional Features, Light Source Type, Style Name, Shape, Finish Types, Specific Uses For Product, Mount Type, Control Method, Water Resistance Level, Colour Map, Power Source, Included Components, Material Type, Pattern, Handle Lever Placement, System of Measurement, Condition, Hand Orientation"
)
# Raw completion text; expected to be a JSON object (parsed in the next cell).
product_attr_extract_json = llm(prompt_product_text)

In [64]:
# Parse the completion. Nothing here restricts the keys to the offered options,
# so the parsed dict may contain attribute names the prompt did not list.
json.loads(product_attr_extract_json)

{'Additional Features': ['Boosts the cetane number',
  'Keeps the fuel system clean',
  'Optimizes engine performance',
  'Keeps injection nozzles clean',
  'Low fuel consumption',
  'High wear resistance',
  'Prevents the build-up of deposits',
  'Increases the lubricating effect',
  'Prevents seizing and the injector needles from gumming up'],
 'Capacity': '1 ltr',
 'Country of Origin': 'Germany'}

# End-to-end example for a product (two-step inference)

In [65]:
# Pick one product as a plain dict; random_state pins the sample so reruns pick the same row.
product_dict = example_products.sample(1, random_state=42).to_dict('records')[0]

In [66]:
# Display the sampled product record.
product_dict

{'product_id': '60894a9a475a16cedcf440c0',
 'title': 'Vintage Vivid 925 Sterling Silver Python Snake Pendant Necklace Nice Gift new',
 'description': '100% Brand New and High Quality\n\nMaterial: 925 Silver\n\nColor: Silver\n\nSize: 4.2 * 2.6cm (1cm=0.39inch)\n\nPackage Included: 1pcs Snake Pendant',
 'category_id': 4112,
 'category_path': 'Jewelry & Accessories > Necklaces & Pendants > Pendants',
 'L2_path': 'Jewelry & Accessories > Necklaces & Pendants',
 'temp_img_url': 'https://contestimg.wish.com/api/webimage/60894a9a475a16cedcf440c0-large.jpg'}

In [67]:
# Ontology rows that apply to this product's L2 category.
attribute_ontology_l2 = attribute_ontology[attribute_ontology['wish_L2'] == product_dict['L2_path']]

In [68]:
# Fail fast when the ontology has no attributes for this L2 category.
assert len(attribute_ontology_l2) > 0

In [69]:
# Inputs for the product prompt, taken from the sampled record and the L2 ontology slice.
product_title = product_dict["title"] 
product_description = product_dict["description"] 
taxonomy = product_dict["category_path"]
# Attribute names offered to the model, both as a list (for filtering the
# reply) and as the comma-separated string inserted into the prompt.
attribute_types_list = attribute_ontology_l2['attribute_name'].tolist()
attribute_types = ", ".join(attribute_types_list)

In [70]:
# First LLM call: extract attribute values for the sampled product.
prompt_product_text = prompt_product.format(
    product_title=product_title, 
    product_description=product_description,
    taxonomy=taxonomy,
    attribute_types=attribute_types
)
product_attr_extract_json = llm(prompt_product_text)

In [71]:
# Parse the completion; raises json.JSONDecodeError if it is not valid JSON.
product_attr_extract_dict = json.loads(product_attr_extract_json)

In [72]:
# Keep only attributes that were offered in the prompt and have a non-null value.
product_attr_extract_dict_clean = {
    attr_name: attr_value
    for attr_name, attr_value in product_attr_extract_dict.items()
    if attr_name in attribute_types_list and attr_value is not None
}

In [73]:
# Re-serialize the filtered extraction; this text is appended to the prompt for the second call.
product_attr_extract_json_clean = json.dumps(product_attr_extract_dict_clean, indent=2)

In [74]:
# Recommended values for each attribute that survived the filter above. The
# values are copied as stored in the 'example_attribute_value' column.
specification = {
    ontology_row['attribute_name']: ontology_row['example_attribute_value']
    for ontology_row in attribute_ontology_l2.to_dict('records')
    if ontology_row['attribute_name'] in product_attr_extract_dict_clean
}

In [75]:
# JSON text of the specification, inserted into the normalization prompt.
specification_json = json.dumps(specification, indent=2)

In [76]:
# Second LLM call: replay the extraction prompt plus its (filtered) answer and
# ask the model to normalize the values against the specification.
prompt_normalize_text = prompt_normalize.format(
    previous_text=prompt_product_text + '\n' + product_attr_extract_json_clean,
    specification_json=specification_json
)
product_normalize_json = llm(prompt_normalize_text)

In [77]:
# Display the filtered first-pass extraction, for comparison with the normalized result below.
product_attr_extract_dict_clean

{'Condition': 'New',
 'Department': 'Jewelry & Accessories > Necklaces & Pendants > Pendants',
 'Material Type': '925 Silver',
 'Metal Type': 'Sterling Silver',
 'Size Map': '4.2 * 2.6cm (1cm=0.39inch)',
 'Stone Color': 'Silver',
 'Stone Shape': 'Python Snake'}

In [78]:
# Parse and display the normalized attributes returned by the second call.
json.loads(product_normalize_json)

{'Condition': 'New',
 'Department': 'Jewelry & Accessories > Necklaces & Pendants > Pendants',
 'Material Type': '925 Silver',
 'Metal Type': 'Sterling Silver',
 'Size Map': 'Small',
 'Stone Color': 'Silver',
 'Stone Shape': 'Python Snake'}

## Putting it all together

In [100]:
def zero_shot_attribute_extraction_product_helper(product_title, product_description, taxonomy, l2):
    """Extract attributes from one product, then normalize them, using two LLM calls.

    The first call asks the model for entities restricted to the attribute
    names the ontology lists for ``l2``. Only non-empty string values that
    occur verbatim (case-insensitively) in the title/description are kept.
    The second call asks the model to normalize the kept values against the
    ontology's example values. Normalized values the ontology does not contain
    yet are appended, in place, to the module-level ``attribute_ontology``
    frame (stored as the ``str()`` of a Python list, like the existing cells).

    Args:
        product_title: Product title. ``None``/NaN is treated as empty text;
            other non-strings are converted with ``str()``.
        product_description: Product description; same handling as the title.
        taxonomy: Full category path inserted into the prompt.
        l2: L2 category path used to select the ontology rows.

    Returns:
        ``(extracted, normalized)``: the filtered first-pass extraction and the
        normalization reply exactly as parsed from the model. When nothing
        survives the first-pass filter the second call is skipped and both
        dicts are empty.

    Raises:
        AssertionError: if the ontology has no rows for ``l2``.
        json.JSONDecodeError: if a model reply is not valid JSON.
    """
    def _as_text(value):
        # A missing CSV cell arrives as None/NaN; NaN is the only float that
        # differs from itself.
        if isinstance(value, str):
            return value
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    product_title = _as_text(product_title)
    product_description = _as_text(product_description)

    # prepare inputs
    l2_mask = attribute_ontology['wish_L2'] == l2
    attribute_ontology_l2 = attribute_ontology[l2_mask]
    assert len(attribute_ontology_l2) > 0
    attribute_types_list = attribute_ontology_l2['attribute_name'].tolist()
    attribute_types = ", ".join(attribute_types_list)

    prompt_product_text = prompt_product.format(
        product_title=product_title,
        product_description=product_description,
        taxonomy=taxonomy,
        attribute_types=attribute_types
    )

    # Lower-cased once; every extracted value is checked against it.
    all_text_lower = (product_title + "\n" + product_description).lower()

    def _occurs_in_text(value):
        # Guards against hallucinated values and against non-string items
        # (numbers, nulls, nested objects) in the model's JSON.
        return isinstance(value, str) and len(value) > 0 and value.lower() in all_text_lower

    # first call: extract
    product_attr_extract_dict = json.loads(llm(prompt_product_text))
    product_attr_extract_dict_clean = {}
    for name, value in product_attr_extract_dict.items():
        if name not in attribute_types_list:
            continue
        if isinstance(value, list):
            kept = [item for item in value if _occurs_in_text(item)]
            if kept:
                product_attr_extract_dict_clean[name] = kept
        elif _occurs_in_text(value):
            product_attr_extract_dict_clean[name] = value

    # Nothing to normalize: skip the second (paid) call.
    if not product_attr_extract_dict_clean:
        return product_attr_extract_dict_clean, {}

    # second call: normalize them
    specification = {
        row['attribute_name']: row['example_attribute_value']
        for row in attribute_ontology_l2.to_dict('records')
        if row['attribute_name'] in product_attr_extract_dict_clean
    }
    prompt_normalize_text = prompt_normalize.format(
        previous_text=prompt_product_text + '\n' + json.dumps(product_attr_extract_dict_clean, indent=2),
        specification_json=json.dumps(specification, indent=2)
    )
    product_attr_extract_dict_clean_normalized = json.loads(llm(prompt_normalize_text))

    # update ontology with newly discovered normalized values
    for name, value in product_attr_extract_dict_clean_normalized.items():
        row_mask = l2_mask & (attribute_ontology['attribute_name'] == name)
        if not row_mask.any():
            # The model returned an attribute the ontology does not define for this L2.
            continue
        # The cell holds the repr of a list; literal_eval parses it without
        # executing arbitrary code from the CSV.
        known_values = list(ast.literal_eval(
            attribute_ontology.loc[row_mask, 'example_attribute_value'].iloc[0]
        ))
        known_lower = {known.lower() for known in known_values if isinstance(known, str)}
        candidates = value if isinstance(value, list) else [value]
        additions = []
        for candidate in candidates:
            if isinstance(candidate, str) and len(candidate) > 0 and candidate.lower() not in known_lower:
                additions.append(candidate)
                known_lower.add(candidate.lower())
        if additions:
            attribute_ontology.loc[row_mask, 'example_attribute_value'] = str(known_values + additions)

    # NOTE(review): the normalized dict is returned exactly as the model
    # produced it; it is not restricted to values/attributes known to the
    # ontology. Confirm whether callers expect a filtered result instead.
    return product_attr_extract_dict_clean, product_attr_extract_dict_clean_normalized

In [101]:
def zero_shot_attribute_extraction_product(product_dict):
    """Run the two-step product extraction on a product record.

    Reads the ``title``, ``description``, ``category_path`` and ``L2_path``
    keys of ``product_dict`` and forwards them to
    ``zero_shot_attribute_extraction_product_helper``, whose result is
    returned unchanged.
    """
    helper_kwargs = {
        "product_title": product_dict["title"],
        "product_description": product_dict["description"],
        "taxonomy": product_dict["category_path"],
        "l2": product_dict["L2_path"],
    }
    return zero_shot_attribute_extraction_product_helper(**helper_kwargs)

In [94]:
# Pick another product to exercise the packaged function.
# NOTE(review): no random_state is passed here, so a rerun samples a different
# product than the one shown in the saved output.
product_dict = example_products.sample(1).to_dict('records')[0]

In [95]:
# Display the sampled product record.
product_dict

{'product_id': '5e67a273b6736d87bf89cd8b',
 'title': 'Fashion Bohemian Maxi Necklace Women Double Layer Beads Chain Resin Gem Vintage Statement Choker Necklace & Pendant Jewellery',
 'description': 'Item Type: Necklaces\nFine or Fashion: Fashion\nStyle: Vintage\nMaterial: Acrylic\nGender: Women\nMetals Type: Zinc Alloy\nPendant Size: picture\nNecklace Type: Chokers Necklaces\nShape\\pattern: Geometric\nChain Type: Link Chain',
 'category_id': 4110,
 'category_path': 'Jewelry & Accessories > Necklaces & Pendants > Choker Necklaces',
 'L2_path': 'Jewelry & Accessories > Necklaces & Pendants',
 'temp_img_url': 'https://contestimg.wish.com/api/webimage/5e67a273b6736d87bf89cd8b-large.jpg'}

In [102]:
# Run the packaged two-step pipeline; displays (extracted, normalized).
zero_shot_attribute_extraction_product(product_dict)

({'Chain Type': 'Link Chain',
  'Material Type': 'Acrylic',
  'Metal Type': 'Zinc Alloy'},
 {'Chain Type': 'Link Chain',
  'Material Type': 'Acrylic',
  'Metal Type': 'Zinc Alloy'})

# End-to-end example for a query (two-step inference)

In [40]:
# Pick one query record as a plain dict.
# NOTE(review): no random_state is passed, so reruns sample a different query.
query_dict = example_queries.sample(1).to_dict('records')[0]

In [41]:
# Resolve each predicted category id to its taxonomy path, for display only.
# Raises KeyError for an id that is not a key of taxid2path.
query_dict['category_paths'] = [taxid2path[int(i)] for i in query_dict['categories'].split(',')]

In [42]:
def zero_shot_attribute_extraction_query_helper(query, taxonomy, l2):
    """Extract attributes from one search query, then normalize them, using two LLM calls.

    This is the function ``zero_shot_attribute_extraction_query`` calls. It
    must not be named ``zero_shot_attribute_extraction_product_helper``:
    that name belongs to the product variant, which takes different arguments.

    The first call asks the model for entities restricted to the attribute
    names the ontology lists for ``l2``. Only non-empty string values that
    occur verbatim (case-insensitively) in the query are kept. The second call
    asks the model to normalize the kept values against the ontology's example
    values. Normalized values the ontology does not contain yet are appended,
    in place, to the module-level ``attribute_ontology`` frame (stored as the
    ``str()`` of a Python list, like the existing cells).

    Args:
        query: The raw search query text.
        taxonomy: Full category path inserted into the prompt.
        l2: L2 category path used to select the ontology rows.

    Returns:
        ``(extracted, normalized, specification)``: the filtered first-pass
        extraction, the normalization reply exactly as parsed from the model,
        and the attribute -> example-values mapping that was sent with the
        second prompt. When nothing survives the first-pass filter the second
        call is skipped and all three dicts are empty.

    Raises:
        AssertionError: if the ontology has no rows for ``l2``.
        json.JSONDecodeError: if a model reply is not valid JSON.
    """
    # prepare inputs
    l2_mask = attribute_ontology['wish_L2'] == l2
    attribute_ontology_l2 = attribute_ontology[l2_mask]
    assert len(attribute_ontology_l2) > 0
    attribute_types_list = attribute_ontology_l2['attribute_name'].tolist()
    attribute_types = ", ".join(attribute_types_list)

    prompt_query_text = prompt_query.format(
        query=query,
        taxonomy=taxonomy,
        attribute_types=attribute_types
    )

    # Lower-cased once; every extracted value is checked against it.
    query_lower = query.lower()

    def _occurs_in_query(value):
        # Guards against hallucinated values and against non-string items
        # (numbers, nulls, nested objects) in the model's JSON.
        return isinstance(value, str) and len(value) > 0 and value.lower() in query_lower

    # first call: extract
    query_attr_extract_dict = json.loads(llm(prompt_query_text))
    query_attr_extract_dict_clean = {}
    for name, value in query_attr_extract_dict.items():
        if name not in attribute_types_list:
            continue
        if isinstance(value, list):
            kept = [item for item in value if _occurs_in_query(item)]
            if kept:
                query_attr_extract_dict_clean[name] = kept
        elif _occurs_in_query(value):
            query_attr_extract_dict_clean[name] = value

    # Nothing to normalize: skip the second (paid) call.
    if not query_attr_extract_dict_clean:
        return query_attr_extract_dict_clean, {}, {}

    # second call: normalize them
    specification = {
        row['attribute_name']: row['example_attribute_value']
        for row in attribute_ontology_l2.to_dict('records')
        if row['attribute_name'] in query_attr_extract_dict_clean
    }
    prompt_normalize_text = prompt_normalize.format(
        previous_text=prompt_query_text + '\n' + json.dumps(query_attr_extract_dict_clean, indent=2),
        specification_json=json.dumps(specification, indent=2)
    )
    query_attr_extract_dict_clean_normalized = json.loads(llm(prompt_normalize_text))

    # update ontology with newly discovered normalized values
    for name, value in query_attr_extract_dict_clean_normalized.items():
        row_mask = l2_mask & (attribute_ontology['attribute_name'] == name)
        if not row_mask.any():
            # The model returned an attribute the ontology does not define for this L2.
            continue
        # The cell holds the repr of a list; literal_eval parses it without
        # executing arbitrary code from the CSV.
        known_values = list(ast.literal_eval(
            attribute_ontology.loc[row_mask, 'example_attribute_value'].iloc[0]
        ))
        known_lower = {known.lower() for known in known_values if isinstance(known, str)}
        candidates = value if isinstance(value, list) else [value]
        additions = []
        for candidate in candidates:
            if isinstance(candidate, str) and len(candidate) > 0 and candidate.lower() not in known_lower:
                additions.append(candidate)
                known_lower.add(candidate.lower())
        if additions:
            attribute_ontology.loc[row_mask, 'example_attribute_value'] = str(known_values + additions)

    # NOTE(review): the normalized dict is returned exactly as the model
    # produced it; it is not restricted to values/attributes known to the
    # ontology. Confirm whether callers expect a filtered result instead.
    return query_attr_extract_dict_clean, query_attr_extract_dict_clean_normalized, specification

def zero_shot_attribute_extraction_query(query_dict):
    """Run the two-step query extraction once per predicted category.

    Args:
        query_dict: Record with a ``query`` string and a ``categories`` string
            of comma-separated taxonomy ids.

    Returns:
        A list aligned with the ids in ``categories``. An entry is the
        helper's result tuple, or ``None`` when the id has no path in
        ``taxid2path``, when the path's L2 (its first two ``' > '`` segments)
        is not covered by the ontology, or when the extraction or the
        normalization came back empty.

    Raises:
        AssertionError: if ``categories`` is empty.
    """
    assert len(query_dict['categories']) > 0
    results = []
    for raw_id in query_dict['categories'].split(','):
        # taxid2path omits nodes with an empty path, so a predicted id can be
        # absent; treat it like any other unusable category rather than
        # aborting the whole query with a KeyError.
        path = taxid2path.get(int(raw_id))
        if path is None:
            results.append(None)
            continue
        l2 = " > ".join(path.split(" > ")[:2])
        if l2 not in l2set:
            results.append(None)
            continue
        extraction = zero_shot_attribute_extraction_query_helper(
            query=query_dict['query'],
            taxonomy=path,
            l2=l2
        )
        if len(extraction[0]) > 0 and len(extraction[1]) > 0:
            results.append(extraction)
        else:
            results.append(None)
    return results

In [43]:
# One entry per predicted category; an entry is None when the category's L2 is
# not in the ontology or when the extraction/normalization came back empty.
# Note: this rebinds `res`, which earlier held the raw SQL result rows.
res = zero_shot_attribute_extraction_query(query_dict)

In [44]:
# Display the sampled query together with its resolved category paths.
query_dict

{'query': 'manilla de signo zodiacal',
 'categories': '4111,2708,659',
 'category_paths': ['Jewelry & Accessories > Necklaces & Pendants > Pendant Necklaces',
  'Home & Garden > Home Decor > Plaques & Signs',
  'Automobiles & Motorcycles > Exterior Accessories > Car Stickers']}

In [45]:
# Display the per-category results: (extracted, normalized, specification) tuples or None.
res

[({'Material Type': 'manilla', 'Setting Type': 'signo zodiacal'},
  {'Material Type': 'metal', 'Setting Type': 'Prong-Setting'},
  {'Material Type': "['agarwood', 'alumide', 'bamboo', 'bodhi', 'ceramic', 'coral', 'crystal', 'enamel', 'epoxy', 'gemstone', 'glass', 'Gold Phoebe', 'horn', 'leather', 'metal', 'mineral-powder', 'mother-of-pearl', 'natural-fiber', 'paper', 'pearl', 'plaster', 'plastic', 'polyamide', 'resin', 'rhinestone', 'rosewood', 'rubber', 'sandalwood', 'shell', 'silicone', 'stone', 'synthetic-fiber', 'synthetic-resin', 'wood']",
   'Setting Type': "['2-Prong-setting', '3-Prong-Setting', '4-Prong-Setting', '6-Prong-Setting', '8-Prong-Setting', 'Band-Setting', 'Bar-Setting', 'Bead-Set', 'Bezel-Setting', 'Bypass-Setting', 'Channel-Setting', 'Classic-Solitaire Setting', 'Cluster-Setting', 'Cocktail-Setting', 'Cup-Setting', 'Designer-Setting', 'Dome-Setting', 'Eternity-Band-Setting', 'Fishtail-Setting', 'Flat-Top-Setting', 'Floating-Setting', 'Flush-Setting', 'Freedom-Settin