In [1]:
from tahoe import create_external_table, drop_external_table, execute_async
from s3 import result_bucket, get_s3_file_keys, get_df_from_parquet
import pandas as pd
import logging
import dvc.api
logging.basicConfig(
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.INFO
)

# Download

In [12]:
from l2s import commited_l2s, at_risk_l2s, very_at_risk_l2s

In [13]:
select_l2_set = set(commited_l2s).union(set(at_risk_l2s))

In [26]:
df_tax = pd.read_json("/workspaces/multitask-llm-rnd/datasets/data/taxonomy/wish_newtax.json", lines=True)

In [27]:
df_tax_latest = pd.read_json("/workspaces/multitask-llm-rnd/datasets/data/taxonomy/wish_newtax_02222023.json", lines=True)

In [28]:
df_tax_select_l2s = df_tax[df_tax.category_path.apply(lambda x: x.count(' > ') == 1 and any([
    x.startswith(i) for i in select_l2_set]))]

In [29]:
len(df_tax_select_l2s), len(select_l2_set)

(41, 41)

In [30]:
set(df_tax_select_l2s['category_path']) == select_l2_set

True

In [31]:
select_l2_ids_set = set([str(i) for i in df_tax_select_l2s['id'].tolist()])

In [32]:
sam_select_l2_ids_set = [1255,
1308,
2330,
2479,
2537,
2649,
2770,
2924,
3237,
3897,
4081,
4108,
4115,
4767,
5000,
4286,
1638,
5041,
5140,
4884,
2718,
2879,
4152,
4145,
4741,
1245,
1313,
1316,
1466,
1408,
1494,
1379,
1337,
1381,
1349,
1380,
1744,
1769,
1645,
4714,
3543]

In [33]:
len(set([str(i) for i in sam_select_l2_ids_set])), len(select_l2_ids_set)

(41, 41)

In [34]:
set([str(i) for i in sam_select_l2_ids_set]) - select_l2_ids_set

{'4286'}

In [35]:
select_l2_ids_set - set([str(i) for i in sam_select_l2_ids_set])

{'5229'}

In [36]:
df_tax[df_tax['id'].apply(lambda x: x in [4286, 5229])]

Unnamed: 0,category_tree_version,id,category_path,is_leaf
32,v1.2.1,5229,Sports > Sneakers,False
3962,v1.2.1,4286,Mother & Kids > Baby Shoes > Sneakers,True


In [37]:
df_tax_latest[df_tax_latest['id'].apply(lambda x: x in [4286, 5229])]

Unnamed: 0,category_tree_version,id,category_path,is_leaf
287,v2.0.1,5229,Sports > Sneakers,False
1928,v2.0.1,4286,Mother & Kids > Baby Shoes > Sneakers,True


In [38]:
# Define the export table
export_table = {
    "name": "collection_export_tmp0_041723",
    "columns": [
        {"name": "pid", "type": "STRING"},
        {"name": "category_id", "type": "INTEGER"},
        {"name": "category_path", "type": "STRING"},
        {"name": "title", "type": "STRING"},
        {"name": "description", "type": "STRING"}
    ]
}
db = "sweeper_dev"

In [20]:
# Create the export table
create_external_table(
    table_name=export_table["name"],
    table_definition=export_table,
    db=db,
    bucket=result_bucket
)

2023-04-17 23:35:04,609 INFO: USE `default`
2023-04-17 23:35:05,621 INFO: 
    CREATE TABLE sweeper_dev.collection_export_tmp0_041723 (
    pid STRING,
	category_id INTEGER,
	category_path STRING,
	title STRING,
	description STRING
    )
    
STORED AS PARQUET
LOCATION 's3://wish-tahoe-query-results/sweeper_dev/collection_export_tmp0_041723'
TBLPROPERTIES ('parquet.compression'='SNAPPY')
2023-04-17 23:35:06,545 INFO: The query returned no records.


In [39]:
# Check the number of rows in the external table
q = f"SELECT COUNT(*) FROM {db}.{export_table['name']}"
execute_async(q)

2023-04-17 23:37:59,937 INFO: SELECT COUNT(*) FROM sweeper_dev.collection_export_tmp0_041723


[(0,)]

In [40]:
ids_q = ','.join(["'" + str(i) + "'" for i in select_l2_ids_set])

In [41]:
q = f"""
INSERT INTO {db}.{export_table['name']}

WITH inappropriate_pids AS (
  SELECT pid
  FROM supply.microtagging_sensitive_products
  WHERE flag_for_removal = 1
) 

, top_products AS (
  SELECT
      a.product_id
  FROM analytics.agg_product_daily_stats a
  JOIN ( 
      SELECT product_id
      FROM wish.master_products
      WHERE merchant_id IS NOT NULL
      AND is_deleted != 'true'
      AND product_removed_by_merchant != 'true'
      AND commerce_active = 1
      AND state IN ('0','1') -- NEW or ACTIVE
  ) b
  ON a.product_id = b.product_id
  LEFT JOIN inappropriate_pids
  ON inappropriate_pids.pid = a.product_id
  WHERE a.gmv_365d > 0
  AND inappropriate_pids.pid IS NULL
)

, top_products_limiting_L2 AS (
  SELECT 
    top_products.product_id
    , category.category_id
    , category.category_path 
  FROM top_products
  JOIN 
    (
      SELECT product_id, category_id 
      FROM structured_data.wish_product_predicted_category
    ) product 
  ON top_products.product_id = product.product_id
  JOIN (
    SELECT 
      id AS category_id
      -- , name AS category_name
      , category_path
      -- , trim(regexp_extract(category_path, '^[^>]*>[^>]*')) AS L2 
      -- , trim(regexp_extract(category_path, '^[^>]*')) AS L1 
      -- , split_part(category_path, ' > ', 2) as l2_name
    FROM structured_data.wish_product_categories
    WHERE split_part(category_id_path, ',', 2) in (
      {ids_q}
    )
  ) category 
  ON product.category_id = category.category_id
)

SELECT 
  top_products_limiting_L2.product_id
  , top_products_limiting_L2.category_id
  , top_products_limiting_L2.category_path
  , text.name AS title 
  , text.product_description
  -- , variations.color 
  -- , variations.size 
FROM
(
  SELECT _id AS product_id, name, product_description 
  FROM wish.dim_all_products
) text 
JOIN 
  top_products_limiting_L2
ON text.product_id = top_products_limiting_L2.product_id"""
execute_async(q)

2023-04-17 23:38:44,945 INFO: 
INSERT INTO sweeper_dev.collection_export_tmp0_041723

WITH inappropriate_pids AS (
  SELECT pid
  FROM supply.microtagging_sensitive_products
  WHERE flag_for_removal = 1
) 

, top_products AS (
  SELECT
      a.product_id
  FROM analytics.agg_product_daily_stats a
  JOIN ( 
      SELECT product_id
      FROM wish.master_products
      WHERE merchant_id IS NOT NULL
      AND is_deleted != 'true'
      AND product_removed_by_merchant != 'true'
      AND commerce_active = 1
      AND state IN ('0','1') -- NEW or ACTIVE
  ) b
  ON a.product_id = b.product_id
  LEFT JOIN inappropriate_pids
  ON inappropriate_pids.pid = a.product_id
  WHERE a.gmv_365d > 0
  AND inappropriate_pids.pid IS NULL
)

, top_products_limiting_L2 AS (
  SELECT 
    top_products.product_id
    , category.category_id
    , category.category_path 
  FROM top_products
  JOIN 
    (
      SELECT product_id, category_id 
      FROM structured_data.wish_product_predicted_category
    ) produ

[(1007384,)]

In [42]:
# Check the number of rows in the external table
q = f"SELECT COUNT(*) FROM {db}.{export_table['name']}"
execute_async(q)

2023-04-17 23:53:19,812 INFO: SELECT COUNT(*) FROM sweeper_dev.collection_export_tmp0_041723


[(1007384,)]

In [43]:
# Define the export table
export_table2 = {
    "name": "collection_export_tmp2_041723",
    "columns": [
        {"name": "product_id", "type": "STRING"},
        {"name": "color", "type": "STRING"},
        {"name": "size", "type": "STRING"}
    ]
}

In [44]:
# Create the export table
create_external_table(
    table_name=export_table2["name"],
    table_definition=export_table2,
    db=db,
    bucket=result_bucket
)

2023-04-17 23:55:20,931 INFO: USE `default`
2023-04-17 23:55:21,860 INFO: 
    CREATE TABLE sweeper_dev.collection_export_tmp2_041723 (
    product_id STRING,
	color STRING,
	size STRING
    )
    
STORED AS PARQUET
LOCATION 's3://wish-tahoe-query-results/sweeper_dev/collection_export_tmp2_041723'
TBLPROPERTIES ('parquet.compression'='SNAPPY')
2023-04-17 23:55:22,778 INFO: The query returned no records.


In [45]:
# Check the number of rows in the external table
q = f"SELECT COUNT(*) FROM {db}.{export_table2['name']}"
execute_async(q)

2023-04-17 23:56:20,442 INFO: SELECT COUNT(*) FROM sweeper_dev.collection_export_tmp2_041723


[(0,)]

In [46]:
q = f"""
INSERT INTO {db}.{export_table2['name']}
SELECT
     a.product_id,
     a.color AS color,
     a.size AS size
FROM wish.master_product_variations a
   INNER JOIN
{db}.{export_table['name']} b ON
a.product_id = b.pid
"""
execute_async(q)

2023-04-17 23:57:20,449 INFO: 
INSERT INTO sweeper_dev.collection_export_tmp2_041723
SELECT
     a.product_id,
     a.color AS color,
     a.size AS size
FROM wish.master_product_variations a
   INNER JOIN
sweeper_dev.collection_export_tmp0_041723 b ON
a.product_id = b.pid



[(21067388,)]

In [47]:
# Check the number of rows in the external table
q = f"SELECT COUNT(*) FROM {db}.{export_table2['name']}"
execute_async(q)

2023-04-18 00:02:20,411 INFO: SELECT COUNT(*) FROM sweeper_dev.collection_export_tmp2_041723


[(21067388,)]

In [48]:
# Define the export table
export_table3 = {
    "name": "collection_export_tmp3_041723",
    "columns": [
        {"name": "product_id", "type": "STRING"},
        {"name": "color", "type": "ARRAY<STRING>"},
        {"name": "size", "type": "ARRAY<STRING>"}
    ]
}

In [49]:
# Create the export table
create_external_table(
    table_name=export_table3["name"],
    table_definition=export_table3,
    db=db,
    bucket=result_bucket
)

2023-04-18 00:04:21,017 INFO: USE `default`
2023-04-18 00:04:21,915 INFO: 
    CREATE TABLE sweeper_dev.collection_export_tmp3_041723 (
    product_id STRING,
	color ARRAY<STRING>,
	size ARRAY<STRING>
    )
    
STORED AS PARQUET
LOCATION 's3://wish-tahoe-query-results/sweeper_dev/collection_export_tmp3_041723'
TBLPROPERTIES ('parquet.compression'='SNAPPY')
2023-04-18 00:04:22,775 INFO: The query returned no records.


In [50]:
q = f"""
INSERT INTO {db}.{export_table3['name']}
SELECT
     product_id,
     array_agg(DISTINCT color) AS color,
     array_agg(DISTINCT size) AS size
FROM {db}.{export_table2['name']}
GROUP BY product_id
"""
execute_async(q)

2023-04-18 00:05:20,411 INFO: 
INSERT INTO sweeper_dev.collection_export_tmp3_041723
SELECT
     product_id,
     array_agg(DISTINCT color) AS color,
     array_agg(DISTINCT size) AS size
FROM sweeper_dev.collection_export_tmp2_041723
GROUP BY product_id



[(1007339,)]

In [51]:
# Define the export table
export_table4 = {
    "name": "collection_export_tmp4_041723",
    "columns": [
        {"name": "product_id", "type": "STRING"},
        {"name": "category_id", "type": "INTEGER"},
        {"name": "category_path", "type": "STRING"},
        {"name": "title", "type": "STRING"},
        {"name": "description", "type": "STRING"},
        {"name": "color", "type": "ARRAY<STRING>"},
        {"name": "size", "type": "ARRAY<STRING>"}
    ]
}

In [52]:
# Create the export table
create_external_table(
    table_name=export_table4["name"],
    table_definition=export_table4,
    db=db,
    bucket=result_bucket
)

2023-04-18 00:08:20,884 INFO: USE `default`
2023-04-18 00:08:21,782 INFO: 
    CREATE TABLE sweeper_dev.collection_export_tmp4_041723 (
    product_id STRING,
	category_id INTEGER,
	category_path STRING,
	title STRING,
	description STRING,
	color ARRAY<STRING>,
	size ARRAY<STRING>
    )
    
STORED AS PARQUET
LOCATION 's3://wish-tahoe-query-results/sweeper_dev/collection_export_tmp4_041723'
TBLPROPERTIES ('parquet.compression'='SNAPPY')
2023-04-18 00:08:22,707 INFO: The query returned no records.


In [53]:
q = f"""
INSERT INTO {db}.{export_table4['name']}
SELECT
     a.pid AS product_id,
     a.category_id AS category_id, 
     a.category_path AS category_path, 
     a.title AS title, 
     a.description AS description, 
     b.color AS color,
     b.size AS size
FROM {db}.{export_table['name']} a
LEFT JOIN
{db}.{export_table3['name']} b
ON a.pid = b.product_id
"""
execute_async(q)

2023-04-18 00:09:20,398 INFO: 
INSERT INTO sweeper_dev.collection_export_tmp4_041723
SELECT
     a.pid AS product_id,
     a.category_id AS category_id, 
     a.category_path AS category_path, 
     a.title AS title, 
     a.description AS description, 
     b.color AS color,
     b.size AS size
FROM sweeper_dev.collection_export_tmp0_041723 a
LEFT JOIN
sweeper_dev.collection_export_tmp3_041723 b
ON a.pid = b.product_id



[(1007384,)]

In [54]:
q = f"SELECT COUNT(*) FROM {db}.{export_table['name']}"
execute_async(q)

2023-04-18 00:10:20,013 INFO: SELECT COUNT(*) FROM sweeper_dev.collection_export_tmp0_041723


[(1007384,)]

In [55]:
q = f"SELECT COUNT(*) FROM {db}.{export_table4['name']}"
execute_async(q)

2023-04-18 00:11:20,418 INFO: SELECT COUNT(*) FROM sweeper_dev.collection_export_tmp4_041723


[(1007384,)]

In [56]:
# Show the data files for the external table
file_keys = get_s3_file_keys(s3_bucket=result_bucket, s3_prefix=f"{db}/{export_table4['name']}")
file_keys

[('sweeper_dev/collection_export_tmp4_041723/20230418_000920_00263_7r28x_216b3624-8a81-4da3-b911-30bb4b43af4f',
  83),
 ('sweeper_dev/collection_export_tmp4_041723/20230418_000920_00263_7r28x_331d483f-98ae-4a85-adc1-89e426a0a76f',
  7),
 ('sweeper_dev/collection_export_tmp4_041723/20230418_000920_00263_7r28x_79d0c10c-6d3d-4ad4-9d5f-57bee4830abe',
  215),
 ('sweeper_dev/collection_export_tmp4_041723/20230418_000920_00263_7r28x_90a22af5-54be-4c87-aeb2-24572133fabc',
  220),
 ('sweeper_dev/collection_export_tmp4_041723/20230418_000920_00263_7r28x_a8e61b13-a310-4e45-adde-22eaa186a406',
  7),
 ('sweeper_dev/collection_export_tmp4_041723/20230418_000920_00263_7r28x_cf32beb2-1c85-4a50-ae3a-d63096496a34',
  82)]

In [57]:
dfs = []
for file_key, file_size in file_keys:
    df_chunk = get_df_from_parquet(s3_bucket=result_bucket, s3_key=file_key)
    dfs.append(df_chunk)
    
df_data = pd.concat(dfs)
df_data.head()

2023-04-18 00:13:20,428 INFO: Importing from sweeper_dev/collection_export_tmp4_041723/20230418_000920_00263_7r28x_216b3624-8a81-4da3-b911-30bb4b43af4f...
2023-04-18 00:13:26,067 INFO: Imported DF (137132, 7) from sweeper_dev/collection_export_tmp4_041723/20230418_000920_00263_7r28x_216b3624-8a81-4da3-b911-30bb4b43af4f.
2023-04-18 00:13:26,077 INFO: Importing from sweeper_dev/collection_export_tmp4_041723/20230418_000920_00263_7r28x_331d483f-98ae-4a85-adc1-89e426a0a76f...
2023-04-18 00:13:27,106 INFO: Imported DF (11693, 7) from sweeper_dev/collection_export_tmp4_041723/20230418_000920_00263_7r28x_331d483f-98ae-4a85-adc1-89e426a0a76f.
2023-04-18 00:13:27,115 INFO: Importing from sweeper_dev/collection_export_tmp4_041723/20230418_000920_00263_7r28x_79d0c10c-6d3d-4ad4-9d5f-57bee4830abe...
2023-04-18 00:13:33,795 INFO: Imported DF (351460, 7) from sweeper_dev/collection_export_tmp4_041723/20230418_000920_00263_7r28x_79d0c10c-6d3d-4ad4-9d5f-57bee4830abe.
2023-04-18 00:13:33,805 INFO: Impor

Unnamed: 0,product_id,category_id,category_path,title,description,color,size
0,6190b08034553b253d3e6d59,5013,Sports > Fishing > Fishing Accessories > Fishi...,High Quality Folding Portable Quick-drying Alu...,Features:\n\nHigh Strength Stainless Steel Rin...,[None],"[30cm x 150cm, 5cm x 110cm, 33cm x 200cm]"
1,58403e20e1734713a67ed50c,2344,"Home & Garden > Arts, Crafts & Sewing > Appare...","300Pcs charm Crystal,4mm Interval of Bicone lo...",Material : crystal Quantity：300pcs\nsize：4...,[black],[4mm by 4mm]
2,5823e3abb59cd735d481ef52,5008,Sports > Fishing > Fish Finders,New Underwater Fishing Deep Drop Fish Trap Gat...,Color: White\nPowered by: 5 # battery (NOT inc...,[None],[None]
3,625d300373936eefee03fabb,2664,Home & Garden > Home Decor > Christian Decor >...,""" Good Days Start with Coffee Jesus""Wooden Han...",【Desciption】\n【Material】:wood\n【Size】:Length*w...,[None],"[8 pcs, 6 pcs, 7 pcs, wholesale 10pcs, 9 pcs, ..."
4,620b485ee63a017d03dfa2b6,2725,Home & Garden > Home Storage & Organization > ...,Punch-free Bathroom Shelf Shelves Shampoo Show...,Specification:\n\nMaterial: Space aluminum\nIn...,[None],"[Golden Standard, Golden With pole]"


In [58]:
df_data.to_json('product_attribute_extraction_2023q2tli_041723.json', lines=True, orient='records')

In [59]:
len(df_data)

1007384

In [60]:
df_data.sample(3).to_dict('records')

[{'product_id': '614d9af32e2485c2381e7b50',
  'category_id': 2680,
  'category_path': 'Home & Garden > Home Decor > Flags, Banners & Accessories',
  'title': 'West Highland Terrier White On White House Not A Home -362 Flag 3x5 Ft',
  'description': '[Material description] made of 100% polyester fiber (polyester)\n[Product performance] Durable, not easy to fade, easy to disassemble, add a stylish atmosphere to the courtyard\n[Applicable scenarios] Decorative flags, mainly used in courtyards, gardens, flower pots, etc. Suitable for any terrace or balcony, and make charming decorations for any backyard or lawn.\n[Accessories structure] Easy to install, flagpoles are not included.\n[Size]150*90CM\n[Washing instructions] Hand wash and machine wash are acceptable, do not bleach.',
  'color': array([None], dtype=object),
  'size': array(['3x5 ft'], dtype=object)},
 {'product_id': '59127b4b4ee97415bd6cde38',
  'category_id': 3270,
  'category_path': 'Home & Garden > Pet Products > Dog Carriers

In [61]:
for t in [ 
    export_table, export_table2, export_table3, export_table4
]:
    drop_external_table(
        db=db,
        table_name=t["name"],
        delete_files=True,
        s3_bucket=result_bucket,
        s3_prefix=f"{db}/{t['name']}", 
    )

2023-04-18 00:17:20,835 INFO: USE `default`
2023-04-18 00:17:21,813 INFO: 
    DROP TABLE IF EXISTS sweeper_dev.collection_export_tmp0_041723
    
2023-04-18 00:17:22,990 INFO: The query returned no records.
2023-04-18 00:17:22,992 INFO: Dropped sweeper_dev.collection_export_tmp0_041723
2023-04-18 00:17:23,542 INFO: Files in 's3://wish-tahoe-query-results/sweeper_dev/collection_export_tmp0_041723 are deleted.
2023-04-18 00:17:23,966 INFO: USE `default`
2023-04-18 00:17:24,906 INFO: 
    DROP TABLE IF EXISTS sweeper_dev.collection_export_tmp2_041723
    
2023-04-18 00:17:25,852 INFO: The query returned no records.
2023-04-18 00:17:25,853 INFO: Dropped sweeper_dev.collection_export_tmp2_041723
2023-04-18 00:17:26,301 INFO: Files in 's3://wish-tahoe-query-results/sweeper_dev/collection_export_tmp2_041723 are deleted.
2023-04-18 00:17:26,730 INFO: USE `default`
2023-04-18 00:17:27,668 INFO: 
    DROP TABLE IF EXISTS sweeper_dev.collection_export_tmp3_041723
    
2023-04-18 00:17:28,588 INF

# check pulled data

In [14]:
df_data_load = pd.read_json('product_attribute_extraction_2023q2tli_041723.json', lines=True)

In [15]:
df_data_load.sample(3).to_dict('records')

[{'product_id': '60ecfe4aac249c34fd4957a0',
  'category_id': 4928,
  'category_path': 'Sports > Camping & Hiking > Tents & Shelters > Tent Accessories',
  'title': '10Pcs Windproof Outdoor Awning Tent Fixed Hook Buckle Shelf Floor Spring Nails Fixed Hook Buckle Camping Accessories',
  'description': "Features \n\nColor: As picture\nSize: Height approximately 7.5cm width approximately 3.3cm error approximately 1cm\nMaterial: Aluminum alloy\n \nPackage Content:\n10 PCS \n    \nKindly Tip:     \n 1. Conversion:  1 inch= 2.54cm                                                                                                            \nSize Approximately :  These size is about Asian Size,general get one size smaller than US/EU size,Please reference to the size chart in the picture. Item Measure by hand, it could be 1-3cm different. hope you can understanding, will be sincerely appreciated.\n\n2 .Please be reminded that due lighting effects, monitor's brightness/ contrast settings etc, there

In [16]:
len(df_data_load)

1007384

In [17]:
data_l2s = set(df_data_load.category_path.apply(lambda x: ' > '.join(x.split(' > ')[:2])))

In [18]:
select_l2_set - data_l2s

{'Computer & Office > Mini PC',
 'Home & Garden > Kitchen,Dining & Bar',
 'Jewelry & Accessories > Necklaces & Pendants',
 'Jewelry & Accessories > Rings'}

In [19]:
data_l2s - select_l2_set

{'Computer & Office > Mini PCs', 'Home & Garden > Kitchen, Dining & Bar'}

In [20]:
set(df_data_load[df_data_load.category_path.apply(lambda x: x.startswith('Jewelry & Accessories'))].category_path.apply(
    lambda x: ' > '.join(x.split(' > ')[:2])
))

{'Jewelry & Accessories > Fine Jewelry'}

In [21]:
df_metadata = pd.read_csv("/workspaces/multitask-llm-rnd/datasets/data/attribute_extraction_metadata_template/Initial Attribute Definition for First Release - UPDATED SHEET .csv")
df_metadata = df_metadata[['category', 'category_id', 'attribute_field', 'category_attributevalue', 'description', 'max_multi_select', 'usage', 'entry mode']]
metadata_category = set(df_metadata.category)
metadata_category_ids = set(df_metadata.category_id)
category2config = {}
for i in metadata_category:
    df_tmp = df_metadata[df_metadata.category == i]
    category2config[i] = sorted(df_tmp.to_dict('records'), key=lambda x: x['attribute_field'])
categoryid2config = {}
for i in metadata_category_ids:
    df_tmp = df_metadata[df_metadata.category_id == i]
    categoryid2config[i] = sorted(df_tmp.to_dict('records'), key=lambda x: x['attribute_field'])


In [22]:
len(set(df_data_load.category_id))

1338

In [23]:
len(set(categoryid2config))

2658

In [24]:
len(set(df_data_load.category_id).intersection(set(categoryid2config)))

1233

In [25]:
len(set(df_data_load.category_id) - (set(categoryid2config)))

105

In [26]:
idswithattr = set(df_data_load.category_id).intersection(set(categoryid2config))

In [27]:
idswithoutattr = set(df_data_load.category_id) - (set(categoryid2config))

In [28]:
len(df_data_load[df_data_load.category_id.apply(lambda x: x in idswithattr)])

969437

In [29]:
len(df_data_load[df_data_load.category_id.apply(lambda x: x in idswithoutattr)])

37947

In [30]:
df_data_load_valid = df_data_load[df_data_load.category_id.apply(lambda x: x in idswithattr)]

In [31]:
len(df_data_load_valid)

969437

In [33]:
df_data_load_valid_sample = df_data_load_valid.groupby('category_id').sample(10, replace=True).drop_duplicates('product_id').reset_index()

In [34]:
len(df_data_load_valid_sample)

11328

# prompt

In [5]:
import asyncio
import aiohttp
import logging
import os 
from copy import deepcopy
import random
import json

OPENAI_KEY = os.environ['WISH_OPENAI_KEY_DEV']
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {OPENAI_KEY}"
}

async def call_oai(session, data):
    try:
        async with session.post(
            'https://api.openai.com/v1/chat/completions',
            headers=headers,
            json=data
        ) as response:
            res = await response.json()
        return res
    except Exception as e:
        logging.error(f"{data} failed due to {e}")
        return None

async def call_oais(datas):
    async with aiohttp.ClientSession() as session:
        tasks = []
        for data in datas:
            task = asyncio.ensure_future(call_oai(session, data))
            tasks.append(task)
        reses = await asyncio.gather(*tasks)
        return reses

In [35]:
df_md_anno = pd.read_json('/workspaces/multitask-llm-rnd/datasets/data/offshore_attr_extract/merchant_dashboard/pdp/output/product_attr_extract_md_03252023_result_04172023.json', 
    lines=True).drop_duplicates('product_id')

In [36]:
def clean_human_result_return(d_):
    try:
        d = deepcopy(d_)
        for k in d:
            if len(d[k]) > 0 and d[k][0].strip().lower() == 'not sure':
                d[k] = []
        return d
    except Exception as e:
        print(d, e)
        return None

df_md_anno['extraction_result_clean'] = df_md_anno['extraction_result'].apply(
    lambda x: clean_human_result_return(eval(x)))

In [37]:
from collections import defaultdict

In [38]:
def sample_from_strified_list(t):
    l = eval(t)
    if len(l) > 1:
        return random.sample(l, min(3, len(l))) + ['... and more']
    else:
        return "Free text, please extract from product title and description"


In [39]:
df_md_anno_sample = df_md_anno.groupby('category_id').sample(1, random_state=42)

In [40]:
id2annorec = {}

In [41]:
for i in df_md_anno_sample.to_dict('records'):
    id2annorec[i['category_id']] = i

In [30]:
rec = df_data_load.sample(1).to_dict('records')[0]

In [31]:
set(rec)

{'category_id',
 'category_path',
 'color',
 'description',
 'product_id',
 'size',
 'title'}

In [32]:
import random 
random.seed(42)

In [33]:
if rec['category_id'] in id2annorec:
    rec_oneshot = id2annorec[rec['category_id']]
else:
    rand_cat = random.sample(list(id2annorec), 1)[0]
    rec_oneshot = id2annorec[rand_cat]

In [34]:
title_oneshot = rec_oneshot['title']
desp_oneshot = rec_oneshot['product_description']

In [35]:
attrs_oneshot = '\n'.join([f"{i['attribute_field']}: {sample_from_strified_list(i['category_attributevalue'])}" \
    for i in categoryid2config[rec_oneshot['category_id']]])


In [36]:
attr_template_oneshot = {i['attribute_field']: [] for i in categoryid2config[rec_oneshot['category_id']]}

In [37]:
human_result_oneshot = rec_oneshot['extraction_result_clean']

In [38]:
assert set(attr_template_oneshot) == set(human_result_oneshot)

In [39]:
human_result_oneshot = {i: human_result_oneshot[i] for i in attr_template_oneshot}

In [40]:
title_infer = rec['title']
desp_infer = rec['description']
attrs_infer = '\n'.join([f"{i['attribute_field']}: {sample_from_strified_list(i['category_attributevalue'])}" \
    for i in categoryid2config[rec['category_id']]])
attr_template_infer = {i['attribute_field']: [] for i in categoryid2config[rec['category_id']]}

In [41]:
msgs = [
    {"role": "system", "content": """You are a state of art NER system that extracts all attribute key value pairs from e commerce data by filling in the blank."""},
    {"role": "user", "content": f"""[title start] {title_oneshot} [title end] [description start] {desp_oneshot} [description end] Fill in the blank here: {json.dumps(attr_template_oneshot)}"""},
    {"role": "assistant", "content": json.dumps(human_result_oneshot)},
    {"role": "user", "content": f"""Do better, you are state of art. [title start] {title_infer} [title end] [description start] {desp_infer} [description end] Fill in the blank here: {json.dumps(attr_template_infer)}"""},
]

In [42]:
await call_oais([ 
    {
        "model": "gpt-3.5-turbo",
        "messages": msgs,
        "temperature": 0,
        "max_tokens": 500
    }
])

[{'id': 'chatcmpl-76gJE9tq27sUPigkyK2FKcJcVDGY0',
  'object': 'chat.completion',
  'created': 1681826960,
  'model': 'gpt-3.5-turbo-0301',
  'usage': {'prompt_tokens': 786,
   'completion_tokens': 42,
   'total_tokens': 828},
  'choices': [{'message': {'role': 'assistant',
     'content': '{"Alpha Size": ["50"], "Fishing Reel or Rod Type": [], "Primary Color": ["No Color"], "Rod Length (Include Unit of Measure)": [], "Rod Power Level": []}'},
    'finish_reason': 'stop',
    'index': 0}]}]

In [51]:
90000 / 1000 / (60 / 10)

15.0

In [43]:
res = await call_oais([ 
    {
        "model": "gpt-3.5-turbo",
        "messages": msgs,
        "temperature": 0,
        "max_tokens": 500
    }
] * 15)

In [44]:
await call_oais([ 
    {
        "model": "gpt-3.5-turbo",
        "messages": msgs,
        "temperature": 0,
        "max_tokens": 500
    },
    {
        "model": "gpt-3.5-turbo",
        "messages": {},
        "temperature": 0,
        "max_tokens": 500
    }
])

[{'id': 'chatcmpl-76gK7O5y3pxrmCNFf6QceoaGO4S1x',
  'object': 'chat.completion',
  'created': 1681827015,
  'model': 'gpt-3.5-turbo-0301',
  'usage': {'prompt_tokens': 786,
   'completion_tokens': 42,
   'total_tokens': 828},
  'choices': [{'message': {'role': 'assistant',
     'content': '{"Alpha Size": ["50"], "Fishing Reel or Rod Type": [], "Primary Color": ["No Color"], "Rod Length (Include Unit of Measure)": [], "Rod Power Level": []}'},
    'finish_reason': 'stop',
    'index': 0}]},
 {'error': {'message': "{} is not of type 'array' - 'messages'",
   'type': 'invalid_request_error',
   'param': None,
   'code': None}}]

In [42]:
def create_oai_request(rec):
    if rec['category_id'] in id2annorec:
        rec_oneshot = id2annorec[rec['category_id']]
    else:
        rand_cat = random.sample(list(id2annorec), 1)[0]
        rec_oneshot = id2annorec[rand_cat]
    title_oneshot = rec_oneshot['title']
    desp_oneshot = rec_oneshot['product_description']

    attr_template_oneshot = {i['attribute_field']: [] for i in categoryid2config[rec_oneshot['category_id']]}

    human_result_oneshot = rec_oneshot['extraction_result_clean']
    assert set(attr_template_oneshot) == set(human_result_oneshot)
    human_result_oneshot = {i: human_result_oneshot[i] for i in attr_template_oneshot}

    title_infer = rec['title']
    desp_infer = rec['description']

    attr_template_infer = {i['attribute_field']: [] for i in categoryid2config[rec['category_id']]}

    msgs = [
        {"role": "system", "content": """You are a state of art NER system that extracts all attribute key value pairs from e commerce data by filling in the blank."""},
        {"role": "user", "content": f"""[title start] {title_oneshot} [title end] [description start] {desp_oneshot} [description end] Fill in the blank here: {json.dumps(attr_template_oneshot)}"""},
        {"role": "assistant", "content": json.dumps(human_result_oneshot)},
        {"role": "user", "content": f"""Do better, you are state of art. [title start] {title_infer} [title end] [description start] {desp_infer} [description end] Fill in the blank here: {json.dumps(attr_template_infer)}"""},
    ]

    return {
        "model": "gpt-3.5-turbo",
        "messages": msgs,
        "temperature": 0,
        "max_tokens": 500
    }




In [47]:
from tqdm import tqdm
with open('product_attribute_extraction_2023q2tli_041723_validprompt_041823.json', 'w') as f:
    for rec in tqdm(df_data_load_valid.to_dict('records')):
        rec['prompt'] = create_oai_request(rec)
        f.write(json.dumps(rec) + '\n')

100%|██████████| 969437/969437 [01:57<00:00, 8230.51it/s]


In [43]:
from tqdm import tqdm
with open('product_attribute_extraction_2023q2tli_041723_validprompt_041823_sample_041823.json', 'w') as f:
    for rec in tqdm(df_data_load_valid_sample.to_dict('records')):
        rec['prompt'] = create_oai_request(rec)
        f.write(json.dumps(rec) + '\n')

100%|██████████| 11328/11328 [00:01<00:00, 9325.86it/s]


In [2]:
df_data_load_valid_prompt = next(pd.read_json('product_attribute_extraction_2023q2tli_041723_validprompt_041823.json', lines=True, chunksize=15))

In [44]:
len(df_data_load_valid_sample)

11328

In [7]:
df_data_load_valid_prompt

Unnamed: 0,product_id,category_id,category_path,title,description,color,size,prompt
0,58403e20e1734713a67ed50c,2344,"Home & Garden > Arts, Crafts & Sewing > Appare...","300Pcs charm Crystal,4mm Interval of Bicone lo...",Material : crystal Quantity：300pcs\nsize：4...,[black],[4mm by 4mm],"{'model': 'gpt-3.5-turbo', 'messages': [{'role..."
1,5823e3abb59cd735d481ef52,5008,Sports > Fishing > Fish Finders,New Underwater Fishing Deep Drop Fish Trap Gat...,Color: White\nPowered by: 5 # battery (NOT inc...,[None],[None],"{'model': 'gpt-3.5-turbo', 'messages': [{'role..."
2,625d300373936eefee03fabb,2664,Home & Garden > Home Decor > Christian Decor >...,""" Good Days Start with Coffee Jesus""Wooden Han...",【Desciption】\n【Material】:wood\n【Size】:Length*w...,[None],"[8 pcs, 6 pcs, 7 pcs, wholesale 10pcs, 9 pcs, ...","{'model': 'gpt-3.5-turbo', 'messages': [{'role..."
3,620b485ee63a017d03dfa2b6,2725,Home & Garden > Home Storage & Organization > ...,Punch-free Bathroom Shelf Shelves Shampoo Show...,Specification:\n\nMaterial: Space aluminum\nIn...,[None],"[Golden Standard, Golden With pole]","{'model': 'gpt-3.5-turbo', 'messages': [{'role..."
4,60468658500a1bd5241a6807,2498,Home & Garden > Festive & Party Supplies > Chr...,Pride Rainbow Love is Love Heart Christmas Tre...,\n > Made of plastic and paint/mica\n > 4.5 in...,[Multicolor],[None],"{'model': 'gpt-3.5-turbo', 'messages': [{'role..."
5,6134e085dc6dc1565aba3d9a,3183,"Home & Garden > Kitchen, Dining & Bar > Kitche...",1/ 2/3 PCS Free rotating hose Single arc charg...,"Color: Black, silver, blue, red, gold, rose go...","[silver, black, blue, red, gold, rosegold]","[3 pcs, 1 pc, 2 pcs]","{'model': 'gpt-3.5-turbo', 'messages': [{'role..."
6,5c2ffb3a160b4330482391d8,2716,Home & Garden > Home Decor > Wall Stickers,2018 NEW Quotes Wall Decal English Well-known...,"Material: PVC\nFunction: anti mildew, mothproo...","[pink, white, black]",[None],"{'model': 'gpt-3.5-turbo', 'messages': [{'role..."
7,619706d700cce6fbcd6b2cde,3272,Home & Garden > Pet Products > Dog Clothing & ...,Dog Hair Bows with Rubber Bands Cute Puppy Gro...,Feature: . Bowknot Size: This Halloween dog ru...,[None],[None],"{'model': 'gpt-3.5-turbo', 'messages': [{'role..."
8,6181f0f8fd4f44a63844a444,4912,Sports > Camping & Hiking > Hiking Clothing > ...,2/4/9/11 Zone Heating Mens Winter Heated USB F...,Heating Padded Coat USB Charging Solid Color C...,"[black, None, red, blue]","[9 Zone Heating - 2XL, 4 Zone Heating - S, 11 ...","{'model': 'gpt-3.5-turbo', 'messages': [{'role..."
9,5ee8e50078ad497ec94e15c1,2788,Home & Garden > Home Textile > Carpets & Rugs ...,Classical Brown Horse Anti-slip Area Door Mat ...,Pattern Type:Classical brown horse carpet\nCat...,[None],"[60 cm, 100 cm, 80 cm]","{'model': 'gpt-3.5-turbo', 'messages': [{'role..."


In [6]:
res = await call_oais(df_data_load_valid_prompt.prompt.tolist())

In [9]:
len(res)

15

In [10]:
res[0]

{'id': 'chatcmpl-76gWy25I64RrKwTb7xOwP5SB6MsvG',
 'object': 'chat.completion',
 'created': 1681827812,
 'model': 'gpt-3.5-turbo-0301',
 'usage': {'prompt_tokens': 726,
  'completion_tokens': 116,
  'total_tokens': 842},
 'choices': [{'message': {'role': 'assistant',
    'content': '{"Age Range Description": [], "Brand Name": [], "Finish": [], "Item Height (Include Unit of Measure)": [], "Item Length (Include Unit of Measure)": [], "Item Types": ["Beads"], "Item Width (Include Unit of Measure)": [], "Materials": ["Crystal"], "Model or Model Number": [], "Occasion & Lifestyle": [], "Pattern": [], "Primary Color": ["Black"], "Shape": ["Bicone"], "Sizes": ["4mm"], "Styles": ["DIY", "Loose"], "Surface Recommendation": []}'},
   'finish_reason': 'stop',
   'index': 0}]}

# inspect

In [10]:
from run_oai_inference import call_oais

In [2]:
df_inspect = pd.read_json('product_attribute_extraction_2023q2tli_041723_validprompt_041823_sample_041823_oaiinfer_041823.json', lines=True)

In [3]:
len(df_inspect)

11328

In [8]:
(df_inspect['oai_response'].isna()).any()

True

In [11]:
df_error = df_inspect[df_inspect['oai_response'].isna()]

In [12]:
df_error

Unnamed: 0,index,product_id,category_id,category_path,title,description,color,size,prompt,oai_response
5273,981478,60ebed6adb224f00035edb71,2890,Home & Garden > Household Merchandise > Dust C...,Refrigerator Dust Cover With Pocket Storage Ba...,Refrigerator Dust Cover With Pocket Storage Ba...,[lightgrey],[None],"{'model': 'gpt-3.5-turbo', 'messages': [{'role...",
6190,416462,61b0c9ba3e9f24d9018e90f5,3005,"Home & Garden > Kitchen, Dining & Bar > Cookwa...",Lodge 12 Inch Cast Iron Lid. Classic 12-Inch C...,Size:12 Inch\n Product Description Lodge 12 In...,[None],[None],"{'model': 'gpt-3.5-turbo', 'messages': [{'role...",
8468,635154,5d6cb7ef6920d0322f6aa997,3300,Home & Garden > Pet Products > Dog Feeding,250ml Pet Dog Cat Water Bottle Portable Feeder...,"Description:\n\nABS sink, silicone seal ring, ...","[purple, blue, green, pink]",[None],"{'model': 'gpt-3.5-turbo', 'messages': [{'role...",
9442,552688,60dc4c776596e4569e4bc1ff,3976,Home Improvement > Lights & Lighting > Outdoor...,Solar Garden Full Moon LED Lights Outdoor Glob...,Product Description：\nThe moon garden solar li...,"[blue, gold, None]","[Flame, None, Moon, Sun]","{'model': 'gpt-3.5-turbo', 'messages': [{'role...",


In [14]:
await call_oais(df_error.prompt.tolist())

[{'id': 'chatcmpl-76hVcZ6oQ9eL3nh2OCjIPwpstpOxD',
  'object': 'chat.completion',
  'created': 1681831572,
  'model': 'gpt-3.5-turbo-0301',
  'usage': {'prompt_tokens': 688,
   'completion_tokens': 98,
   'total_tokens': 786},
  'choices': [{'message': {'role': 'assistant',
     'content': '{"Alpha Size": [], "Brand Name": ["RLJLIVES"], "Item Height (Include Unit of Measure)": [], "Item Length (Include Unit of Measure)": ["128cm"], "Item Width (Include Unit of Measure)": ["54cm"], "Materials": ["Stainless steel"], "Model or Model Number": [], "Pattern": [], "Primary Color": ["Gray"], "Styles": ["Refrigerator Dust Cover", "Storage Organizer Bags", "Hanging Bag"]}'},
    'finish_reason': 'stop',
    'index': 0}]},
 {'id': 'chatcmpl-76hVcxZRWPbThg9CRjfsNPkLk156J',
  'object': 'chat.completion',
  'created': 1681831572,
  'model': 'gpt-3.5-turbo-0301',
  'usage': {'prompt_tokens': 970,
   'completion_tokens': 147,
   'total_tokens': 1117},
  'choices': [{'message': {'role': 'assistant',
   

In [15]:
df_inspect.loc[df_inspect['oai_response'].isna(), 'oai_response'] = await call_oais(
    df_inspect.loc[df_inspect['oai_response'].isna(), 'prompt'].tolist())

In [16]:
(df_inspect['oai_response'].isna()).any()

False

In [17]:
df_inspect.to_json('product_attribute_extraction_2023q2tli_041723_validprompt_041823_sample_041823_oaiinfer_041823_nonan_041823.json', 
    orient='records', lines=True)

# parse

In [2]:
df_inspect = pd.read_json(
    'product_attribute_extraction_2023q2tli_041723_validprompt_041823_sample_041823_oaiinfer_041823_nonan_041823.json', 
    lines=True
)

In [4]:
import json

In [5]:
def parse_oai_response(d):
    try:
        return json.loads(d['choices'][0]['message']['content'])
    except Exception as e:
        print(e)
        return None

In [6]:
df_inspect['oai_response_parsed'] = df_inspect['oai_response'].apply(parse_oai_response)

Extra data: line 3 column 1 (char 152)
Extra data: line 3 column 1 (char 98)
Extra data: line 3 column 1 (char 99)
Extra data: line 3 column 1 (char 98)
Extra data: line 3 column 1 (char 98)
Extra data: line 3 column 1 (char 99)
Extra data: line 3 column 1 (char 167)
Expecting value: line 1 column 1430 (char 1429)
Unterminated string starting at: line 1 column 1355 (char 1354)
Extra data: line 1 column 98 (char 97)
Extra data: line 3 column 1 (char 85)
Extra data: line 3 column 1 (char 85)
Extra data: line 3 column 1 (char 85)
Extra data: line 3 column 1 (char 85)
Expecting value: line 1 column 1 (char 0)
Extra data: line 3 column 1 (char 80)
Extra data: line 3 column 1 (char 81)
Extra data: line 3 column 1 (char 200)
Extra data: line 3 column 1 (char 200)
Extra data: line 3 column 1 (char 121)
Extra data: line 3 column 1 (char 98)
Extra data: line 3 column 1 (char 98)
Extra data: line 3 column 1 (char 85)
Extra data: line 3 column 1 (char 85)
Extra data: line 3 column 1 (char 85)
Extr

In [9]:
df_inspect.oai_response_parsed.isna().any()

True

In [10]:
df_inspect.oai_response_parsed.isna().mean()

0.014565677966101694

In [12]:
df_inspect.to_json('product_attribute_extraction_2023q2tli_041723_validprompt_041823_sample_041823_oaiinfer_041823_nonan_041823_parsed_041823.json', 
    orient='records', lines=True)

In [15]:
df_inspect[['category_id', 'title', 'description', 'oai_response_parsed']].sample(2).to_dict('records')

[{'category_id': 3081,
  'title': 'Stainless Steel 5Style Fried Egg Pancake Shaper Omelette Mold Mould Frying Egg Cooking Tools Kitchen Accessories Gadget Rings',
  'description': 'Package Included:\n1/5PCS Stainless Steel Fried Egg Shaper Pancake Mold Mold Kitchen Cooking Tool (No Retail Package)\n\nfeature:\n\n100% brand new and high quality.\nQuantity: 1PC\nCute shaped molds for your sweetheart to boil eggs or pancakes\nand your family\nEasy to use and easy to clean\nDon\'t worry about getting burned\nGreat for pancakes, muffins and more\nMaterial: Stainless steel\nColor: silver\nDimensions: 10cm x 10cm / 3.94" x 3.94" (approx.)\nHeight: 1.5 cm / 0.59 inches (approx.)\nNote: The product handle is random\nnotes:\n\nDue to manual measurement, there may be a 1-2 cm error in the size\n\nColors may vary due to different monitors',
  'oai_response_parsed': {'Age Range Description': [],
   'Alpha Size': [],
   'Brand Name': [],
   'Care Instructions': [],
   'Certification': [],
   'Finish