In [1]:
import pandas as pd 
import dvc.api 
from tqdm import tqdm

In [2]:
# Resolve the DVC-tracked labelled test set to a URL, then read it as
# JSON Lines (one record per line).
test_set_url = dvc.api.get_url(
    'datasets/data/query_label/processed/Offshore_Labelled_Query_Classification_Test_V2.json',
    repo='git@github.com:ContextLogic/multitask-llm-rnd.git',
)
df_test = pd.read_json(test_set_url, lines=True)

In [3]:
# Resolve the DVC-tracked taxonomy file to a URL, then read it as
# JSON Lines (one record per line).
taxonomy_url = dvc.api.get_url(
    'datasets/data/taxonomy/wish_newtax.json',
    repo='git@github.com:ContextLogic/multitask-llm-rnd.git',
)
df_tax = pd.read_json(taxonomy_url, lines=True)

In [7]:
# Lookup tables built from the taxonomy frame:
#   tax2id   : lower-cased category path -> category id (as str)
#   tax2name : lower-cased category path -> leaf name
#   id2name  : category id (as str)      -> leaf name
# The leaf name is the last segment of the ' > '-separated category path.
# Rows with an empty category_path are skipped.
tax2id, id2name, tax2name = {}, {}, {}
for tax_row in df_tax.to_dict('records'):
    path = tax_row['category_path']
    if len(path) == 0:
        continue
    path_key = path.lower()
    cat_id = str(tax_row['id'])
    leaf_name = path.split(' > ')[-1]
    tax2id[path_key] = cat_id
    tax2name[path_key] = leaf_name
    id2name[cat_id] = leaf_name

In [6]:
# Preview the first two labelled test queries to check the loaded columns.
df_test.head(2)

Unnamed: 0,index,filename,label_ordering,query,sample_method,gmv,cnt,query_classifications,orig_query,query_classification_lists,query_classification_ids,lang
0,75754,offshore/TieBreaker File - 8 Dec 22.xlsx,1,regal kinderzimmer,head,410.022095,1201,Furniture > Children Furniture > Children Ward...,regal kinderzimmer,[Furniture > Children Furniture > Children War...,"[2151, 2144, -1]",de
1,386969,offshore/TieBreaker File - 8 Dec 22.xlsx,2,wooden box with lid,uniform,53.815742,221,Home & Garden > Home Storage & Organization > ...,wooden box with lid,[Home & Garden > Home Storage & Organization >...,"[2733, 2766, 2375]",en


In [8]:
# Values stamped on every uploaded row. Named here, rather than buried in the
# loop, so a new upload only needs these three lines changed.
UPLOAD_DT = '2023-03-14'
MODEL_VERSION = 4
TAXONOMY_VERSION = 121


def _build_upload_record(rec, id2name, dt=UPLOAD_DT,
                         model_version=MODEL_VERSION,
                         taxonomy_version=TAXONOMY_VERSION):
    """Turn one labelled test record into one flat upload row.

    Args:
        rec: dict with a 'query' string and a 'query_classification_ids'
            sequence of category ids.
        id2name: mapping from category id (as str) to category leaf name.
        dt, model_version, taxonomy_version: values copied verbatim into the
            row's 'dt', 'model_version' and 'taxonomy_version' fields.

    Returns:
        dict with comma-joined 'categories', 'category_names' and 'weights'
        strings (every kept category gets weight '1') plus the three fields
        above.

    Raises:
        KeyError: if a kept category id is missing from id2name.
    """
    categories = []
    for cat_id in rec['query_classification_ids']:
        # The first -1 ends the label list; anything after it is ignored.
        # NOTE(review): -1 looks like trailing padding — confirm.
        if cat_id == -1:
            break
        categories.append(str(cat_id))
    return {
        'query': rec['query'],
        'categories': ','.join(categories),
        'category_names': ','.join(id2name[cat] for cat in categories),
        'weights': ','.join('1' for _ in categories),
        'dt': dt,
        'model_version': model_version,
        'taxonomy_version': taxonomy_version,
    }


# Build all rows first, then create the DataFrame once.
upload_recs = [
    _build_upload_record(rec, id2name) for rec in df_test.to_dict('records')
]
df_upload = pd.DataFrame(upload_recs)

In [12]:
# Spot-check two random upload rows; the fixed seed keeps the preview
# reproducible across Restart & Run All.
df_upload.sample(2, random_state=0)

Unnamed: 0,query,categories,category_names,weights,dt,model_version,taxonomy_version
9484,gryffindor,421159751862,"T-Shirts,Quartz Watches,Literature & Fiction",111,2023-03-14,4,121
6514,car scratch repair,646643648,"Polishes,Paint Cleaner,Spot Rust & Tar Spot Re...",111,2023-03-14,4,121


In [13]:
from tahoe import create_table, create_external_table, drop_external_table, execute_async
from s3 import temp_bucket, get_s3_file_keys, upload_df_to_parquet
import pandas as pd
import logging
# Log at INFO and above; each line carries a timestamp and the level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')

In [14]:
# Alias, not a copy: df_data and df_upload refer to the same DataFrame object.
df_data = df_upload

In [15]:
# Number of rows about to be uploaded.
len(df_data)

13448

In [16]:
# Create the permanent table
# "columns" are the data fields; "partitions" are the partition keys.
test_table = {
    "name": "query_top_3_humanlabels_v4",
    "columns": [
        {"name": "query", "type": "STRING"},
        {"name": "categories", "type": "STRING"},
        {"name": "category_names", "type": "STRING"},
        {"name": "weights", "type": "STRING"},
    ],
    "partitions": [
        {"name": "dt", "type": "STRING"},
        {"name": "model_version", "type": "INTEGER"},
        {"name": "taxonomy_version", "type": "INTEGER"},
    ]
}
db = "sweeper_dev"

# Drop any previous version of the table so it is recreated from scratch.
# IF EXISTS keeps this cell runnable on a fresh database where the table has
# never been created (a plain DROP TABLE would fail there).
q = f"DROP TABLE IF EXISTS {db}.{test_table['name']}"
execute_async(q, engine="hive")
create_table(table_definition=test_table, db=db)

# Temp (external) staging table: the same fields as the permanent table, with
# the partition keys as ordinary trailing columns. Derived from test_table so
# the two definitions cannot drift apart; dicts are copied so the permanent
# definition is never shared by reference.
temp_test_table = {
    "name": f"{test_table['name']}_temp",
    "columns": [
        dict(col) for col in test_table["columns"] + test_table["partitions"]
    ],
}

create_external_table(
    table_name=temp_test_table["name"],
    table_definition=temp_test_table,
    db=db,
    bucket=temp_bucket
)


2023-03-15 14:17:49,457 INFO: USE `default`
2023-03-15 14:17:50,348 INFO: DROP TABLE sweeper_dev.query_top_3_humanlabels_v4
2023-03-15 14:17:51,189 INFO: The query returned no records.
2023-03-15 14:17:51,592 INFO: USE `default`
2023-03-15 14:17:52,519 INFO: 
    CREATE TABLE sweeper_dev.query_top_3_humanlabels_v4 (
    query STRING,
	categories STRING,
	category_names STRING,
	weights STRING
    )
    PARTITIONED BY (dt STRING,
	model_version INTEGER,
	taxonomy_version INTEGER)
LOCATION 's3://wish-tahoe-derived-us-west-2/sweeper_dev/query_top_3_humanlabels_v4'
2023-03-15 14:17:53,371 INFO: The query returned no records.
2023-03-15 14:17:53,774 INFO: USE `default`
2023-03-15 14:17:54,662 INFO: 
    CREATE TABLE sweeper_dev.query_top_3_humanlabels_v4_temp (
    query STRING,
	categories STRING,
	category_names STRING,
	weights STRING,
	dt STRING,
	model_version INTEGER,
	taxonomy_version INTEGER
    )
    
STORED AS PARQUET
LOCATION 's3://wish-tahoe-import-us-west-2/sweeper_dev/query_to

In [17]:

# Upload the frame as one Parquet file under the temp table's S3 prefix.
# The key is derived from db / temp_test_table, like the prefix passed to
# drop_external_table, so renaming the table cannot leave this path stale.
upload_df_to_parquet(
    df_data,
    s3_bucket=temp_bucket,
    s3_key=f"{db}/{temp_test_table['name']}/data.parquet",
)


2023-03-15 14:18:07,544 INFO: Uploading DF (13448, 7) records to sweeper_dev/query_top_3_humanlabels_v4_temp/data.parquet.
2023-03-15 14:18:08,422 INFO: Uploaded DF (13448, 7) to sweeper_dev/query_top_3_humanlabels_v4_temp/data.parquet.


In [18]:

# Sanity check: count the rows visible through the temp table after the
# Parquet upload; this is expected to match len(df_data).
q = f"SELECT COUNT(*) FROM {db}.{temp_test_table['name']}"
execute_async(q)

2023-03-15 14:18:10,393 INFO: SELECT COUNT(*) FROM sweeper_dev.query_top_3_humanlabels_v4_temp


[(13448,)]

In [19]:
# Copy data from the temp table to the permanent table.
# The partition fields (dt, model_version, taxonomy_version) must come last in
# the SELECT statement, in the same order as test_table["partitions"].
# INSERT INTO appends, so re-running this cell alone adds the rows again.
q = f"""
INSERT INTO {db}.{test_table['name']}
SELECT query, categories, category_names, weights, dt, model_version, taxonomy_version
FROM {db}.{temp_test_table['name']}
"""
execute_async(q)

2023-03-15 14:18:16,808 INFO: 
INSERT INTO sweeper_dev.query_top_3_humanlabels_v4
SELECT query, categories, category_names, weights, dt, model_version, taxonomy_version
FROM sweeper_dev.query_top_3_humanlabels_v4_temp



[(13448,)]

In [20]:
# Sanity check: count the rows in the permanent table after the insert.
q = f"SELECT COUNT(*) FROM {db}.{test_table['name']}"
execute_async(q)

2023-03-15 14:18:22,550 INFO: SELECT COUNT(*) FROM sweeper_dev.query_top_3_humanlabels_v4


[(13448,)]

In [21]:
# Optional cleanup: drop the temp (external) table and, because
# delete_files=True, also delete its files under the given S3 bucket/prefix.
drop_external_table(
    table_name=temp_test_table["name"],
    db=db,
    s3_bucket=temp_bucket,
    s3_prefix=f'{db}/{temp_test_table["name"]}',
    delete_files=True,
)

2023-03-15 14:18:30,581 INFO: USE `default`
2023-03-15 14:18:31,425 INFO: 
    DROP TABLE IF EXISTS sweeper_dev.query_top_3_humanlabels_v4_temp
    
2023-03-15 14:18:32,269 INFO: The query returned no records.
2023-03-15 14:18:32,271 INFO: Dropped sweeper_dev.query_top_3_humanlabels_v4_temp
2023-03-15 14:18:32,807 INFO: Files in 's3://wish-tahoe-import-us-west-2/sweeper_dev/query_top_3_humanlabels_v4_temp are deleted.
