In [22]:
import pandas as pd 
import dvc.api 
from tqdm import tqdm

In [23]:
# Resolve the DVC-tracked prediction dump to a URL and open it lazily: with
# `chunksize` set, `read_json` returns a reader that yields DataFrames of up
# to 1000 JSON lines each instead of one big frame.
# NOTE: the reader is consumed by iteration -- re-run this cell before
# iterating over `df_preds` a second time.
preds_url = dvc.api.get_url(
    'modelling/notebooks/quali_analyze_queryclassify/tahoe_20230311_v2v3_res.json',
    repo='git@github.com:ContextLogic/multitask-llm-rnd.git',
)
df_preds = pd.read_json(preds_url, lines=True, chunksize=1000)

In [24]:
# Load the whole taxonomy file (JSON-lines) from its DVC-resolved URL into a
# single in-memory DataFrame.
tax_url = dvc.api.get_url(
    'datasets/data/taxonomy/wish_newtax.json',
    repo='git@github.com:ContextLogic/multitask-llm-rnd.git',
)
df_tax = pd.read_json(tax_url, lines=True)

In [25]:
# Build two lookups keyed by the lower-cased category path:
#   tax2id   -> taxonomy id, stored as a string
#   tax2name -> leaf name, i.e. the last ' > '-separated segment (original casing)
# Rows with an empty category_path are skipped.
tax2id = {}
tax2name = {}
for tax_row in df_tax.to_dict('records'):
    cat_path = tax_row['category_path']
    if len(cat_path) == 0:
        continue
    cat_key = cat_path.lower()
    tax2id[cat_key] = str(tax_row['id'])
    tax2name[cat_key] = cat_path.split(' > ')[-1]

In [26]:
# Flatten every prediction record into one upload row whose category ids,
# leaf names and weights are comma-joined strings in prediction order.
upload_recs = []
for df_pred in tqdm(df_preds):
    for rec in df_pred.to_dict('records'):
        # (id, name, weight) triples for the predictions we keep.
        kept = []
        for cat, wt in rec['preds_v3']:
            if cat in tax2id:
                kept.append((tax2id[cat], tax2name[cat], str(wt)))
                continue
            # The only label allowed to be missing from the taxonomy is
            # 'unknown'; once it shows up, the remaining predictions of this
            # record are dropped as well.
            assert cat == 'unknown'
            break
        upload_recs.append({
            'query': rec['query'],
            'categories': ','.join(cat_id for cat_id, _, _ in kept),
            'category_names': ','.join(cat_name for _, cat_name, _ in kept),
            'weights': ','.join(weight for _, _, weight in kept),
            # Partition values of the destination table, constant for this upload.
            'dt': '2023-03-14',
            'model_version': 3,
            'taxonomy_version': 121
        })
df_upload = pd.DataFrame(upload_recs)

6628it [38:26,  2.87it/s]


In [27]:
# Spot-check two random rows of the upload frame (no random_state is passed,
# so the rows shown differ from run to run).
df_upload.sample(2)

Unnamed: 0,query,categories,category_names,weights,dt,model_version,taxonomy_version
6576261,alexa türschloss,3583354546554653358735903544465747034700,"Door Locks,Smart Home Controls,Electric Lock,D...","0.4993504286,0.2808485329,0.0398609601,0.02397...",2023-03-14,3,121
4283251,watches black and gold,599459725969599359755974599259762665,"Women's Watches,Men's Watches,Watches,Women's ...","0.8134278655,0.6058135629,0.0718427449,0.07124...",2023-03-14,3,121


In [28]:
from tahoe import create_table, create_external_table, drop_external_table, execute_async
from s3 import temp_bucket, get_s3_file_keys, upload_df_to_parquet
import pandas as pd
import logging
# Root logger at INFO with a "timestamp LEVEL: message" layout, so INFO-level
# messages logged during the upload steps show up in the cell output.
LOG_FORMAT = '%(asctime)s %(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

In [29]:
# Alias only (no copy): df_data and df_upload refer to the same DataFrame object.
df_data = df_upload

In [31]:
# Number of rows about to be uploaded; use it as the reference value for the
# COUNT(*) checks run against the tables after the upload.
len(df_data)

6627925

In [30]:
# Recreate the permanent, partitioned destination table and the external
# staging ("temp") table that the uploaded parquet file is read through.
db = "sweeper_dev"

# Permanent table: four data columns, partitioned by date, model version and
# taxonomy version.
test_table = {
    "name": "query_top_10_prediction_v3",
    "columns": [
        {"name": "query", "type": "STRING"},
        {"name": "categories", "type": "STRING"},
        {"name": "category_names", "type": "STRING"},
        {"name": "weights", "type": "STRING"},
    ],
    "partitions": [
        {"name": "dt", "type": "STRING"},
        {"name": "model_version", "type": "INTEGER"},
        {"name": "taxonomy_version", "type": "INTEGER"},
    ]
}

# IF EXISTS makes the drop a no-op when the table has never been created, so
# the cell does not rely on engine configuration to survive a first run; it
# also matches the `DROP TABLE IF EXISTS` issued by drop_external_table.
q = f"DROP TABLE IF EXISTS {db}.{test_table['name']}"
execute_async(q, engine="hive")
create_table(table_definition=test_table, db=db)

# Staging table: same fields, but the partition fields are ordinary trailing
# columns. Deriving the list from `test_table` keeps both schemas (and the
# column order the later INSERT ... SELECT relies on) in sync. The dicts are
# copied so the two definitions never share mutable entries.
temp_test_table = {
    "name": "query_top_10_prediction_v3_temp",
    "columns": [
        dict(col) for col in test_table["columns"] + test_table["partitions"]
    ],
}

create_external_table(
    table_name=temp_test_table["name"],
    table_definition=temp_test_table,
    db=db,
    bucket=temp_bucket
)


2023-03-14 23:48:19,926 INFO: USE `default`
2023-03-14 23:48:20,927 INFO: DROP TABLE sweeper_dev.query_top_10_prediction_v3
2023-03-14 23:48:21,703 INFO: The query returned no records.
2023-03-14 23:48:22,062 INFO: USE `default`
2023-03-14 23:48:22,943 INFO: 
    CREATE TABLE sweeper_dev.query_top_10_prediction_v3 (
    query STRING,
	categories STRING,
	category_names STRING,
	weights STRING
    )
    PARTITIONED BY (dt STRING,
	model_version INTEGER,
	taxonomy_version INTEGER)
LOCATION 's3://wish-tahoe-derived-us-west-2/sweeper_dev/query_top_10_prediction_v3'
2023-03-14 23:48:23,766 INFO: The query returned no records.
2023-03-14 23:48:24,158 INFO: USE `default`
2023-03-14 23:48:25,047 INFO: 
    CREATE TABLE sweeper_dev.query_top_10_prediction_v3_temp (
    query STRING,
	categories STRING,
	category_names STRING,
	weights STRING,
	dt STRING,
	model_version INTEGER,
	taxonomy_version INTEGER
    )
    
STORED AS PARQUET
LOCATION 's3://wish-tahoe-import-us-west-2/sweeper_dev/query_to

In [32]:

# Upload the frame as one parquet file under the staging table's S3 prefix.
# The key is derived from `db` and the staging table definition (the same
# '{db}/{table}' prefix the cleanup cell deletes) instead of being typed out a
# second time, so renaming the table cannot send the file to a stale location.
upload_df_to_parquet(
    df_data,
    s3_bucket=temp_bucket,
    s3_key=f"{db}/{temp_test_table['name']}/data.parquet",
)


2023-03-14 23:48:40,682 INFO: Uploading DF (6627925, 7) records to sweeper_dev/query_top_10_prediction_v3_temp/data.parquet.
2023-03-14 23:49:25,062 INFO: Uploaded DF (6627925, 7) to sweeper_dev/query_top_10_prediction_v3_temp/data.parquet.


In [33]:

# Row count of the staging table; it should equal the number of uploaded rows.
temp_table_ref = f"{db}.{temp_test_table['name']}"
q = f"SELECT COUNT(*) FROM {temp_table_ref}"
execute_async(q)

2023-03-14 23:49:25,190 INFO: SELECT COUNT(*) FROM sweeper_dev.query_top_10_prediction_v3_temp


[(6627925,)]

In [35]:
# Move the staged rows into the permanent table.
# The partition fields (dt, model_version, taxonomy_version) must be the
# last columns of the SELECT list.
target_table = f"{db}.{test_table['name']}"
source_table = f"{db}.{temp_test_table['name']}"
q = f"""
INSERT INTO {target_table}
SELECT query, categories, category_names, weights, dt, model_version, taxonomy_version
FROM {source_table}
"""
execute_async(q)

2023-03-14 23:52:13,157 INFO: 
INSERT INTO sweeper_dev.query_top_10_prediction_v3
SELECT query, categories, category_names, weights, dt, model_version, taxonomy_version
FROM sweeper_dev.query_top_10_prediction_v3_temp



[(6627925,)]

In [36]:
# Row count of the permanent table after the insert; compare it with the
# staging-table count to confirm nothing was lost.
final_table_ref = f"{db}.{test_table['name']}"
q = f"SELECT COUNT(*) FROM {final_table_ref}"
execute_async(q)

2023-03-14 23:52:53,288 INFO: SELECT COUNT(*) FROM sweeper_dev.query_top_10_prediction_v3


[(6627925,)]

In [37]:
# Optional clean-up: drop the external staging table and, because
# delete_files=True, also delete its files under the '{db}/{table}' S3 prefix.
staging_name = temp_test_table["name"]
drop_external_table(
    db=db,
    table_name=staging_name,
    s3_bucket=temp_bucket,
    s3_prefix=f'{db}/{staging_name}',
    delete_files=True,
)

2023-03-14 23:53:06,473 INFO: USE `default`
2023-03-14 23:53:07,276 INFO: 
    DROP TABLE IF EXISTS sweeper_dev.query_top_10_prediction_v3_temp
    
2023-03-14 23:53:08,076 INFO: The query returned no records.
2023-03-14 23:53:08,078 INFO: Dropped sweeper_dev.query_top_10_prediction_v3_temp
2023-03-14 23:53:08,568 INFO: Files in 's3://wish-tahoe-import-us-west-2/sweeper_dev/query_top_10_prediction_v3_temp are deleted.
