#### In this example, we upload some sample data to Tahoe. The steps are as follows:
1. Create an external table for the data import.
2. Upload a parquet file to the S3 location for the external table.
3. Copy the data from the external table to the permanent table.

This is common practice because Hive and Presto do not support bulk inserts through SQL.

In [2]:
from tahoe import create_table, create_external_table, drop_external_table, execute_async
from s3 import temp_bucket, get_s3_file_keys, upload_df_to_parquet
import pandas as pd
import logging
# Configure root logging at INFO level with a timestamped "<time> <level>: <message>" layout.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s: %(message)s",
)

In [3]:
# Load the sample listings from the local Parquet file.
df_data = pd.read_parquet(path="sample_data.parquet")
print(df_data.shape)  # (rows, columns)
# Preview the first five rows.
df_data.head(n=5)

(1000, 3)


Unnamed: 0,listing_id,name,dt
0,62453dd22bab6cf58c61673f,Animal Pattern Temporary Tattoo Sticker Men Wa...,2022-03-31
1,62453fce324cee04c8b55caa,Gaiters Women's Stylish Leg Armor Covers Acces...,2022-03-31
2,6245402b70350f2946d94eee,Fashion Korea Silver Plated Demon Eye Finger R...,2022-03-31
3,62454101594260e9d77169ff,Men's / Women's Print 3D Shorts Sports Casual ...,2022-03-31
4,62454192881d18861dcd9b6a,All the Presidents' Gardens: Madison S Cabbage...,2022-03-31


In [4]:
# Number of rows per `dt` value in the sample data.
df_data.loc[:, "dt"].value_counts()

2022-03-31    756
2022-04-01    244
Name: dt, dtype: int64

In [5]:
# Define the permanent (destination) table: two STRING data columns, partitioned by `dt`.
# The table itself is created in a later cell via `create_table`.
db = "sweeper_dev"
test_table = {
    "name": "listing_import_test_123",
    "columns": [
        {"name": column_name, "type": "STRING"}
        for column_name in ("listing_id", "name")
    ],
    "partitions": [{"name": "dt", "type": "STRING"}],
}



In [6]:
# Start from a clean slate before (re)creating the permanent table.
# IF EXISTS keeps this cell from failing on a first run, or on Restart & Run All,
# when the table has not been created yet.
q = f"DROP TABLE IF EXISTS {db}.{test_table['name']}"
execute_async(q, engine="hive")

2022-04-05 22:22:44,917 INFO: USE `default`
2022-04-05 22:22:45,266 INFO: DROP TABLE sweeper_dev.listing_import_test_123
2022-04-05 22:22:45,507 INFO: The query returned no records.


In [7]:
# Create the permanent table in `db` from the `test_table` definition.
create_table(db=db, table_definition=test_table)

2022-04-05 22:22:47,759 INFO: USE `default`
2022-04-05 22:22:48,125 INFO: 
    CREATE TABLE sweeper_dev.listing_import_test_123 (
    listing_id STRING,
	name STRING
    )
    PARTITIONED BY (dt STRING)
LOCATION 's3://wish-tahoe-derived-us-west-2/sweeper_dev/listing_import_test_123'
2022-04-05 22:22:48,595 INFO: The query returned no records.


In [8]:
# Define and create the temporary import (external) table.
# It is deliberately left unpartitioned (`dt` is a regular column here) to avoid the extra work of
# adding data to specific partitions and registering those partitions.
db = "sweeper_dev"
temp_test_table = {
    "name": "listing_import_test_123_temp",
    "columns": [
        {"name": column_name, "type": "STRING"}
        for column_name in ("listing_id", "name", "dt")
    ],
}

create_external_table(
    db=db,
    bucket=temp_bucket,
    table_name=temp_test_table["name"],
    table_definition=temp_test_table,
)

2022-04-05 22:22:52,303 INFO: USE `default`
2022-04-05 22:22:52,648 INFO: 
    CREATE TABLE sweeper_dev.listing_import_test_123_temp (
    listing_id STRING,
	name STRING,
	dt STRING
    )
    
STORED AS PARQUET
LOCATION 's3://wish-tahoe-import-us-west-2/sweeper_dev/listing_import_test_123_temp'
TBLPROPERTIES ('parquet.compression'='SNAPPY')
2022-04-05 22:22:52,949 INFO: The query returned no records.


In [9]:
# Upload the DataFrame as a Parquet file under the external table's S3 prefix.
# The key is built from `db` and the temp table name instead of being hard-coded, so it
# cannot drift from the table definition or from the prefix used when the table is dropped.
upload_df_to_parquet(
    df_data,
    s3_bucket=temp_bucket,
    s3_key=f'{db}/{temp_test_table["name"]}/data.parquet',
)

2022-04-05 22:22:58,477 INFO: Uploading DF (1000, 3) records to sweeper_dev/listing_import_test_123_temp/data.parquet.
2022-04-05 22:22:59,828 INFO: Uploaded DF (1000, 3) to sweeper_dev/listing_import_test_123_temp/data.parquet.


In [10]:
# Sanity check: count the rows visible through the external table after the upload.
q = f'SELECT COUNT(*) FROM {db}.{temp_test_table["name"]}'
execute_async(q)

2022-04-05 22:23:03,122 INFO: SELECT COUNT(*) FROM sweeper_dev.listing_import_test_123_temp


[(1000,)]

In [11]:
# Copy the rows from the temp table into the permanent table.
# The partition column (`dt`) must come last in the SELECT list.
q = (
    "\n"
    f"INSERT INTO {db}.{test_table['name']}\n"
    "SELECT listing_id, name, dt\n"
    f"FROM {db}.{temp_test_table['name']}\n"
)
execute_async(q)

2022-04-05 22:23:22,596 INFO: 
INSERT INTO sweeper_dev.listing_import_test_123
SELECT listing_id, name, dt
FROM sweeper_dev.listing_import_test_123_temp



[(1000,)]

In [12]:
# Verify the copy by counting the rows now in the permanent table.
q = f'SELECT COUNT(*) FROM {db}.{test_table["name"]}'
execute_async(q)

2022-04-05 22:23:37,516 INFO: SELECT COUNT(*) FROM sweeper_dev.listing_import_test_123


[(1001,)]

In [14]:
# Peek at up to ten rows of the permanent table.
q = f'SELECT * FROM {db}.{test_table["name"]} LIMIT 10'
execute_async(q)

2022-04-05 22:24:13,814 INFO: SELECT * FROM sweeper_dev.listing_import_test_123 LIMIT 10


[('62453dd22bab6cf58c61673f',
  'Animal Pattern Temporary Tattoo Sticker Men Waterproof Tattoo Sticker Accessory(HB-529 ) cvv',
  '2022-03-31'),
 ('62454101594260e9d77169ff',
  "Men's / Women's Print 3D Shorts Sports Casual Fashion",
  '2022-03-31'),
 ('62454192881d18861dcd9b6a',
  "All the Presidents' Gardens: Madison S Cabbages to Kennedy S Roses How the White House Grounds Have Grown with America (Hardcover)",
  '2022-03-31'),
 ('624541fd14cf5a580ba12140',
  '8PCS 25 * 25cm Candy Color Cotton Twill Fabric Hand DIY Splicing Cloth Pre Cut Squares Home Decoration Art 9.8 "* 9.8"',
  '2022-03-31'),
 ('6245439607d97dd8564ce612',
  '10 PCS Kabob Grilling Basket Reusable Durable -Corrosion Wooden Handle Barbecue Tool Grill Basket Grill Net',
  '2022-03-31'),
 ('624543b7d001ce4a7418aa5b',
  '3M Flexible EL Neon 10 Wire Light Colors Dance Party Decor Light',
  '2022-03-31'),
 ('6245449c67d08845b87e4077',
  '【With 1 x 5.0Ah Battery】Kamolee DUB185 Cordless Blower 20000rpm 3000W New Cordless Ha

In [16]:
# Optional clean-up: drop the external table and delete its files under the S3 prefix.
drop_external_table(
    table_name=temp_test_table["name"],
    db=db,
    s3_bucket=temp_bucket,
    s3_prefix=f'{db}/{temp_test_table["name"]}',
    delete_files=True,
)

2022-04-05 22:24:48,561 INFO: USE `default`
2022-04-05 22:24:48,895 INFO: 
    DROP TABLE IF EXISTS sweeper_dev.listing_import_test_123_temp
    
2022-04-05 22:24:49,415 INFO: The query returned no records.
2022-04-05 22:24:49,416 INFO: Dropped sweeper_dev.listing_import_test_123_temp
2022-04-05 22:24:50,355 INFO: Files in 's3://wish-tahoe-import-us-west-2/sweeper_dev/listing_import_test_123_temp are deleted.
