In [1]:
import os
import botocore
import boto3
import boto3.s3.transfer as s3transfer
import time
import duckdb
from dotenv import load_dotenv
from scripts.utils import download_object, get_all_objects

# Para probar que podamos conectarnos al bucket correctamente

In [2]:
load_dotenv()  # take environment variables from .env.

True

In [3]:
con = duckdb.connect('databases/sample.db')
# con.close() # to terminate connection

In [4]:
botocore_config = botocore.config.Config(max_pool_connections=20)
s3 = boto3.client('s3',
    endpoint_url='https://'+os.getenv('DUCKDB_S3_ENDPOINT'),
    config=botocore_config
)

transfer_config = s3transfer.TransferConfig(
    use_threads=True,
    max_concurrency=20,
)

s3t = s3transfer.create_transfer_manager(s3, transfer_config)

In [5]:
print('Probando desde duckdb:')
res = duckdb.sql(
'''
select *
from read_csv_auto('s3://archiva-apagones/penguins.csv')   
''')
print(res)

Probando desde duckdb:
┌─────────┬───────────┬────────────────┬───────────────┬───────────────────┬─────────────┬─────────┐
│ species │  island   │ bill_length_mm │ bill_depth_mm │ flipper_length_mm │ body_mass_g │   sex   │
│ varchar │  varchar  │     double     │    double     │       int64       │    int64    │ varchar │
├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼─────────┤
│ Adelie  │ Torgersen │           39.1 │          18.7 │               181 │        3750 │ MALE    │
│ Adelie  │ Torgersen │           39.5 │          17.4 │               186 │        3800 │ FEMALE  │
│ Adelie  │ Torgersen │           40.3 │          18.0 │               195 │        3250 │ FEMALE  │
│ Adelie  │ Torgersen │           NULL │          NULL │              NULL │        NULL │ NULL    │
│ Adelie  │ Torgersen │           36.7 │          19.3 │               193 │        3450 │ FEMALE  │
│ Adelie  │ Torgersen │           39.3 │          20.6 │            

In [6]:
# Parameters for function
local_bucket_path = './archiva-apagones' # Probs more useful as env var
prefix = 'regions_without_service'

# Create local bucket path, if not already there
os.makedirs(local_bucket_path, exist_ok=True)

# Get list of objects in bucket
obj_list = list(get_all_objects(s3, prefix))
bucket_obj_keys = [obj['Key'] for obj in obj_list]

# Get list of objects locally
local_obj_keys = []
for root, dirs, files in os.walk(local_bucket_path):
    for file in files:
        # Add to local_obj_list but strip off the local_bucket_path
        obj_key = os.path.join(root, file).replace(local_bucket_path, '')[1:]
        local_obj_keys.append(obj_key)

# Determine which (prefixed) objects are missing locally
missing_keys = set(bucket_obj_keys).difference(set(local_obj_keys))
print(f'{len(missing_keys)} objects missing locally')



767 objects missing locally


In [7]:
# Download missing objects
start_time = time.time()
for obj_key in missing_keys:
    local_path = os.path.join(local_bucket_path, obj_key)
    download_object(s3t, obj_key, local_path, verbose=True)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Downloading regions_without_service/2024-01-16/regions_without_service__2024-01-16T19-45-18-0400.json to ./archiva-apagones/regions_without_service/2024-01-16/regions_without_service__2024-01-16T19-45-18-0400.json
Downloading regions_without_service/2024-01-16/regions_without_service__2024-01-16T15-20-16-0400.json to ./archiva-apagones/regions_without_service/2024-01-16/regions_without_service__2024-01-16T15-20-16-0400.json
Downloading regions_without_service/2024-01-15/regions_without_service__2024-01-15T15-00-14-0400.json to ./archiva-apagones/regions_without_service/2024-01-15/regions_without_service__2024-01-15T15-00-14-0400.json
Downloading regions_without_service/2024-01-15/regions_without_service__2024-01-15T04-15-15-0400.json to ./archiva-apagones/regions_without_service/2024-01-15/regions_without_service__2024-01-15T04-15-15-0400.json
Downloading regions_without_service/2024-01-16/regions_without_service__2024-01-16T21-20-14-0400.json to ./archiva-apagones/regions_without_serv

In [8]:
con.execute(
'''
create or replace table raw_regions_without_service as (
    select *
    -- from read_json('s3://archiva-apagones/regions_without_service/2023-12-19/*.json', filename=true, auto_detect=true, format='auto') 
    from read_json('./archiva-apagones/regions_without_service/*/*.json', filename=true, auto_detect=true, format='auto') 
)
''')
# select * from read_json('samples/regions_without_service/*.json', filename=true, auto_detect=true, format='auto')

<duckdb.duckdb.DuckDBPyConnection at 0x106298130>

In [9]:
# con.close()

In [10]:
duckdb.sql(
'''
describe
    select
        *
    from raw_regions_without_service
''')

CatalogException: Catalog Error: Table with name raw_regions_without_service does not exist!
Did you mean "pg_views"?
LINE 5:     from raw_regions_without_service
                 ^

In [None]:
"%Y-%m-%dT%H-%M-%S%z"
duckdb.sql(
f'''
    select
        strptime("timestamp", '%m/%d/%Y %I:%M %p') as "marca_hora_presentada",
        -- totals.totalClients as clientes,
        -- totals.totalClientsWithService as clientes_con_servicio,
        -- totals.totalClientsWithoutService as clientes_sin_servicio,
        filename
            .string_split('__')[2]
            .regexp_extract('(.*).json', 1)
            .strptime('%Y-%m-%dT%H-%M-%S%z')
            ::TIMESTAMP -- drop timezone
            as "marca_hora_accedida",
        regions,
        totals,
        filename
            .string_split('{local_bucket_path}')[2]
            .ltrim('/')
            as object_key
        , -- TODO: Check que sea versatil para local y github actions workflow
        
    from raw_regions_without_service
''')

┌──────────────────────┬─────────────────────┬──────────────────────┬──────────────────────┬───────────────────────────┐
│ marca_hora_present…  │ marca_hora_accedida │       regions        │        totals        │        object_key         │
│      timestamp       │      timestamp      │ struct("name" varc…  │ struct(totalclient…  │          varchar          │
├──────────────────────┼─────────────────────┼──────────────────────┼──────────────────────┼───────────────────────────┤
│ 2023-12-25 11:50:00  │ 2023-12-25 11:50:18 │ [{'name': Arecibo,…  │ {'totalClients': 1…  │ regions_without_service…  │
│ 2023-12-25 05:25:00  │ 2023-12-25 05:25:18 │ [{'name': Arecibo,…  │ {'totalClients': 1…  │ regions_without_service…  │
│ 2023-12-25 13:30:00  │ 2023-12-25 13:30:20 │ [{'name': Arecibo,…  │ {'totalClients': 1…  │ regions_without_service…  │
│ 2023-12-25 21:30:00  │ 2023-12-25 21:30:18 │ [{'name': Arecibo,…  │ {'totalClients': 1…  │ regions_without_service…  │
│ 2023-12-25 02:55:00  │ 2023-12

In [None]:
duckdb.sql(
'''
describe
    select
        regions[1]
    from raw_regions_without_service
''')

┌─────────────┬────────────────────────────────────────────────────────────────┬─────────┬─────────┬─────────┬─────────┐
│ column_name │                          column_type                           │  null   │   key   │ default │  extra  │
│   varchar   │                            varchar                             │ varchar │ varchar │ varchar │ varchar │
├─────────────┼────────────────────────────────────────────────────────────────┼─────────┼─────────┼─────────┼─────────┤
│ regions[1]  │ STRUCT("name" VARCHAR, percentageClientsWithService DOUBLE, …  │ YES     │ NULL    │ NULL    │ NULL    │
└─────────────┴────────────────────────────────────────────────────────────────┴─────────┴─────────┴─────────┴─────────┘

In [None]:
duckdb.sql(
'''
    select
        regions[1],
        "timestamp",
        totals,
        filename
    from raw_regions_without_service
''')

┌──────────────────────┬─────────────────────┬──────────────────────┬──────────────────────────────────────────────────┐
│      regions[1]      │      timestamp      │        totals        │                     filename                     │
│ struct("name" varc…  │       varchar       │ struct(totalclient…  │                     varchar                      │
├──────────────────────┼─────────────────────┼──────────────────────┼──────────────────────────────────────────────────┤
│ {'name': Arecibo, …  │ 12/19/2023 12:40 AM │ {'totalClients': 1…  │ s3://archiva-apagones/regions_without_service/…  │
│ {'name': Arecibo, …  │ 12/19/2023 12:50 AM │ {'totalClients': 1…  │ s3://archiva-apagones/regions_without_service/…  │
│ {'name': Arecibo, …  │ 12/19/2023 09:55 AM │ {'totalClients': 1…  │ s3://archiva-apagones/regions_without_service/…  │
│ {'name': Arecibo, …  │ 12/19/2023 10:05 AM │ {'totalClients': 1…  │ s3://archiva-apagones/regions_without_service/…  │
│ {'name': Arecibo, …  │ 12/19/2