In [44]:
from dotenv import load_dotenv
import boto3
import os
import polars as pl
from scripts.utils import download_object, get_all_objects

In [2]:
# Load settings from a .env file into the environment; later cells read
# DUCKDB_S3_ENDPOINT via os.getenv. The call's return value is the cell output.
load_dotenv()

True

In [3]:
# Build the S3 client against the custom endpoint configured in the environment.
# Fail fast with a descriptive error when the variable is missing: concatenating
# 'https://' with None would otherwise raise an opaque TypeError.
s3_endpoint_host = os.getenv('DUCKDB_S3_ENDPOINT')
if not s3_endpoint_host:
    raise RuntimeError(
        'DUCKDB_S3_ENDPOINT is not set; define it in the environment or the .env file'
    )

s3 = boto3.client(
    's3',
    endpoint_url=f'https://{s3_endpoint_host}',
)

In [4]:
# List objects in the bucket with a single list_objects_v2 request.
# NOTE(review): per the boto3 docs one request returns at most 1,000 keys, so
# `objects` is presumably only the first page of the bucket — confirm before
# treating it as a complete listing.
objects = s3.list_objects_v2(Bucket='archiva-apagones')
# objects  (uncomment to inspect the raw response dict)

'./archiva-apagones/regions_without_service/2023-12-18'

In [47]:
# Mirror every bucket object under `prefix` into a local directory, downloading
# only the objects that are not already present locally.

# Parameters: local mirror root and the key prefix to synchronise
local_bucket_path = './archiva-apagones'
prefix = 'regions_without_service'

# Create the local mirror root, if not already there
os.makedirs(local_bucket_path, exist_ok=True)

# Keys of the objects under the prefix in the bucket
obj_list = list(get_all_objects(s3, prefix))
bucket_obj_keys = [obj['Key'] for obj in obj_list]

# Keys of the files already mirrored locally. Each key is the file path relative
# to the mirror root; relpath strips exactly the leading root (a plain string
# replace would also hit later occurrences of the same substring), and the
# separators are normalised to '/' so the keys compare equal to S3 keys on any OS.
local_obj_keys = []
for root, dirs, files in os.walk(local_bucket_path):
    for file in files:
        relative_path = os.path.relpath(os.path.join(root, file), local_bucket_path)
        local_obj_keys.append(relative_path.replace(os.sep, '/'))

# Determine which (prefixed) objects are missing locally
missing_keys = set(bucket_obj_keys).difference(local_obj_keys)
print(f'{len(missing_keys)} objects missing locally')

# Download missing objects in sorted order so reruns are deterministic
for obj_key in sorted(missing_keys):
    local_path = os.path.join(local_bucket_path, obj_key)
    # Make sure the destination folder exists; this is a no-op if it already
    # does (or if download_object creates it itself).
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    download_object(s3, obj_key, local_path)

63 objects missing locally
Downloading regions_without_service/2024-01-15/regions_without_service__2024-01-15T19-40-16-0400.json to ./archiva-apagones/regions_without_service/2024-01-15/regions_without_service__2024-01-15T19-40-16-0400.json
Downloading regions_without_service/2024-01-15/regions_without_service__2024-01-15T19-25-15-0400.json to ./archiva-apagones/regions_without_service/2024-01-15/regions_without_service__2024-01-15T19-25-15-0400.json
Downloading regions_without_service/2024-01-15/regions_without_service__2024-01-15T19-00-18-0400.json to ./archiva-apagones/regions_without_service/2024-01-15/regions_without_service__2024-01-15T19-00-18-0400.json
Downloading regions_without_service/2024-01-15/regions_without_service__2024-01-15T17-35-13-0400.json to ./archiva-apagones/regions_without_service/2024-01-15/regions_without_service__2024-01-15T17-35-13-0400.json
Downloading regions_without_service/2024-01-15/regions_without_service__2024-01-15T18-30-15-0400.json to ./archiva-ap

In [34]:


# Get list of objects locally

        

# local_obj_keys

{'regions_without_service/2024-01-15/regions_without_service__2024-01-15T16-30-15-0400.json',
 'regions_without_service/2024-01-15/regions_without_service__2024-01-15T16-35-14-0400.json',
 'regions_without_service/2024-01-15/regions_without_service__2024-01-15T16-40-15-0400.json'}

In [35]:
# Peek at the first entry of the listing to see which fields each object carries.
# The slice limits the loop to at most one item, so nothing is printed when the
# listing is empty.
for obj in objects['Contents'][:1]:
    print(obj)

{'Key': 'genera/data_source/2023-12-26/genera_data_source__2023-12-26T21-48-54-0400.js', 'LastModified': datetime.datetime(2023, 12, 27, 1, 48, 55, 975000, tzinfo=tzutc()), 'ETag': '"cc75570b565784e1e73e04eeef2a323c"', 'Size': 10311, 'StorageClass': 'STANDARD'}


In [14]:
# Show the metadata of every object under the 'regions_without_service' prefix as a table.
# NOTE(review): the sync cell calls get_all_objects(s3, prefix) with the client as the
# first argument, while this call passes only prefix= — one of the two is presumably
# stale; confirm against scripts.utils before a Restart & Run All.
pl.DataFrame(get_all_objects(prefix='regions_without_service'))

Key,LastModified,ETag,Size,StorageClass
str,datetime[μs],str,i64,str
"""regions_withou…",2023-12-19 03:32:04.204,"""""eccf2f1e0180a…",1947,"""STANDARD"""
"""regions_withou…",2023-12-19 03:52:43.471,"""""f6cad08fa8afd…",1945,"""STANDARD"""
"""regions_withou…",2023-12-19 03:56:59.371,"""""d152fb66669bb…",1945,"""STANDARD"""
"""regions_withou…",2023-12-19 04:44:08.715,"""""5455ed8fb0e3d…",1948,"""STANDARD"""
"""regions_withou…",2023-12-19 04:54:11.658,"""""49dcf20caa8a9…",1953,"""STANDARD"""
"""regions_withou…",2023-12-19 13:57:58.433,"""""183f612a92c7e…",1956,"""STANDARD"""
"""regions_withou…",2023-12-19 14:05:32.201,"""""440270f3b6742…",1956,"""STANDARD"""
"""regions_withou…",2023-12-19 14:23:48.494,"""""4769815d33432…",1954,"""STANDARD"""
"""regions_withou…",2023-12-19 14:28:23.457,"""""9a62e847f01fc…",1954,"""STANDARD"""
"""regions_withou…",2023-12-19 14:30:20.426,"""""a92f0297045a7…",1954,"""STANDARD"""


In [12]:
# Tag every object with the dataset it belongs to:
#   * keys starting with 'genera' -> first two path segments joined with '/'
#   * every other key             -> first path segment only
# NOTE(review): get_all_objects() is called without arguments here but as
# get_all_objects(s3, prefix) in the sync cell — confirm the current signature.
key_segments = pl.col('Key').str.split('/')
dataset_expr = (
    pl.when(pl.col('Key').str.starts_with('genera'))
    .then(key_segments.list.slice(0, 2).list.join('/'))
    .otherwise(key_segments.list.get(0))
    .alias('dataset')
)

objects_df = pl.DataFrame(get_all_objects()).with_columns([dataset_expr])
objects_df

Key,LastModified,ETag,Size,StorageClass,dataset
str,datetime[μs],str,i64,str,str
"""genera/data_so…",2023-12-27 01:48:55.975,"""""cc75570b56578…",10311,"""STANDARD""","""genera/data_so…"
"""genera/data_so…",2023-12-27 02:35:43.279,"""""29739f74bbdb6…",10305,"""STANDARD""","""genera/data_so…"
"""genera/data_so…",2023-12-27 02:40:17.820,"""""fb81ee8ba448e…",10304,"""STANDARD""","""genera/data_so…"
"""genera/data_so…",2023-12-27 02:45:15.713,"""""2ffe81b9af089…",10300,"""STANDARD""","""genera/data_so…"
"""genera/data_so…",2023-12-27 02:50:16.624,"""""4609efd807273…",10298,"""STANDARD""","""genera/data_so…"
"""genera/data_so…",2023-12-27 02:55:13.952,"""""fefcf9dfd4089…",10297,"""STANDARD""","""genera/data_so…"
"""genera/data_so…",2023-12-27 03:00:17.059,"""""9dc57786f9846…",10299,"""STANDARD""","""genera/data_so…"
"""genera/data_so…",2023-12-27 03:05:19.057,"""""19e51d1cbc1b3…",10298,"""STANDARD""","""genera/data_so…"
"""genera/data_so…",2023-12-27 03:10:17.064,"""""f34eedd77e21d…",10298,"""STANDARD""","""genera/data_so…"
"""genera/data_so…",2023-12-27 03:15:17.183,"""""e6fd08df8aedb…",10300,"""STANDARD""","""genera/data_so…"


In [8]:
# Per-dataset storage summary: object count, total bytes, and derived size metrics.
# 288 is presumably the number of snapshots per day (24 h x 12 five-minute
# snapshots per hour, matching the 5-minute spacing of the object timestamps) — TODO confirm.
snapshots_per_day = 288
total_bytes = pl.col('Size')
avg_object_bytes = total_bytes / pl.col('count')

dataset_totals = objects_df.group_by('dataset').agg([
    pl.count(),
    pl.sum('Size'),
])

(
    dataset_totals
    .with_columns([
        (total_bytes / 1e6).alias('Size (MB)'),
        avg_object_bytes.round().alias('Avg. Size'),
        (avg_object_bytes * snapshots_per_day / 1e6).round(3).alias('Avg. Daily Size (MB)'),
    ])
    .sort('Size (MB)', descending=True)
).to_pandas()

Unnamed: 0,dataset,count,Size,Size (MB),Avg. Size,Avg. Daily Size (MB)
0,genera/data_source,5635,57756124,57.756124,10250.0,2.952
1,outage_towns,4815,16306378,16.306378,3387.0,0.975
2,regions_without_service,7770,15138822,15.138822,1948.0,0.561
3,penguins.csv,1,13478,0.013478,13478.0,3.882
4,regions_without_service.json,1,1947,0.001947,1947.0,0.561
