In [76]:
from dotenv import load_dotenv
import boto3
import os
import polars as pl
from scripts.utils import download_object, get_all_objects

In [77]:
# Load variables from a local .env file into the process environment so the
# os.getenv() lookup used when building the S3 client can find them.
load_dotenv()

True

In [78]:
# Build the S3 client against the custom endpoint named in the environment.
# Check the variable explicitly: concatenating 'https://' with a missing
# (None) value would otherwise fail with an opaque TypeError.
s3_endpoint = os.getenv('DUCKDB_S3_ENDPOINT')
if not s3_endpoint:
    raise RuntimeError(
        'DUCKDB_S3_ENDPOINT is not set; define it in .env or the environment'
    )

s3 = boto3.client(
    's3',
    endpoint_url=f'https://{s3_endpoint}',
)

In [79]:
# Fetch a single listing response for the bucket to inspect its structure.
# NOTE: one list_objects_v2 call returns at most 1,000 keys, so this is only
# the first page of the bucket; the complete listing used for the analysis
# comes from get_all_objects(s3) in a later cell.
objects = s3.list_objects_v2(Bucket='archiva-apagones')
# (evaluate `objects` here to view the raw response)

In [80]:
# Peek at the first listed object to see which fields each entry carries.
# The response has no 'Contents' key when the listing is empty, so default to
# an empty list rather than raising KeyError; the slice replaces loop + break.
for obj in objects.get('Contents', [])[:1]:
    print(obj)

{'Key': 'genera/data_source/2023-12-26/genera_data_source__2023-12-26T21-48-54-0400.js', 'LastModified': datetime.datetime(2023, 12, 27, 1, 48, 55, 975000, tzinfo=tzutc()), 'ETag': '"cc75570b565784e1e73e04eeef2a323c"', 'Size': 10311, 'StorageClass': 'STANDARD'}


In [81]:
# Derive a `dataset` label from each object key:
#   - keys starting with 'genera' keep their first two path segments,
#   - every other key keeps only its first path segment.
key_segments = pl.col('Key').str.split('/')
is_genera_key = pl.col('Key').str.starts_with('genera')
dataset_label = (
    pl.when(is_genera_key)
    .then(key_segments.list.slice(0, 2).list.join('/'))
    .otherwise(key_segments.list.get(0))
)

# One row per object returned by get_all_objects, plus the derived label.
objects_listing = pl.DataFrame(get_all_objects(s3))
objects_df = objects_listing.with_columns(dataset_label.alias('dataset'))
objects_df

Key,LastModified,ETag,Size,StorageClass,dataset
str,datetime[μs],str,i64,str,str
"""genera/data_so…",2023-12-27 01:48:55.975,"""""cc75570b56578…",10311,"""STANDARD""","""genera/data_so…"
"""genera/data_so…",2023-12-27 02:35:43.279,"""""29739f74bbdb6…",10305,"""STANDARD""","""genera/data_so…"
"""genera/data_so…",2023-12-27 02:40:17.820,"""""fb81ee8ba448e…",10304,"""STANDARD""","""genera/data_so…"
"""genera/data_so…",2023-12-27 02:45:15.713,"""""2ffe81b9af089…",10300,"""STANDARD""","""genera/data_so…"
"""genera/data_so…",2023-12-27 02:50:16.624,"""""4609efd807273…",10298,"""STANDARD""","""genera/data_so…"
"""genera/data_so…",2023-12-27 02:55:13.952,"""""fefcf9dfd4089…",10297,"""STANDARD""","""genera/data_so…"
"""genera/data_so…",2023-12-27 03:00:17.059,"""""9dc57786f9846…",10299,"""STANDARD""","""genera/data_so…"
"""genera/data_so…",2023-12-27 03:05:19.057,"""""19e51d1cbc1b3…",10298,"""STANDARD""","""genera/data_so…"
"""genera/data_so…",2023-12-27 03:10:17.064,"""""f34eedd77e21d…",10298,"""STANDARD""","""genera/data_so…"
"""genera/data_so…",2023-12-27 03:15:17.183,"""""e6fd08df8aedb…",10300,"""STANDARD""","""genera/data_so…"


In [82]:
# Per-dataset object count and total bytes, plus derived size columns.
#
# The daily estimate multiplies the average object size by 288, i.e. one
# object every 5 minutes (24 * 60 / 5). That cadence matches the LastModified
# timestamps shown for genera/data_source; presumably the other scraped
# datasets follow it too -- TODO confirm. For one-off files (count == 1) the
# daily figure is not meaningful.
SNAPSHOTS_PER_DAY = 24 * 60 // 5
BYTES_PER_MB = 1e6

# Average bytes per object within a dataset (valid after aggregation).
avg_object_bytes = pl.col('Size') / pl.col('count')

per_dataset_totals = objects_df.group_by('dataset').agg(
    pl.count(),
    pl.col('Size').sum(),
)

summary_df = per_dataset_totals.with_columns(
    (pl.col('Size') / BYTES_PER_MB).alias('Size (MB)'),
    avg_object_bytes.round().alias('Avg. Size'),
    (avg_object_bytes * SNAPSHOTS_PER_DAY / BYTES_PER_MB)
    .round(3)
    .alias('Avg. Daily Size (MB)'),
).sort('Size (MB)', descending=True)

summary_df.to_pandas()

Unnamed: 0,dataset,count,Size,Size (MB),Avg. Size,Avg. Daily Size (MB)
0,genera/data_source,6248,64051280,64.05128,10251.0,2.952
1,outage_towns,5428,18540233,18.540233,3416.0,0.984
2,regions_without_service,8383,16333211,16.333211,1948.0,0.561
3,penguins.csv,1,13478,0.013478,13478.0,3.882
4,regions_without_service.json,1,1947,0.001947,1947.0,0.561


In [83]:
# Print a label, then display the sum of the per-dataset sizes in MB.
print('Total bucket size:')
summary_df.select(pl.sum('Size (MB)'))

Total bucket size:


Size (MB)
f64
98.940149
