In [1]:
import pandas as pd
from utils import calc_data_size

# Load the UI request table; the columns used below are the year range,
# the lat/lon bounds and the requested temporal/spatial resolution.
df_ui = pd.read_csv("smart_ui.csv")

# Column names listed in the positional-argument order passed to calc_data_size.
_UI_SIZE_ARGS = [
    "start_year",
    "end_year",
    "max_lat",
    "min_lat",
    "max_lon",
    "min_lon",
    "temporal_resolution",
    "spatial_resolution",
]


def _ui_request_size(row):
    """Return calc_data_size(...) for one row of df_ui at its requested resolution."""
    return calc_data_size(*(row[col] for col in _UI_SIZE_ARGS))


# Row-wise evaluation: each row's fields are forwarded to calc_data_size.
df_ui["size"] = df_ui.apply(_ui_request_size, axis=1)

total_ui_size = df_ui["size"].sum()
print("Interested data size:", total_ui_size, "GB")
df_ui

Interested data size: 1026.6881427168846 GB


Unnamed: 0,variable,start_year,end_year,max_lat,min_lat,max_lon,min_lon,temporal_resolution,spatial_resolution,ui_id,size
0,temperature,1981,2020,90,0,0,-180,hour,0.25,1,140.201533
1,temperature,1981,2020,90,0,180,0,hour,0.25,2,140.201533
2,temperature,1981,2020,0,-90,0,-180,day,0.5,3,4.381298
3,temperature,1981,2020,0,-90,180,0,day,0.5,4,4.381298
4,temperature,1941,1980,90,0,0,-180,month,0.5,5,0.144043
5,temperature,1941,1980,90,0,180,0,month,0.5,6,0.144043
6,temperature,1941,1980,0,-90,0,-180,year,1.0,7,0.003001
7,temperature,1941,1980,0,-90,180,0,year,1.0,8,0.003001
8,precipitation,1981,2020,90,0,0,-180,hour,0.25,9,140.201533
9,precipitation,1981,2020,90,0,180,0,hour,0.25,10,140.201533


In [2]:
from utils import create_metadata_from_ui

# Build the per-file metadata table from the UI requests.
df_meta = create_metadata_from_ui(df_ui)

# Column names listed in the positional-argument order passed to calc_data_size;
# the resolutions here are the "actual_*" ones of each df_meta row, not the
# resolution originally requested in the UI.
_META_SIZE_ARGS = [
    "start_year",
    "end_year",
    "max_lat",
    "min_lat",
    "max_lon",
    "min_lon",
    "actual_temporal_resolution",
    "actual_spatial_resolution",
]


def _meta_file_size(row):
    """Return calc_data_size(...) for one row of df_meta at its actual resolution."""
    return calc_data_size(*(row[col] for col in _META_SIZE_ARGS))


# Row-wise evaluation: each row's fields are forwarded to calc_data_size.
df_meta["file_size"] = df_meta.apply(_meta_file_size, axis=1)

total_file_size = df_meta["file_size"].sum()
print("Total data size after pre-aggregation", total_file_size, "GB")
df_meta

Total data size after pre-aggregation 2128.0564084276557 GB


Unnamed: 0,ui_id,variable,start_year,end_year,max_lat,min_lat,max_lon,min_lon,ui_temporal_resolution,ui_spatial_resolution,actual_temporal_resolution,actual_spatial_resolution,file_size
0,1,temperature,1981,2020,90,0,0,-180,hour,0.25,hour,0.25,140.201533
1,1,temperature,1981,2020,90,0,0,-180,hour,0.25,hour,0.50,105.151150
2,1,temperature,1981,2020,90,0,0,-180,hour,0.25,hour,1.00,26.287787
3,1,temperature,1981,2020,90,0,0,-180,hour,0.25,day,0.25,17.525192
4,1,temperature,1981,2020,90,0,0,-180,hour,0.25,day,0.50,4.381298
...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,30,wind,1941,1980,90,0,180,0,month,0.50,month,1.00,0.036011
173,30,wind,1941,1980,90,0,180,0,month,0.50,year,0.50,0.012004
174,30,wind,1941,1980,90,0,180,0,month,0.50,year,1.00,0.003001
175,31,wind,1941,1980,0,-90,0,-180,year,1.00,year,1.00,0.003001


In [3]:
# Ratio of stored size to requested size, per UI request:
#   agg_size = sum of df_meta.file_size over all rows sharing the request's ui_id
#   ratio    = agg_size / size (the request's own size from df_ui)

import duckdb

# DuckDB resolves `df_ui` and `df_meta` in the query to the pandas DataFrames
# of the same name that are in scope in this notebook.
#
# Changes versus the comma-join form:
#   * df_meta is aggregated on its own, so a ui_id that appeared more than once
#     in df_ui could no longer inflate agg_size through a duplicated join;
#   * explicit JOIN ... ON keeps the join predicate next to the join;
#   * ORDER BY makes the row order of the displayed table deterministic.
sql = """
SELECT u.*, g.agg_size, g.agg_size / u.size AS ratio
FROM df_ui AS u
JOIN (
    SELECT m.ui_id, sum(m.file_size) AS agg_size
    FROM df_meta AS m
    GROUP BY m.ui_id
) AS g ON u.ui_id = g.ui_id
ORDER BY u.ui_id
"""
df_ui_with_size = duckdb.query(sql).df()

# Totals over the joined rows only (inner join: a ui_id with no df_meta rows
# would not be counted here).
ui_size = df_ui_with_size["size"].sum()
actual_size = df_ui_with_size["agg_size"].sum()
print("ui only size:", ui_size, "GB")
print("including pre-aggregation size:", actual_size, "GB")
print("ratio:", actual_size / ui_size)
df_ui_with_size

ui only size: 1026.6881427168846 GB
including pre-aggregation size: 2128.0564084276557 GB
ratio: 2.0727388579712858


Unnamed: 0,variable,start_year,end_year,max_lat,min_lat,max_lon,min_lon,temporal_resolution,spatial_resolution,ui_id,size,agg_size,ratio
0,temperature,1981,2020,90,0,0,-180,hour,0.25,1,140.201533,295.461527,2.107406
1,temperature,1981,2020,90,0,180,0,hour,0.25,2,140.201533,295.461527,2.107406
2,temperature,1981,2020,0,-90,0,-180,day,0.5,3,4.381298,5.67168,1.294521
3,temperature,1981,2020,0,-90,180,0,day,0.5,4,4.381298,5.67168,1.294521
4,temperature,1941,1980,90,0,0,-180,month,0.5,5,0.144043,0.195058,1.354167
5,temperature,1941,1980,90,0,180,0,month,0.5,6,0.144043,0.195058,1.354167
6,temperature,1941,1980,0,-90,0,-180,year,1.0,7,0.003001,0.003001,1.0
7,temperature,1941,1980,0,-90,180,0,year,1.0,8,0.003001,0.003001,1.0
8,precipitation,1981,2020,90,0,0,-180,hour,0.25,9,140.201533,295.461527,2.107406
9,precipitation,1981,2020,90,0,180,0,hour,0.25,10,140.201533,295.461527,2.107406
