In [1]:
import pandas as pd
from utils import calc_data_size

# Columns of the UI table handed to calc_data_size, in positional-argument order.
_UI_SIZE_COLUMNS = (
    "start_year",
    "end_year",
    "max_lat",
    "min_lat",
    "max_lon",
    "min_lon",
    "temporal_resolution",
    "spatial_resolution",
)

# One row per UI request (variable, year range, bounding box, resolutions).
df_ui = pd.read_csv("stupid_ui.csv")

# Row-wise: a single calc_data_size call per request, stored in a new "size" column.
df_ui["size"] = df_ui.apply(
    lambda request: calc_data_size(
        *(request[column] for column in _UI_SIZE_COLUMNS)
    ),
    axis=1,
)

# Total over all requests; the value is printed with a "GB" label.
print("Interested data size:", df_ui["size"].sum(), "GB")
df_ui

Interested data size: 1121.61226272583 GB


Unnamed: 0,variable,start_year,end_year,max_lat,min_lat,max_lon,min_lon,temporal_resolution,spatial_resolution,ui_id,size
0,temperature,1981,2020,90,-90,180,-180,hour,0.25,1,560.806131
1,pressure,1981,2020,90,-90,180,-180,hour,0.25,2,560.806131


In [2]:
from utils import create_metadata_from_ui

# Columns of the metadata table handed to calc_data_size, in positional-argument
# order. The resolutions used here are the "actual_*" ones, not the UI-requested ones.
_META_SIZE_COLUMNS = (
    "start_year",
    "end_year",
    "max_lat",
    "min_lat",
    "max_lon",
    "min_lon",
    "actual_temporal_resolution",
    "actual_spatial_resolution",
)

# Derive the metadata table from the UI requests.
# NOTE(review): the displayed output shows several rows per ui_id with differing
# actual_* resolutions; the exact expansion rule lives in utils — confirm there.
df_meta = create_metadata_from_ui(df_ui)

# Row-wise: a single calc_data_size call per metadata row, stored as "file_size".
df_meta["file_size"] = df_meta.apply(
    lambda entry: calc_data_size(
        *(entry[column] for column in _META_SIZE_COLUMNS)
    ),
    axis=1,
)

# Total over all metadata rows; the value is printed with a "GB" label.
print("Total data size after pre-aggregation", df_meta["file_size"].sum(), "GB")
df_meta

Total data size after pre-aggregation 2363.6922124028206 GB


Unnamed: 0,ui_id,variable,start_year,end_year,max_lat,min_lat,max_lon,min_lon,ui_temporal_resolution,ui_spatial_resolution,actual_temporal_resolution,actual_spatial_resolution,file_size
0,1,temperature,1981,2020,90,-90,180,-180,hour,0.25,hour,0.25,560.806131
1,1,temperature,1981,2020,90,-90,180,-180,hour,0.25,hour,0.5,420.604599
2,1,temperature,1981,2020,90,-90,180,-180,hour,0.25,hour,1.0,105.15115
3,1,temperature,1981,2020,90,-90,180,-180,hour,0.25,day,0.25,70.100766
4,1,temperature,1981,2020,90,-90,180,-180,hour,0.25,day,0.5,17.525192
5,1,temperature,1981,2020,90,-90,180,-180,hour,0.25,day,1.0,4.381298
6,1,temperature,1981,2020,90,-90,180,-180,hour,0.25,month,0.25,2.304683
7,1,temperature,1981,2020,90,-90,180,-180,hour,0.25,month,0.5,0.576171
8,1,temperature,1981,2020,90,-90,180,-180,hour,0.25,month,1.0,0.144043
9,1,temperature,1981,2020,90,-90,180,-180,hour,0.25,year,0.25,0.192057


In [3]:
# Compare the summed pre-aggregated file sizes with the UI-requested size (ratio per ui_id and overall)

import duckdb

# For every UI request, attach the summed file_size of its metadata rows
# (agg_size) and the ratio agg_size / size.
#
# The query names `df_ui` and `df_meta` directly: duckdb resolves them to the
# pandas DataFrames of the same name that are in scope when the query runs.
#
# Changes relative to the earlier form of this query:
#   * df_meta is aggregated on its own, so a ui_id that appears more than once
#     in df_ui can no longer multiply the summed file_size;
#   * the join is written explicitly (JOIN ... ON) instead of a comma join
#     filtered in WHERE;
#   * ORDER BY makes the row order of the displayed table reproducible.
sql = """
SELECT u.*, g.agg_size, g.agg_size / u.size AS ratio
FROM df_ui AS u
JOIN (
    SELECT ui_id, sum(file_size) AS agg_size
    FROM df_meta
    GROUP BY ui_id
) AS g ON u.ui_id = g.ui_id
ORDER BY u.ui_id
"""
df_ui_with_size = duckdb.query(sql).df()

# Overall totals across all requests, plus the overall ratio between them.
ui_size = df_ui_with_size["size"].sum()
actual_size = df_ui_with_size["agg_size"].sum()
print("ui only size:", ui_size, "GB")
print("including pre-aggregation size:", actual_size, "GB")
print("ratio:", actual_size / ui_size)
df_ui_with_size

ui only size: 1121.61226272583 GB
including pre-aggregation size: 2363.6922124028206 GB
ratio: 2.107405821917808


Unnamed: 0,variable,start_year,end_year,max_lat,min_lat,max_lon,min_lon,temporal_resolution,spatial_resolution,ui_id,size,agg_size,ratio
0,temperature,1981,2020,90,-90,180,-180,hour,0.25,1,560.806131,1181.846106,2.107406
1,pressure,1981,2020,90,-90,180,-180,hour,0.25,2,560.806131,1181.846106,2.107406
