# Adding the legacy stores to the main BQ table

- We have to make a few decisions here:
    - Should I only add stores that pass the tests? That would let us rerun the stores that did not pass QC later, and some of them may pass then.
    - For now I'll add all of them, so people have a way to beta test, but for the final version (once we have credits) we should only add the stores that pass the tests.

In [1]:
from leap_data_management_utils.cmip_utils import CMIPBQInterface

In [2]:
# Legacy source tables: each table id is paired with the interface built from it.
leap_legacy_table_id = "leap-pangeo.cmip6_pgf_ingestion.leap_legacy"
bq_leap_legacy = CMIPBQInterface(leap_legacy_table_id)

pangeo_legacy_table_id = "leap-pangeo.cmip6_pgf_ingestion.pangeo_legacy"
bq_pangeo_legacy = CMIPBQInterface(pangeo_legacy_table_id)

# Main table that the legacy entries get added to.
main_table_id = "leap-pangeo.testcmip6.cmip6_consolidated_manual_testing"
bq = CMIPBQInterface(main_table_id)

In [5]:
def filter_and_add(
    source_bq: CMIPBQInterface,
    target_bq: CMIPBQInterface,
    only_tests_passed: bool = False,
):
    """Copy entries from ``source_bq`` into ``target_bq`` unless their iid is already there.

    The latest entries of the source table are fetched, every entry whose
    ``instance_id`` is reported by ``target_bq.iid_list_exists`` is dropped
    (entries already in the target are assumed to be more up to date), and the
    remaining rows are inserted into the target table.

    Parameters
    ----------
    source_bq
        Interface to the table the entries are read from.
    target_bq
        Interface to the table the missing entries are written to.
    only_tests_passed
        If True, only rows whose ``tests_passed`` value is True are considered
        for insertion. Defaults to False, which keeps the previous behaviour of
        adding every missing entry.
        NOTE(review): assumes the source table exposes the same
        ``tests_passed`` column that the main table shows - confirm.

    Returns
    -------
    pandas.DataFrame
        The rows that were inserted into the target (empty if there was
        nothing to add).
    """
    print(f"Getting latest results from {source_bq.table_id}")
    df_source = source_bq.get_latest()

    if only_tests_passed:
        # `.eq(True)` also maps missing values to "not passed"
        df_source = df_source[df_source["tests_passed"].eq(True)]

    if df_source.empty:
        # Nothing to add: skip the lookup against the target table entirely
        # instead of querying it with an empty iid list.
        return df_source

    iids_source = df_source["instance_id"].tolist()
    print(f"Finding iids already in {target_bq.table_id}")
    iids_in_target = target_bq.iid_list_exists(iids_source)
    # filter those iids out (we assume they are more up to date)
    df_source_filtered = df_source[~df_source["instance_id"].isin(iids_in_target)]

    if not df_source_filtered.empty:
        print(f"Adding missing entries to {target_bq.table_id}")
        target_bq.insert_df(df_source_filtered)
    return df_source_filtered

In [6]:
# Add the entries from the leap legacy table that are missing in the main table.
df_inserted_from_leap = filter_and_add(source_bq=bq_leap_legacy, target_bq=bq)
print(f"Inserted {len(df_inserted_from_leap)} new entries")

Inserted 0 new entries


In [7]:
# Add the entries from the pangeo legacy table that are missing in the main table.
df_inserted_from_pangeo = filter_and_add(source_bq=bq_pangeo_legacy, target_bq=bq)
print(f"Inserted {len(df_inserted_from_pangeo)} new entries")

Found over 10k iids. Working in batches.


  0%|          | 0/52 [00:00<?, ?it/s]

Found over 10k iid entries. Working in batches.


  0%|          | 0/22 [00:00<?, ?it/s]

Inserted 214455 new entries


In [8]:
# Re-read the latest entries of the main table after the inserts above.
df_all = bq.get_latest()

In [9]:
# Show the main table's latest entries (the rendered output is truncated).
df_all

Unnamed: 0,instance_id,store,timestamp,retracted,tests_passed
0,CMIP6.AerChemMIP.MIROC.MIROC6.hist-piNTCF.r3i1...,gs://cmip6/CMIP6/AerChemMIP/MIROC/MIROC6/hist-...,2024-01-24 00:34:38.224384+00:00,True,True
1,CMIP6.AerChemMIP.MIROC.MIROC6.histSST.r1i1p1f1...,gs://cmip6/CMIP6/AerChemMIP/MIROC/MIROC6/histS...,2024-01-24 00:34:38.224384+00:00,True,True
2,CMIP6.AerChemMIP.MIROC.MIROC6.piClim-2xfire.r1...,gs://cmip6/CMIP6/AerChemMIP/MIROC/MIROC6/piCli...,2024-01-24 00:34:38.224384+00:00,True,True
3,CMIP6.AerChemMIP.MIROC.MIROC6.piClim-2xss.r1i1...,gs://cmip6/CMIP6/AerChemMIP/MIROC/MIROC6/piCli...,2024-01-24 00:34:38.224384+00:00,True,True
4,CMIP6.AerChemMIP.MIROC.MIROC6.piClim-NTCF.r1i1...,gs://cmip6/CMIP6/AerChemMIP/MIROC/MIROC6/piCli...,2024-01-24 00:34:38.224384+00:00,True,True
...,...,...,...,...,...
517891,CMIP6.ScenarioMIP.MOHC.UKESM1-0-LL.ssp585.r1i1...,gs://cmip6/CMIP6/ScenarioMIP/MOHC/UKESM1-0-LL/...,2024-01-22 22:49:57.680638+00:00,False,True
517892,CMIP6.ScenarioMIP.MOHC.UKESM1-0-LL.ssp585.r3i1...,gs://cmip6/CMIP6/ScenarioMIP/MOHC/UKESM1-0-LL/...,2024-01-22 22:49:57.680638+00:00,False,True
517893,CMIP6.ScenarioMIP.MOHC.UKESM1-0-LL.ssp585.r3i1...,gs://cmip6/CMIP6/ScenarioMIP/MOHC/UKESM1-0-LL/...,2024-01-22 22:49:57.680638+00:00,False,True
517894,CMIP6.ScenarioMIP.MOHC.UKESM1-0-LL.ssp585.r3i1...,gs://cmip6/CMIP6/ScenarioMIP/MOHC/UKESM1-0-LL/...,2024-01-22 22:49:57.680638+00:00,False,True
