In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to the imported project modules take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import pyarrow.dataset as ds
import pyarrow as pa

import duckdb

# Show every column when a DataFrame is rendered (None removes the column limit).
pd.options.display.max_columns = None

# Description

The general process for updating each partitioned dataset is to:

0) Read configs for each dataset
1) Read in the data already collected and stored in partitions (use duckdb for reads)
    - Get the data for the `n_latest_dates` most recent dates
    - `df_current`: to use later for filtering out duplicate data already stored
2) Load all new data parquet files and concatenate.
3) Perform all of the requisite preprocessing steps for that type of data
4) Filter out duplicates already in `df_current`
5) Save using `DataFrame.to_parquet()` with `partition_cols`

# 0 - Configuration

In [24]:
from src.galton.data_collection.partition_configs import config

In [33]:
# Run parameters for this update.
n_latest_dates = 5
# NOTE(review): machine-specific absolute path; consider making this configurable
# before sharing the notebook.
data_path = "/Users/luketownsend/Desktop/projects/tetlock/data/"

select_dataset = "openmeteo_forecasts"

# Fetch the selected dataset's settings once, then unpack the individual
# fields used by the later cells.
dataset_config = config[select_dataset]

dataset_name = dataset_config["dataset_name"]
start_date_field = dataset_config["start_date_field"]
start_date_format = dataset_config["start_date_format"]
new_data_file_prefixes = dataset_config["new_data_file_prefixes"]
record_index_fields = dataset_config["record_index_fields"]
output_columns = dataset_config["output_columns"]
partition_columns = dataset_config["partition_columns"]

# 1 - Read Existing Data

In [42]:
from src.galton.data_collection.current_data import load_current_data

In [43]:
# Load the data already stored for this dataset; `n_latest_dates` is passed
# through to the project loader (see the process notes at the top of the notebook).
df_current = load_current_data(
    dataset_name=dataset_name,
    start_date_field=start_date_field,
    n_latest_dates=n_latest_dates,
)

In [44]:
# Sanity check: number of stored rows per distinct value of the start-date field.
df_current[start_date_field].value_counts()

date
2025-12-13    437207
2025-12-12    437097
2025-12-14    419412
2025-12-15    266921
2025-12-16     98561
Name: count, dtype: int64

In [45]:
# Earliest start date present in the stored data. Uses the configured field
# name (rather than a hard-coded "date") so the check works for every dataset
# in the config, consistent with the value_counts cell above.
df_current[start_date_field].min()

'2025-12-12'

# 2 - Load New Data

In [46]:
from src.galton.data_collection.file_search import (
    enumerate_date_range,
    build_file_stem_candidates,
)

# Build the list of dates to search, starting from the earliest date found in
# the stored data. The start-date column is taken from the dataset config
# instead of a hard-coded "date" so this cell works for any configured dataset.
dates = enumerate_date_range(
    start_date=df_current[start_date_field].min(),
    start_date_format=start_date_format,
)

# Combine the configured file prefixes with those dates to get the file-name
# stems that new data files are expected to contain.
candidates = build_file_stem_candidates(
    prefixes=new_data_file_prefixes,
    dates=dates,
)

In [47]:
# Display the candidate file stems that will be matched against file paths.
candidates

['multi_model_forecast - 2025-12-12',
 'multi_model_forecast - 2025-12-13',
 'multi_model_forecast - 2025-12-14']

Start here -- Late PM Session

In [48]:
# Use pyarrow's dataset discovery only to list the parquet files under data_path.
dataset = ds.dataset(data_path, format="parquet")

filenames = dataset.files
# Keep a file when its path contains any of the candidate stems (substring match).
filtered = [f for f in filenames if any(p in f for p in candidates)]
print(len(filtered))

# pd.concat raises an opaque "No objects to concatenate" ValueError on an empty
# list; fail with the same exception type but say what was actually missing.
if not filtered:
    raise ValueError(
        f"No parquet files under {data_path!r} match the candidate stems {candidates!r}"
    )

# Read every matching file and stack the results into a single frame.
frames = [pd.read_parquet(file) for file in filtered]

df = pd.concat(frames)

print(df.shape)

366
(2951424, 7)


In [10]:
# Preview the first rows of the concatenated new data.
df.head()

Unnamed: 0,forecast_date,temperature_2m,city,model_timestamp,current_timestamp,model_name,model_id
0,2025-12-11 18:00:00-06:00,62.283203,Austin,2025-12-12 00:00:00-06:00,2025-12-12 00:01:30.167663-06:00,best_match,1
1,2025-12-11 19:00:00-06:00,58.1432,Austin,2025-12-12 00:00:00-06:00,2025-12-12 00:01:30.167663-06:00,best_match,1
2,2025-12-11 20:00:00-06:00,54.813198,Austin,2025-12-12 00:00:00-06:00,2025-12-12 00:01:30.167663-06:00,best_match,1
3,2025-12-11 21:00:00-06:00,52.923199,Austin,2025-12-12 00:00:00-06:00,2025-12-12 00:01:30.167663-06:00,best_match,1
4,2025-12-11 22:00:00-06:00,51.4832,Austin,2025-12-12 00:00:00-06:00,2025-12-12 00:01:30.167663-06:00,best_match,1


# 3 - Preprocess New Data

In [11]:
# TODO: Refactor & move into dedicated module or add to data_collection.openmeteo?

from src.galton.data_collection.utilities import (
    normalize_field_names,
    convert_datetime_to_utc,
)
from src.galton.feature_engineering.dates import add_date_fields
from src.galton.feature_engineering.forecasts import (
    add_forecast_fields,
    filter_redundant_forecasts,
    filter_unused_forecast_data,
)

# Stage 1: apply the project's field-name normalization and the date/forecast
# feature helpers, in this order.
df = (
    df.pipe(normalize_field_names)
    .pipe(add_date_fields)
    .pipe(add_forecast_fields)
)

print(df.shape)

# Stage 2: apply the project's forecast filters, then its UTC conversion helper.
df = (
    df.pipe(filter_unused_forecast_data)
    .pipe(filter_redundant_forecasts)
    .pipe(convert_datetime_to_utc)
)

# Store the forecast horizon as an integer column.
df = df.astype({"forecast_horizon": int})

print(df.shape)

datetime_cols = ["datetime", "model_timestamp", "current_timestamp"]

# Express the timestamp columns in America/Chicago time.
# NOTE(review): this presumably follows the UTC conversion above on purpose — confirm.
for tz_col in datetime_cols:
    df[tz_col] = df[tz_col].dt.tz_convert("America/Chicago")

(2894976, 15)
(1108533, 16)


# 4 - Filter Existing Records from New Data 

In [12]:
# Keys (built from the record index fields) of the rows that are already stored.
current_index = df_current.set_index(record_index_fields).index

# Key the new data the same way and keep only rows whose key is not already stored.
# NOTE(review): isin() only matches when the key columns have the same dtypes
# in both frames — confirm the stored and new data agree.
keyed_new = df.set_index(record_index_fields)
is_new_record = ~keyed_new.index.isin(current_index)
df = keyed_new[is_new_record].reset_index()

print(df.shape)

# Restrict to the configured output columns, in the configured order.
df = df[output_columns]

(254933, 16)


# 5 - Save New Data to Partition

In [13]:
# Write the filtered new rows as a parquet dataset partitioned by `partition_columns`.
# NOTE(review): `partition_folder_name` is not assigned in any cell visible in this
# notebook; on a fresh kernel this line would raise NameError — confirm where it
# should come from (the dataset config?) and define it in the configuration cell.
# NOTE(review): the output path is relative to the working directory, whereas
# `data_path` above is absolute — confirm this is the intended location.
df.to_parquet(f"data/local_data/{partition_folder_name}", engine="pyarrow", partition_cols=partition_columns)