In [None]:
import datetime
import os
import warnings

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras import layers, callbacks, models
from sklearn.preprocessing import StandardScaler

Set random seeds for reproducibility

In [5]:
# Seed NumPy's legacy global RNG and TensorFlow's global RNG with the same
# value so that random draws repeat from run to run.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

Load processed data

In [10]:
# Read the "final processed" parquet table of each station
# (station 1 first, then station 2).
station1_scaled, station2_scaled = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../data/processed/station1_final_processed.parquet',
        '../data/processed/station2_final_processed.parquet',
    )
)

# Read the "processed" CSV table of each station; these carry the
# model_initialization_time column that is looked up later in the notebook.
station1_original, station2_original = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/station1_processed.csv',
        '../data/processed/station2_processed.csv',
    )
)

Convert datetime columns

In [11]:
# Parse both timestamp columns of the CSV-loaded tables into pandas datetimes.
# The columns are overwritten in place on each frame.
timestamp_columns = ('model_output_valid_time', 'model_initialization_time')
for df in (station1_original, station2_original):
    for column in timestamp_columns:
        df[column] = pd.to_datetime(df[column])

Merge scaled and original data to recover model_initialization_time

In [13]:
def _attach_init_time(scaled, original, station_label):
    """Add ``model_initialization_time`` to ``scaled`` via a valid-time lookup.

    A dict ``model_output_valid_time -> model_initialization_time`` is built
    from ``original`` and mapped onto ``scaled['model_output_valid_time']``.
    ``scaled`` is modified in place (a column is added or overwritten).

    A dict keeps one value per key, so if the same valid time occurs with
    several different initialization times in ``original``, only the last one
    survives and every matching row in ``scaled`` receives it. That case, and
    the case of rows that end up without an initialization time, are reported
    with a warning instead of passing silently.

    Parameters
    ----------
    scaled : pandas.DataFrame
        Frame with a ``model_output_valid_time`` column; receives the new
        ``model_initialization_time`` column.
    original : pandas.DataFrame
        Frame with ``model_output_valid_time`` and
        ``model_initialization_time`` columns.
    station_label : str
        Name used in warning messages only.

    Returns
    -------
    dict
        The valid-time -> initialization-time mapping that was applied.
    """
    # Number of valid times that appear with more than one distinct
    # initialization time; for those the dict lookup is ambiguous.
    inits_per_valid_time = original.groupby('model_output_valid_time')['model_initialization_time'].nunique()
    n_ambiguous = int((inits_per_valid_time > 1).sum())
    if n_ambiguous:
        warnings.warn(
            f"{station_label}: {n_ambiguous} model_output_valid_time value(s) have more than one "
            "model_initialization_time; the lookup keeps only the last one per valid time, so the "
            "recovered model_initialization_time (and any lead_time derived from it) may be wrong.",
            stacklevel=2,
        )

    mapping = original.set_index('model_output_valid_time')['model_initialization_time'].to_dict()
    scaled['model_initialization_time'] = scaled['model_output_valid_time'].map(mapping)

    # Keys missing from the mapping come back as missing values from .map().
    n_missing = int(scaled['model_initialization_time'].isna().sum())
    if n_missing:
        warnings.warn(
            f"{station_label}: {n_missing} row(s) have no model_initialization_time after the lookup.",
            stacklevel=2,
        )
    return mapping


init_time_mapping_station1 = _attach_init_time(station1_scaled, station1_original, 'station1')
init_time_mapping_station2 = _attach_init_time(station2_scaled, station2_original, 'station2')

Calculate lead_time (in hours)

In [16]:
# lead_time = valid time minus initialization time, converted from seconds
# to hours (3600 seconds per hour). Added in place to each station frame.
for station_frame in (station1_scaled, station2_scaled):
    forecast_horizon = station_frame['model_output_valid_time'] - station_frame['model_initialization_time']
    station_frame['lead_time'] = forecast_horizon.dt.total_seconds() / 3600

Add residual column

In [17]:
# residual = USGS_streamflow minus NWM_streamflow, row by row.
# Added in place to each station frame.
for station_frame in (station1_scaled, station2_scaled):
    station_frame['residual'] = station_frame['USGS_streamflow'].sub(station_frame['NWM_streamflow'])

Combine station1 and station2 data

In [18]:
# Stack the two station frames row-wise (station 1 rows first);
# ignore_index=True gives the result a fresh 0..n-1 index.
station_frames = [station1_scaled, station2_scaled]
combined = pd.concat(station_frames, axis=0, ignore_index=True)

Split into train, validation, test

In [None]:
# Chronological train / validation / test split: rows are ordered by
# model_output_valid_time and cut by position into 70% / 15% / 15%.
#
# NOTE(review): feature_cols and target_col are not defined in the cells
# above this one; they must exist before this cell runs - confirm where
# they are defined.
# NOTE(review): the cut points are row positions, so rows that share the same
# model_output_valid_time can fall on both sides of a boundary - confirm this
# is acceptable for the evaluation.

# mergesort is stable: rows with equal timestamps keep their order from
# `combined`, so the split membership does not depend on how the default
# (non-stable) sort happens to order ties.
full_data = combined.sort_values('model_output_valid_time', kind='mergesort')

train_frac = 0.7
val_frac = 0.15
test_frac = 0.15

# The test split is "everything after validation", so test_frac only holds
# if the three fractions add up to 1; fail loudly if they are edited apart.
if not np.isclose(train_frac + val_frac + test_frac, 1.0):
    raise ValueError(
        f"train_frac + val_frac + test_frac must equal 1, got {train_frac + val_frac + test_frac}"
    )

n = len(full_data)
train_end = int(n * train_frac)
val_end = int(n * (train_frac + val_frac))

train_data = full_data.iloc[:train_end]
val_data = full_data.iloc[train_end:val_end]
test_data = full_data.iloc[val_end:]

# Feature matrices and target arrays as NumPy arrays for each split.
X_train = train_data[feature_cols].values
y_train = train_data[target_col].values

X_val = val_data[feature_cols].values
y_val = val_data[target_col].values

X_test = test_data[feature_cols].values
y_test = test_data[target_col].values
