In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

Load Cleaned Data

In [2]:
# Directory that holds the per-station parquet files produced upstream.
processed_dir = Path("../data/processed")

# Read the NWM and USGS tables for both stations in one pass; the order of the
# filenames matches the order of the names being unpacked.
station1_nwm, station1_usgs, station2_nwm, station2_usgs = (
    pd.read_parquet(processed_dir / filename)
    for filename in (
        "station1_nwm.parquet",
        "station1_usgs.parquet",
        "station2_nwm.parquet",
        "station2_usgs.parquet",
    )
)

Defining the Processing Function

In [3]:
def create_aligned_dataframe(nwm_df, usgs_df):
    """Pair every NWM forecast row with the USGS observation nearest in time.

    Args:
        nwm_df: Forecast table with a 'model_output_valid_time' column (the merge key).
            A 'streamflow_value' column, if present, is renamed to 'NWM_streamflow'.
        usgs_df: Observation table whose index is the merge key and which has a
            'USGSFlowValue' column.

    Returns:
        A new DataFrame ordered by 'model_output_valid_time' that contains only the
        forecast rows with an observation within 30 minutes, with the observed
        flow exposed as 'USGS_streamflow'. The inputs are not modified.
    """
    # merge_asof requires both sides to be sorted on their merge keys.
    forecasts = nwm_df.sort_values(by='model_output_valid_time')
    observations = usgs_df.sort_index()

    merged = pd.merge_asof(
        left=forecasts,
        right=observations,
        left_on='model_output_valid_time',
        right_index=True,
        direction='nearest',                # closest observation, earlier or later
        tolerance=pd.Timedelta(minutes=30), # but no further than 30 minutes away
    )

    column_renames = {
        'streamflow_value': 'NWM_streamflow',
        'USGSFlowValue': 'USGS_streamflow',
    }
    # Forecasts without a match inside the tolerance carry NaN in 'USGSFlowValue'; drop them.
    return merged.dropna(subset=['USGSFlowValue']).rename(columns=column_renames)

Process Both Stations

In [4]:
# Make model_output_valid_time timezone-aware UTC so it can be merged against the
# USGS index (presumably UTC-aware — confirm against the upstream cleaning step).
# The tz check keeps this cell re-runnable: calling tz_localize on a column that is
# already timezone-aware raises TypeError, which is what a second run used to hit.
for nwm_frame in (station1_nwm, station2_nwm):
    valid_time = nwm_frame['model_output_valid_time']
    if valid_time.dt.tz is None:
        nwm_frame['model_output_valid_time'] = valid_time.dt.tz_localize('UTC')
    else:
        nwm_frame['model_output_valid_time'] = valid_time.dt.tz_convert('UTC')

station1_df = create_aligned_dataframe(station1_nwm, station1_usgs)
station2_df = create_aligned_dataframe(station2_nwm, station2_usgs)

print("Data alignment complete.")
station1_df.head()

Data alignment complete.


Unnamed: 0,NWM_version_number,model_initialization_time,model_output_valid_time,NWM_streamflow,streamID,lead_time,USGS_streamflow,USGS_GageID
0,v2.1,2021-04-21 00:00:00,2021-04-21 01:00:00+00:00,0.45,20380357,1.0,0.19,A
18,v2.1,2021-04-21 01:00:00,2021-04-21 02:00:00+00:00,0.44,20380357,1.0,0.18,A
1,v2.1,2021-04-21 00:00:00,2021-04-21 02:00:00+00:00,0.84,20380357,2.0,0.18,A
2,v2.1,2021-04-21 00:00:00,2021-04-21 03:00:00+00:00,1.51,20380357,3.0,0.18,A
36,v2.1,2021-04-21 02:00:00,2021-04-21 03:00:00+00:00,0.43,20380357,1.0,0.18,A


Feature Engineering

In [5]:
def create_features(df):
    """Build the modeling table: the target ('error') plus the input features.

    The derived columns are computed on a copy, so the DataFrame passed in is
    left untouched (earlier versions added the columns to the caller's frame).

    Args:
        df: Aligned table containing 'model_output_valid_time' (datetime),
            'NWM_streamflow', 'USGS_streamflow' and 'lead_time'.

    Returns:
        A new DataFrame indexed by 'model_output_valid_time' with the columns
        'error', 'NWM_streamflow', 'lead_time', 'month', 'day_of_year', 'hour',
        in that order.
    """
    valid_time = df['model_output_valid_time']

    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    with_features = df.assign(
        # Target variable: forecast minus observation.
        error=df['NWM_streamflow'] - df['USGS_streamflow'],
        # Calendar features taken from the forecast valid time.
        month=valid_time.dt.month,
        day_of_year=valid_time.dt.dayofyear,
        hour=valid_time.dt.hour,
    )

    # 'error' comes first so the target is easy to locate; it is split off later.
    feature_cols = [
        'error',
        'NWM_streamflow',
        'lead_time',
        'month',
        'day_of_year',
        'hour',
    ]

    df_featured = (
        with_features[['model_output_valid_time'] + feature_cols]
        .set_index('model_output_valid_time')
    )

    return df_featured

# Build the feature table for each station from its aligned DataFrame.
station1_featured, station2_featured = (
    create_features(aligned) for aligned in (station1_df, station2_df)
)

print("Feature engineering complete.")
station1_featured.head()

Feature engineering complete.


Unnamed: 0_level_0,error,NWM_streamflow,lead_time,month,day_of_year,hour
model_output_valid_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-04-21 01:00:00+00:00,0.26,0.45,1.0,4,111,1
2021-04-21 02:00:00+00:00,0.26,0.44,1.0,4,111,2
2021-04-21 02:00:00+00:00,0.66,0.84,2.0,4,111,2
2021-04-21 03:00:00+00:00,1.33,1.51,3.0,4,111,3
2021-04-21 03:00:00+00:00,0.25,0.43,1.0,4,111,3


Split Data into Train and Test Sets by Date

In [6]:
# Chronological cut-off: rows up to train_end_date go to training, rows from
# test_start_date onward go to testing. Both stations use the same dates.
train_end_date = '2022-09-30 23:59:59'
test_start_date = '2022-10-01 00:00:00'

# Label-based slices on the model_output_valid_time index.
train_df1, test_df1 = (
    station1_featured.loc[:train_end_date],
    station1_featured.loc[test_start_date:],
)
train_df2, test_df2 = (
    station2_featured.loc[:train_end_date],
    station2_featured.loc[test_start_date:],
)

print(f"Station 1 - Train shape: {train_df1.shape}, Test shape: {test_df1.shape}")
print(f"Station 2 - Train shape: {train_df2.shape}, Test shape: {test_df2.shape}")


Station 1 - Train shape: (235269, 6), Test shape: (90836, 6)
Station 2 - Train shape: (219249, 6), Test shape: (90836, 6)


Scale the Data

In [7]:
# Each station gets its own MinMaxScaler. A scaler learns its parameters from
# that station's training split only and is then applied to both splits.
scaler1 = MinMaxScaler().fit(train_df1)
train_scaled1 = scaler1.transform(train_df1)
test_scaled1 = scaler1.transform(test_df1)

scaler2 = MinMaxScaler().fit(train_df2)
train_scaled2 = scaler2.transform(train_df2)
test_scaled2 = scaler2.transform(test_df2)

print("\nData scaling complete. The scaler was fit on the training data only.")


Data scaling complete. The scaler was fit on the training data only.


Validate and Clean Scaled Data

In [8]:
def clean_scaled_data(scaled_data, original_columns):
    """Drop rows of a scaled 2-D array that contain NaN or +/-infinity.

    The check runs directly on the numeric values with np.isfinite instead of
    copying the data into a temporary DataFrame first.

    Args:
        scaled_data: 2-D array of scaled values (rows are samples).
        original_columns: Column labels of the data. Kept so existing callers
            keep working; the validity check itself does not need them.

    Returns:
        `scaled_data` itself when every value is finite, otherwise a new array
        holding only the fully finite rows. A one-line report is printed in
        both cases.
    """
    values = np.asarray(scaled_data, dtype=float)

    # A row is invalid as soon as one of its entries is NaN or infinite.
    invalid_rows_mask = ~np.isfinite(values).all(axis=1)
    num_invalid = int(invalid_rows_mask.sum())

    if num_invalid == 0:
        print("Data validation successful. No NaN/inf values found.")
        return scaled_data

    initial_rows = len(values)
    print(f"WARNING: Found and removed {num_invalid} rows with NaN/inf values ({((num_invalid/initial_rows)*100):.2f}% of data).")
    # Keep only the valid rows
    return scaled_data[~invalid_rows_mask]

# Validate every scaled split and rebind each name to the result with any
# NaN/inf rows removed. The pairs are listed in the same order as the names.
train_scaled1, test_scaled1, train_scaled2, test_scaled2 = (
    clean_scaled_data(scaled, frame.columns)
    for scaled, frame in (
        (train_scaled1, train_df1),
        (test_scaled1, test_df1),
        (train_scaled2, train_df2),
        (test_scaled2, test_df2),
    )
)

Data validation successful. No NaN/inf values found.
Data validation successful. No NaN/inf values found.
Data validation successful. No NaN/inf values found.


Create Sequences

In [9]:
def create_sequences(data, lookback, target_col_index):
    """Slice a 2-D array into overlapping windows and next-row targets.

    Sample i uses rows i .. i+lookback-1 (all columns) as input and the value in
    column `target_col_index` of row i+lookback as target.

    Args:
        data: 2-D array, rows in sequence order and columns as features.
        lookback: Number of consecutive rows in each input window.
        target_col_index: Column index of the value to predict.

    Returns:
        (X, y) where X has shape (n_samples, lookback, n_features) and y has shape
        (n_samples,), with n_samples = max(len(data) - lookback, 0). When there
        are too few rows for a single window, X is still 3-D with zero samples,
        so callers can read X.shape[1] and X.shape[2] safely.

    Raises:
        ValueError: If `lookback` is negative.
    """
    if lookback < 0:
        raise ValueError("lookback must be non-negative")

    data = np.asarray(data)
    n_samples = max(len(data) - lookback, 0)

    # Row indices of every window at once: entry [i, j] is i + j.
    window_starts = np.arange(n_samples)[:, None]
    window_offsets = np.arange(lookback)[None, :]
    X = data[window_starts + window_offsets]

    # The target is the row immediately after each window; copy so the result
    # does not alias `data`.
    y = data[lookback:lookback + n_samples, target_col_index].copy()

    return X, y

# Lookback window: the number of consecutive ROWS fed to the model per sample.
# NOTE: the feature tables hold several forecasts (different lead times) for the
# same valid time, so 24 rows is not the same as 24 hours of data.
LOOKBACK = 24

# Position of the 'error' target column (it is the first column, so index 0)
target_col_idx = train_df1.columns.get_loc('error')

# Create sequences for Station 1
X_train1, y_train1 = create_sequences(train_scaled1, LOOKBACK, target_col_idx)
X_test1, y_test1 = create_sequences(test_scaled1, LOOKBACK, target_col_idx)

# Create sequences for Station 2
X_train2, y_train2 = create_sequences(train_scaled2, LOOKBACK, target_col_idx)
X_test2, y_test2 = create_sequences(test_scaled2, LOOKBACK, target_col_idx)

print(f"Sequence creation complete. Example shape for Station 1 Training X: {X_train1.shape}")
# The middle axis counts rows in the window, not hours (see the note on LOOKBACK).
print(f"This means: ({X_train1.shape[0]} samples, {X_train1.shape[1]} timesteps (rows), {X_train1.shape[2]} features)")

Sequence creation complete. Example shape for Station 1 Training X: (235244, 24, 6)
This means: (235244 samples, 24 timesteps/hours, 6 features)


Save Final Processed Data

In [10]:
output_dir = Path("../data/processed")
# Create the target directory if it is missing so the writes below cannot fail on it.
output_dir.mkdir(parents=True, exist_ok=True)

# One .npz archive per station holding the train/test sequences and targets.
np.savez(output_dir / 'station1_processed_for_modeling.npz', X_train=X_train1, y_train=y_train1, X_test=X_test1, y_test=y_test1)
np.savez(output_dir / 'station2_processed_for_modeling.npz', X_train=X_train2, y_train=y_train2, X_test=X_test2, y_test=y_test2)

# We also need to save the scalers to inverse the transformation later.
# (pickle is imported in the notebook's import cell.)
with open(output_dir / 'scaler_station1.pkl', 'wb') as f:
    pickle.dump(scaler1, f)
with open(output_dir / 'scaler_station2.pkl', 'wb') as f:
    pickle.dump(scaler2, f)

print(f"\nFinal data saved to {output_dir}")


Final data saved to ..\data\processed
