In [2]:
# Import necessary libraries for data manipulation and analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split


 Load Processed Data for Station 1 and Station 2

In [3]:
# Load the preprocessed data for Station 1 and Station 2
station1_data = pd.read_csv('../data/processed/station1_processed.csv')
station2_data = pd.read_csv('../data/processed/station2_processed.csv')

# Display the first few rows of each dataset to understand their structure
print("Station 1 Data:")
display(station1_data.head())

print("Station 2 Data:")
display(station2_data.head())


Station 1 Data:


Unnamed: 0,model_initialization_time,model_output_valid_time,NWM_streamflow,USGS_streamflow
0,2021-04-21 00:00:00,2021-04-21 01:00:00,0.45,0.19
1,2021-04-21 00:00:00,2021-04-21 01:00:00,0.45,0.19
2,2021-04-21 00:00:00,2021-04-21 01:00:00,0.45,0.19
3,2021-04-21 00:00:00,2021-04-21 02:00:00,0.84,0.19
4,2021-04-21 00:00:00,2021-04-21 02:00:00,0.84,0.19


Station 2 Data:


Unnamed: 0,model_initialization_time,model_output_valid_time,NWM_streamflow,USGS_streamflow
0,2021-04-21 00:00:00,2021-04-21 01:00:00,30.729999,31.43
1,2021-04-21 00:00:00,2021-04-21 01:00:00,30.729999,31.15
2,2021-04-21 00:00:00,2021-04-21 01:00:00,30.729999,31.43
3,2021-04-21 00:00:00,2021-04-21 02:00:00,31.379999,31.15
4,2021-04-21 00:00:00,2021-04-21 02:00:00,31.379999,31.15


Handling Missing Data

In [4]:
# Check for missing values in both datasets
print("Missing values in Station 1 Data:")
display(station1_data.isnull().sum())

print("Missing values in Station 2 Data:")
display(station2_data.isnull().sum())

# Handle missing values by filling them with appropriate strategies (e.g., forward fill, median, etc.)
station1_data = station1_data.fillna(method='ffill')
station2_data = station2_data.fillna(method='ffill')

# Confirm that there are no more missing values
print("Station 1 Data (after handling missing values):")
display(station1_data.isnull().sum())

print("Station 2 Data (after handling missing values):")
display(station2_data.isnull().sum())


Missing values in Station 1 Data:


model_initialization_time    0
model_output_valid_time      0
NWM_streamflow               5
USGS_streamflow              0
dtype: int64

Missing values in Station 2 Data:


model_initialization_time      0
model_output_valid_time        0
NWM_streamflow                 0
USGS_streamflow              270
dtype: int64

  station1_data = station1_data.fillna(method='ffill')
  station2_data = station2_data.fillna(method='ffill')


Station 1 Data (after handling missing values):


model_initialization_time    0
model_output_valid_time      0
NWM_streamflow               0
USGS_streamflow              0
dtype: int64

Station 2 Data (after handling missing values):


model_initialization_time    0
model_output_valid_time      0
NWM_streamflow               0
USGS_streamflow              0
dtype: int64

Feature Engineering and Transformation

In [5]:
# Convert datetime columns to pandas datetime format
station1_data['model_output_valid_time'] = pd.to_datetime(station1_data['model_output_valid_time'])
station1_data['model_initialization_time'] = pd.to_datetime(station1_data['model_initialization_time'])
station2_data['model_output_valid_time'] = pd.to_datetime(station2_data['model_output_valid_time'])
station2_data['model_initialization_time'] = pd.to_datetime(station2_data['model_initialization_time'])

# Extract useful time features such as day, month, and year from the datetime column
station1_data['year'] = station1_data['model_output_valid_time'].dt.year
station1_data['month'] = station1_data['model_output_valid_time'].dt.month
station1_data['day'] = station1_data['model_output_valid_time'].dt.day

station2_data['year'] = station2_data['model_output_valid_time'].dt.year
station2_data['month'] = station2_data['model_output_valid_time'].dt.month
station2_data['day'] = station2_data['model_output_valid_time'].dt.day

# Display the updated data with new features
print("Station 1 Data with extracted time features:")
display(station1_data.head())

print("Station 2 Data with extracted time features:")
display(station2_data.head())


Station 1 Data with extracted time features:


Unnamed: 0,model_initialization_time,model_output_valid_time,NWM_streamflow,USGS_streamflow,year,month,day
0,2021-04-21,2021-04-21 01:00:00,0.45,0.19,2021,4,21
1,2021-04-21,2021-04-21 01:00:00,0.45,0.19,2021,4,21
2,2021-04-21,2021-04-21 01:00:00,0.45,0.19,2021,4,21
3,2021-04-21,2021-04-21 02:00:00,0.84,0.19,2021,4,21
4,2021-04-21,2021-04-21 02:00:00,0.84,0.19,2021,4,21


Station 2 Data with extracted time features:


Unnamed: 0,model_initialization_time,model_output_valid_time,NWM_streamflow,USGS_streamflow,year,month,day
0,2021-04-21,2021-04-21 01:00:00,30.729999,31.43,2021,4,21
1,2021-04-21,2021-04-21 01:00:00,30.729999,31.15,2021,4,21
2,2021-04-21,2021-04-21 01:00:00,30.729999,31.43,2021,4,21
3,2021-04-21,2021-04-21 02:00:00,31.379999,31.15,2021,4,21
4,2021-04-21,2021-04-21 02:00:00,31.379999,31.15,2021,4,21


Merge Data for Model Input (Station Data + Forecast Data)

In [8]:
# Merge the USGS and NWM data for both stations on 'model_output_valid_time'
station1_merged = pd.merge(station1_data[['model_output_valid_time', 'NWM_streamflow']], 
                            station1_data[['model_output_valid_time', 'USGS_streamflow']], 
                            on='model_output_valid_time', how='inner')

station2_merged = pd.merge(station2_data[['model_output_valid_time', 'NWM_streamflow']], 
                            station2_data[['model_output_valid_time', 'USGS_streamflow']], 
                            on='model_output_valid_time', how='inner')

# Check if the merge was successful and preview the merged data
print("Station 1 Merged Data:")
print(station1_merged.head())

print("Station 2 Merged Data:")
print(station2_merged.head())


Station 1 Merged Data:
  model_output_valid_time  NWM_streamflow  USGS_streamflow
0     2021-04-21 01:00:00            0.45             0.19
1     2021-04-21 01:00:00            0.45             0.19
2     2021-04-21 01:00:00            0.45             0.19
3     2021-04-21 01:00:00            0.45             0.19
4     2021-04-21 01:00:00            0.45             0.19
Station 2 Merged Data:
  model_output_valid_time  NWM_streamflow  USGS_streamflow
0     2021-04-21 01:00:00       30.729999            31.43
1     2021-04-21 01:00:00       30.729999            31.15
2     2021-04-21 01:00:00       30.729999            31.43
3     2021-04-21 01:00:00       30.729999            31.43
4     2021-04-21 01:00:00       30.729999            31.15


 Data Normalization and Scaling

In [9]:
# Normalize or scale the numerical features for better model performance
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Normalize the streamflow and other numeric columns
station1_merged[['NWM_streamflow', 'USGS_streamflow']] = scaler.fit_transform(station1_merged[['NWM_streamflow', 'USGS_streamflow']])
station2_merged[['NWM_streamflow', 'USGS_streamflow']] = scaler.fit_transform(station2_merged[['NWM_streamflow', 'USGS_streamflow']])

# Display the normalized data
print("Station 1 Normalized Data:")
display(station1_merged.head())

print("Station 2 Normalized Data:")
display(station2_merged.head())


Station 1 Normalized Data:


Unnamed: 0,model_output_valid_time,NWM_streamflow,USGS_streamflow
0,2021-04-21 01:00:00,0.000196,0.034739
1,2021-04-21 01:00:00,0.000196,0.034739
2,2021-04-21 01:00:00,0.000196,0.034739
3,2021-04-21 01:00:00,0.000196,0.034739
4,2021-04-21 01:00:00,0.000196,0.034739


Station 2 Normalized Data:


Unnamed: 0,model_output_valid_time,NWM_streamflow,USGS_streamflow
0,2021-04-21 01:00:00,0.236816,0.229462
1,2021-04-21 01:00:00,0.236816,0.227403
2,2021-04-21 01:00:00,0.236816,0.229462
3,2021-04-21 01:00:00,0.236816,0.229462
4,2021-04-21 01:00:00,0.236816,0.227403


Split Data into Training and Testing Sets

In [11]:
# Step 1: Extract Year, Month, and Day from 'model_output_valid_time'
station1_merged['year'] = station1_merged['model_output_valid_time'].dt.year
station1_merged['month'] = station1_merged['model_output_valid_time'].dt.month
station1_merged['day'] = station1_merged['model_output_valid_time'].dt.day

station2_merged['year'] = station2_merged['model_output_valid_time'].dt.year
station2_merged['month'] = station2_merged['model_output_valid_time'].dt.month
station2_merged['day'] = station2_merged['model_output_valid_time'].dt.day

# Step 2: Prepare features (X) and target variable (y) for model training
X_station1 = station1_merged[['NWM_streamflow', 'year', 'month', 'day']]  # Example features
y_station1 = station1_merged['USGS_streamflow']  # Target variable

X_station2 = station2_merged[['NWM_streamflow', 'year', 'month', 'day']]  # Example features
y_station2 = station2_merged['USGS_streamflow']  # Target variable

# Split the data into training and testing sets
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_station1, y_station1, test_size=0.2, random_state=42)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_station2, y_station2, test_size=0.2, random_state=42)

# Display the shape of the resulting splits
print(f"Station 1 - Training set shape: {X_train1.shape}, Test set shape: {X_test1.shape}")
print(f"Station 2 - Training set shape: {X_train2.shape}, Test set shape: {X_test2.shape}")


Station 1 - Training set shape: (83954373, 4), Test set shape: (20988594, 4)
Station 2 - Training set shape: (78324279, 4), Test set shape: (19581070, 4)


Save Processed Data for Model Training

In [14]:
# Save the processed data for further use (as Parquet files for speed and size efficiency)
station1_merged.to_parquet('../data/processed/station1_final_processed.parquet', index=False)
station2_merged.to_parquet('../data/processed/station2_final_processed.parquet', index=False)

print("Processed data saved for Station 1 and Station 2 (Parquet format).")

Processed data saved for Station 1 and Station 2 (Parquet format).


Summary

In [None]:
# Provide a brief summary of the preprocessing
print("Data Preprocessing Completed!")
print("Features and target variable are ready for training.")
