In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

import tqdm

import glob
import os


# Forest size; equals the n_estimators value given to the random forests in this notebook.
NUM_TREES = 100
# Forecast horizon: number of target columns built further down
# (obsdis itself plus DAYS_AHEAD - 1 forward-shifted copies of it).
DAYS_AHEAD = 14

# Show the installed pandas version.
pd.__version__

'2.2.2'

In [12]:
def create_features(df, lags=[1, 2, 3], dropna=True):
    """
    Create lagged features for time series data.

    For every lag ``k`` in ``lags`` two columns are added: ``P_lag_time_k``
    (the ``tp`` column shifted down by ``k`` rows) and ``Q_lag_time_k``
    (the ``obsdis`` column shifted down by ``k`` rows).

    Parameters:
    - df (DataFrame): The input DataFrame; must contain the columns "tp" and "obsdis".
      It is not modified.
    - lags (list): A list of integers representing the lag periods for creating features. Default is [1, 2, 3].
    - dropna (bool): Whether to drop rows with missing values after creating features. Default is True.

    Returns:
    - df_features (DataFrame): A new DataFrame holding "tp", "obsdis", then all
      P_lag_time_* columns, then all Q_lag_time_* columns (in the order of ``lags``).

    Raises:
    - KeyError: if "tp" or "obsdis" is missing from ``df``.
    """

    # Column names are derived from the lag values themselves (not from their
    # position in the list) so that non-consecutive lags such as [1, 7, 14] work.
    lags = list(lags)
    p_cols = [f"P_lag_time_{lag}" for lag in lags]
    q_cols = [f"Q_lag_time_{lag}" for lag in lags]

    df_features = df.copy()
    for lag, col in zip(lags, p_cols):
        df_features[col] = df_features["tp"].shift(periods=lag)

    for lag, col in zip(lags, q_cols):
        df_features[col] = df_features["obsdis"].shift(periods=lag)

    all_cols = ["tp", "obsdis"] + p_cols + q_cols
    df_features = df_features[all_cols]

    if dropna:
        # Rebind instead of mutating in place: the frame above is a column
        # selection of another frame.
        df_features = df_features.dropna()

    return df_features

In [13]:
def create_train_test_splits(df_features, split_ratio=0.8):
    """
    Split a station dataframe into training and testing sets for machine learning modeling.

    The split is positional: the first ``split_ratio`` share of the rows is
    the training set and the remaining rows are the test set (no shuffling).

    Parameters:
    - df_features (pandas.DataFrame): Either a raw frame with the columns "tp" and
      "obsdis" (lag features for lags 1, 2, 3 are then built via create_features),
      or a frame that already contains the P_lag_time_1..3 / Q_lag_time_1..3
      columns, which is then used as-is instead of being lagged a second time.
    - split_ratio (float, optional): The ratio of the dataset to be used for training. Default is 0.8.

    Returns:
    - x_train (pandas.DataFrame): The training features (lag columns only).
    - y_train (pandas.Series): The training target ("obsdis").
    - x_test (pandas.DataFrame): The testing features (lag columns only).
    - y_test (pandas.Series): The testing target ("obsdis").
    """

    lags = [1, 2, 3]
    lag_cols = [f"P_lag_time_{lag}" for lag in lags] + [
        f"Q_lag_time_{lag}" for lag in lags
    ]

    if set(lag_cols).issubset(df_features.columns):
        # Features are already present. Re-running create_features here would
        # shift the already-trimmed frame again, discarding further rows and
        # computing lags across any rows that were dropped earlier.
        df_features = df_features[["tp", "obsdis"] + lag_cols].dropna()
    else:
        df_features = create_features(df_features, lags=lags, dropna=True)

    split_index = int(split_ratio * df_features.shape[0])

    trainset = df_features.iloc[:split_index]
    testset = df_features.iloc[split_index:]

    x_train = trainset.drop(columns=["obsdis", "tp"])
    y_train = trainset["obsdis"]

    x_test = testset.drop(columns=["obsdis", "tp"])
    y_test = testset["obsdis"]

    return x_train, y_train, x_test, y_test

In [14]:
# Fit a single-output random forest per station file and record its test score.
# NOTE(review): glob.glob returns files in OS-dependent order, so "the first
# file" is not guaranteed to be the same station on every machine - confirm
# whether the list should be sorted.
files = glob.glob("../../gapped_data_contiguous/*.csv")
files = files[0:10]  # just do the first 10 stations for reference
scores = []

for file in tqdm.tqdm(files):

    df = pd.read_csv(file)
    # NOTE(review): data_filled is computed but not consumed; features below
    # are built from the unfilled `df`. Confirm which of the two is intended.
    data_filled = df.ffill()

    # create_train_test_splits builds the lag features itself, so hand it the
    # loaded frame directly instead of lagging the data twice.
    x_train, y_train, x_test, y_test = create_train_test_splits(
        df, split_ratio=0.8
    )

    rf = RandomForestRegressor(n_estimators=NUM_TREES, random_state=42)
    rf.fit(x_train, y_train)

    # removesuffix (Python >= 3.9) drops only a literal trailing ".0".
    # str.strip(".0") strips any run of '.' and '0' characters from both ends,
    # which turns e.g. "100.0" into "1".
    station = os.path.basename(file).split("_cleaned")[0].removesuffix(".0")
    score = rf.score(x_test, y_test)
    scores.append((station, score))
    # NOTE(review): the loop stops after the first file, so `scores` holds one
    # entry. Later cells reuse `df` from this iteration; removing the break
    # changes which station they operate on.
    break
     

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:01<?, ?it/s]


In [18]:
# Rebuild the lag features for the station frame `df` left over from the loop cell.
df_features = create_features(df, lags=[1, 2, 3], dropna=True)

# obsdis_k holds the obsdis value k rows ahead, for k = 1 .. DAYS_AHEAD - 1;
# together with obsdis itself this gives DAYS_AHEAD target columns.
for step in range(1, DAYS_AHEAD):
    df_features[f"obsdis_{step}"] = df_features["obsdis"].shift(-step)

# The trailing rows have no complete horizon (NaN after the shift): discard them.
df_features = df_features.dropna()

In [19]:
# Model inputs: the lag columns derived from tp (P_*) and from obsdis (Q_*).
feature_columns = [
    "P_lag_time_1",
    "P_lag_time_2",
    "P_lag_time_3",
    "Q_lag_time_1",
    "Q_lag_time_2",
    "Q_lag_time_3",
]
X = df_features[feature_columns]

# Targets: obsdis plus every forward-shifted copy obsdis_1 .. obsdis_{DAYS_AHEAD - 1}.
target_columns = ["obsdis"]
target_columns += [f"obsdis_{step}" for step in range(1, DAYS_AHEAD)]
targets = df_features[target_columns]

# Positional split: the first 80% of the rows train, the remainder tests.
cut_off = int(targets.shape[0] * 0.8)

mo_train_x, mo_test_x = X[:cut_off], X[cut_off:]
mo_train_y, mo_test_y = targets[:cut_off], targets[cut_off:]

mo_train_x.shape, mo_test_x.shape, mo_train_y.shape, mo_test_y.shape

((4356, 6), (1090, 6), (4356, 14), (1090, 14))

In [20]:
# Fit the multi-output model: MultiOutputRegressor wraps the random forest so
# that every column of mo_train_y gets its own fitted estimator.
# The forest size comes from the NUM_TREES config constant rather than a
# repeated literal.
multioutputregressor = MultiOutputRegressor(
    RandomForestRegressor(n_estimators=NUM_TREES, random_state=42)
).fit(mo_train_x, mo_train_y)

In [21]:
# Score the fitted multi-output model on the held-out rows.
multioutputregressor.score(mo_test_x, mo_test_y)

0.8952165433463187

In [48]:
# Predict all target columns for every held-out row.
# NOTE(review): the saved output of this cell reports a different row count
# than the split cell, and the execution counts are out of order - the
# notebook should be re-run top to bottom before trusting the stored outputs.
predictions = multioutputregressor.predict(mo_test_x)

# One row per test sample, one column per target.
predictions.shape

(1088, 14)

In [49]:
# Side-by-side look at the second test row: predicted values vs. observed targets.
predictions[1], mo_test_y.iloc[1]

(array([2954.7, 2920.3, 2896.6, 2854.4, 2839.6, 2812.8, 2783.8, 2780.3,
        2767.6, 2779.5, 2731.3, 2699.9, 2678.1, 2661.3]),
 obsdis       2990.0
 obsdis_1     2950.0
 obsdis_2     2950.0
 obsdis_3     2910.0
 obsdis_4     2890.0
 obsdis_5     2870.0
 obsdis_6     2850.0
 obsdis_7     2820.0
 obsdis_8     2810.0
 obsdis_9     2780.0
 obsdis_10    2710.0
 obsdis_11    2540.0
 obsdis_12    2490.0
 obsdis_13    2470.0
 Name: 4365, dtype: float64)

In [22]:
# Display the held-out feature rows.
mo_test_x

Unnamed: 0,P_lag_time_1,P_lag_time_2,P_lag_time_3,Q_lag_time_1,Q_lag_time_2,Q_lag_time_3
4376,0.488605,1.240209,2.199107,2710.0,2780.0,2810.0
4377,0.179818,0.488605,1.240209,2540.0,2710.0,2780.0
4378,0.200976,0.179818,0.488605,2490.0,2540.0,2710.0
4379,0.138875,0.200976,0.179818,2470.0,2490.0,2540.0
4380,0.155483,0.138875,0.200976,2460.0,2470.0,2490.0
...,...,...,...,...,...,...
5461,0.535394,0.504948,0.464211,2580.0,2600.0,2630.0
5462,0.863708,0.535394,0.504948,2550.0,2580.0,2600.0
5463,1.037927,0.863708,0.535394,2530.0,2550.0,2580.0
5464,1.367421,1.037927,0.863708,2490.0,2530.0,2550.0


In [26]:
import joblib
# Target file for the serialized model (the file name is hard-coded).
station_model_name = "mo_station_518.pkl"

# Persist the fitted multi-output model to disk.
joblib.dump( multioutputregressor, station_model_name )

['mo_station_518.pkl']

In [28]:
import joblib

# Reload the persisted model and run it on one hand-written feature row.
# NOTE: joblib.load unpickles the file; only load model files you trust.
station_model_name = "mo_station_518.pkl"
mo_regressor = joblib.load(station_model_name)

# Keys match the feature column names the model was fitted on.
sample_features = {
    "P_lag_time_1": 2.372996,
    "P_lag_time_2": 9.947697,
    "P_lag_time_3": 0.009867,
    "Q_lag_time_1": 68.500000,
    "Q_lag_time_2": 79.199997,
    "Q_lag_time_3": 76.800003,
}
test_df = pd.DataFrame([sample_features])

mo_regressor.predict(test_df)

array([[ 879.86,  889.4 ,  908.11,  955.46,  973.27, 1036.01, 1041.62,
        1069.79, 1088.86, 1111.55, 1162.48, 1178.34, 1174.13, 1195.35]])