In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

import tqdm

import glob
import os


NUM_TREES = 100
DAYS_AHEAD = 14

pd.__version__

'2.2.2'

In [21]:
def create_features(df, lags=[1, 2, 3], dropna=True):
    """
    Create lagged features for time series data.

    Parameters:
    - df (DataFrame): The input DataFrame containing the time series data.
    - lags (list): A list of integers representing the lag periods for creating features. Default is [1, 2, 3].
    - dropna (bool): Whether to drop rows with missing values after creating features. Default is True.

    Returns:
    - df_features (DataFrame): The DataFrame with lagged features created from the input data.

    """

    df_features = df.copy()
    for lag in lags:
        df_features[f"P_lag_time_{lag}"] = df_features["tp"].shift(periods=lag)

    for lag in lags:
        df_features[f"Q_lag_time_{lag}"] = df_features["obsdis"].shift(periods=lag)

    P_cols = [f"P_lag_time_{idx}" for idx in range(1, len(lags) + 1)]
    Q_cols = [f"Q_lag_time_{idx}" for idx in range(1, len(lags) + 1)]

    all_cols = ["tp", "obsdis"] + P_cols + Q_cols
    df_features = df_features[all_cols]

    if dropna:
        df_features.dropna(inplace=True)

    return df_features

In [22]:
def create_train_test_splits(df_features, split_ratio=0.8):
    """
    Split the input dataframe into training and testing sets for machine learning modeling.

    Parameters:
    - df_features (pandas.DataFrame): The input dataframe containing the features and target variable.
    - split_ratio (float, optional): The ratio of the dataset to be used for training. Default is 0.8.

    Returns:
    - x_train (pandas.DataFrame): The training features dataframe.
    - y_train (pandas.Series): The training target variable series.
    - x_test (pandas.DataFrame): The testing features dataframe.
    - y_test (pandas.Series): The testing target variable series.
    """

    df_features = create_features(df_features, lags=[1, 2, 3], dropna=True)
    df_features.head()

    split_index = int(split_ratio * df_features.shape[0])

    trainset = df_features[:split_index]
    testset = df_features[split_index:]

    x_train = trainset.drop(columns=["obsdis", "tp"])
    y_train = trainset["obsdis"]

    x_test = testset.drop(columns=["obsdis", "tp"])
    y_test = testset["obsdis"]

    return x_train, y_train, x_test, y_test

In [23]:
files = glob.glob("../../gapped_data_contiguous/*.csv")
files = files[0:10]  # just do the fist 10 stations for reference
scores = []

for file in tqdm.tqdm(files):

    df = pd.read_csv(file)
    data_filled = df.ffill()

    df_features = create_features(df, lags=[1, 2, 3], dropna=True)
    x_train, y_train, x_test, y_test = create_train_test_splits(
        df_features, split_ratio=0.8
    )

    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(x_train, y_train)

    station = os.path.basename(file).split("_cleaned")[0].strip(".0")
    score = rf.score(x_test, y_test)
    scores.append((station, score))
     

100%|██████████| 10/10 [00:12<00:00,  1.22s/it]


In [24]:
# order scores, worst on top
df_scores = pd.DataFrame(scores, columns=["station", "score"])
df_scores.sort_values(by="score", ascending=True)

Unnamed: 0,station,score
2,station_4649,0.907675
8,station_927,0.909754
6,station_4649,0.938103
7,station_1712,0.993407
5,station_1715,0.994725
0,station_958,0.995895
9,station_958,0.996001
3,station_962,0.996194
4,station_1718,0.997981
1,station_1717,0.998032


In [42]:
df_features.head()

Unnamed: 0,tp,obsdis,P_lag_time_1,P_lag_time_2,P_lag_time_3,Q_lag_time_1,Q_lag_time_2,Q_lag_time_3,obsdis_1,obsdis_2,...,obsdis_4,obsdis_5,obsdis_6,obsdis_7,obsdis_8,obsdis_9,obsdis_10,obsdis_11,obsdis_12,obsdis_13
3,0.138445,2380.0,0.110283,0.30414,0.744781,2400.0,2420.0,2460.0,2360.0,2350.0,...,2290.0,2270.0,2320.0,2380.0,2440.0,2460.0,2440.0,2390.0,2370.0,2360.0
4,0.222622,2360.0,0.138445,0.110283,0.30414,2380.0,2400.0,2420.0,2350.0,2320.0,...,2270.0,2320.0,2380.0,2440.0,2460.0,2440.0,2390.0,2370.0,2360.0,2320.0
5,0.955903,2350.0,0.222622,0.138445,0.110283,2360.0,2380.0,2400.0,2320.0,2290.0,...,2320.0,2380.0,2440.0,2460.0,2440.0,2390.0,2370.0,2360.0,2320.0,2290.0
6,2.469969,2320.0,0.955903,0.222622,0.138445,2350.0,2360.0,2380.0,2290.0,2270.0,...,2380.0,2440.0,2460.0,2440.0,2390.0,2370.0,2360.0,2320.0,2290.0,2260.0
7,0.384858,2290.0,2.469969,0.955903,0.222622,2320.0,2350.0,2360.0,2270.0,2320.0,...,2440.0,2460.0,2440.0,2390.0,2370.0,2360.0,2320.0,2290.0,2260.0,2250.0


In [43]:

for i in range(DAYS_AHEAD-1):  # 5+1 days forecast
    df_features[f"obsdis_{i+1}"] = df_features.obsdis.shift(-1*(i+1))

df_features.dropna(inplace=True)

In [44]:
df_features.shape

(5439, 21)

In [45]:
X = df_features[
    [
        "P_lag_time_1",
        "P_lag_time_2",
        "P_lag_time_3",
        "Q_lag_time_1",
        "Q_lag_time_2",
        "Q_lag_time_3",
    ]
]

target_columns = [ "obsdis"] + [f"obsdis_{i}" for i in range(1, DAYS_AHEAD)]
targets = df_features[target_columns]

cut_off = int(targets.shape[0] * 0.8)

mo_train_x = X[0:cut_off]
mo_train_y = targets[0:cut_off]


mo_test_x = X[cut_off:]
mo_test_y = targets[cut_off:]


mo_train_x.shape, mo_test_x.shape, mo_train_y.shape, mo_test_y.shape

((4351, 6), (1088, 6), (4351, 14), (1088, 14))

In [46]:
multioutputregressor = MultiOutputRegressor(
    RandomForestRegressor(n_estimators=100, random_state=42)
).fit(mo_train_x, mo_train_y)

In [47]:
multioutputregressor.score(mo_test_x, mo_test_y)

0.8956160194057989

In [48]:
predictions = multioutputregressor.predict(mo_test_x)

predictions.shape

(1088, 14)

In [49]:
predictions[1], mo_test_y.iloc[1]

(array([2954.7, 2920.3, 2896.6, 2854.4, 2839.6, 2812.8, 2783.8, 2780.3,
        2767.6, 2779.5, 2731.3, 2699.9, 2678.1, 2661.3]),
 obsdis       2990.0
 obsdis_1     2950.0
 obsdis_2     2950.0
 obsdis_3     2910.0
 obsdis_4     2890.0
 obsdis_5     2870.0
 obsdis_6     2850.0
 obsdis_7     2820.0
 obsdis_8     2810.0
 obsdis_9     2780.0
 obsdis_10    2710.0
 obsdis_11    2540.0
 obsdis_12    2490.0
 obsdis_13    2470.0
 Name: 4365, dtype: float64)