In [1]:
import sys
import os

# Get the absolute path of parent folder
current_dir = os.path.abspath("")
parent_dir = os.path.join(current_dir, os.pardir)

# Add to sys.path
sys.path.append(parent_dir)

In [2]:
import ml_combat as ml
from ml_combat import data
from ml_combat.MetaModel import MetaModel

In [3]:
from prophet import Prophet
import pandas as pd
import numpy as np

In [4]:
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

In [5]:
df = ml.data.get_training_cleaned()
dropcols = ['snow_density:kgm3',
    'cloud_base_agl:m',
    'ceiling_height_agl:m', 'weather_data_type']
df.drop(columns=dropcols, inplace=True)
df.dropna(axis=0, subset='absolute_humidity_2m:gm3', inplace=True)

In [6]:
train_data = TimeSeriesDataFrame.from_data_frame(
    df.rename(columns={'y': 'target'}),
    id_column="location",
    timestamp_column="ds",
)

In [7]:
predictor = TimeSeriesPredictor(
    prediction_length=48,
    path="autogluon-m4-hourly",
    target="target",
    eval_metric="MASE",
    ignore_time_index=True
)

predictor.fit(
    train_data,
    presets="medium_quality",
    time_limit=60,
)

TimeSeriesPredictor.fit() called
Setting presets to: medium_quality
Fitting with arguments:
{'enable_ensemble': True,
 'evaluation_metric': 'MASE',
 'excluded_model_types': None,
 'hyperparameter_tune_kwargs': None,
 'hyperparameters': 'medium_quality',
 'num_val_windows': 1,
 'prediction_length': 48,
 'random_seed': None,
 'target': 'target',
 'time_limit': 60,
 'verbosity': 2}
Provided training data set with 89712 rows, 3 items (item = single time series). Average time series length is 29904.0. Data frequency is 'S'.
AutoGluon will save models to autogluon-m4-hourly/
AutoGluon will gauge predictive performance using evaluation metric: 'MASE'
	This metric's sign has been flipped to adhere to being 'higher is better'. The reported score can be multiplied by -1 to get the metric value.

Provided dataset contains following columns:
	target:           'target'
	past covariates:  ['absolute_humidity_2m:gm3', 'air_density_2m:kgm3', 'clear_sky_energy_1h:J', 'clear_sky_rad:W', 'dew_or_rime:id

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x29f8ed060>

In [None]:
import matplotlib.pyplot as plt
import numpy as np

item_id = "H1"
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=[10, 4], sharex=True)
train_ts = train_data.loc[item_id]
test_ts = test_data.loc[item_id]
ax1.set_title("Train data (past time series values)")
ax1.plot(train_ts)
ax2.set_title("Test data (past + future time series values)")
ax2.plot(test_ts)
for ax in (ax1, ax2):
    ax.fill_between(np.array([train_ts.index[-1], test_ts.index[-1]]), test_ts.min(), test_ts.max(), color="C1", alpha=0.3, label="Forecast horizon")
plt.legend()
plt.show()

In [8]:
for loc in ['A', 'B', 'C']:
    df['y_lagged_1yr'] = df.y.shift(-365*34)

Unnamed: 0,location,ds,y,absolute_humidity_2m:gm3,air_density_2m:kgm3,clear_sky_energy_1h:J,clear_sky_rad:W,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,...,sun_azimuth:d,sun_elevation:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms
0,A,2019-06-02 22:00:00,0.00,7.700,1.22825,0.000000,0.000,0.0,280.299988,0.000,...,348.036758,-3.77425,0.000,286.224998,100.000000,40386.475586,3.600,-3.575,-0.500,0.0
1,A,2019-06-02 23:00:00,0.00,7.700,1.22350,0.000000,0.000,0.0,280.299988,0.000,...,91.980751,-4.35725,0.000,286.899994,100.000000,33770.649414,3.350,-3.350,0.275,0.0
2,A,2019-06-03 00:00:00,0.00,7.875,1.21975,0.000000,0.000,0.0,280.650002,0.000,...,14.934750,-3.30950,0.000,286.949997,100.000000,13595.500000,3.050,-2.950,0.750,0.0
3,A,2019-06-03 01:00:00,0.00,8.425,1.21800,208.649994,0.750,0.0,281.674995,0.300,...,28.630250,-0.82250,0.000,286.750000,100.000000,2321.850037,2.725,-2.600,0.875,0.0
4,A,2019-06-03 02:00:00,19.36,8.950,1.21800,32468.150269,23.100,0.0,282.500000,11.975,...,41.997500,3.05125,0.000,286.449997,99.225000,11634.799683,2.550,-2.350,0.925,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101358,C,2023-04-30 19:00:00,50.96,4.400,1.27550,84010.151367,4.175,0.0,272.025002,2.775,...,304.936501,-0.20150,0.000,274.925003,97.725000,25027.999512,4.075,3.600,1.875,0.0
101359,C,2023-04-30 20:00:00,2.94,4.400,1.27850,2206.800018,0.000,0.0,271.949997,0.000,...,318.620499,-5.20400,0.000,274.575005,95.850002,23995.599609,3.600,2.950,2.125,0.0
101360,C,2023-04-30 21:00:00,0.00,4.400,1.27900,0.000000,0.000,0.0,271.899994,0.000,...,332.780251,-8.98450,0.025,274.399994,95.925001,23068.600098,3.600,2.625,2.400,0.0
101361,C,2023-04-30 22:00:00,0.00,4.400,1.27975,0.000000,0.000,0.0,271.949997,0.000,...,347.373245,-11.27050,0.125,274.225006,99.425001,11856.700195,3.275,2.325,2.325,0.0


In [9]:
df = df[df.location == 'A']
# Calculate the lag based on one year
one_year_lag = pd.DateOffset(years=1)

# Create a one-year lagged feature
df['ds_lagged_1yr'] = df['ds'] - one_year_lag

# Merge the lagged feature back to the original DataFrame
lagged_df = df.merge(df[['ds', 'y']], left_on='ds_lagged_1yr', right_on='ds', how='left')
lagged_df = lagged_df.rename(columns={'y_x': 'y', 'y_y': 'y_lagged_1yr'})
lagged_df = lagged_df.drop(columns=['ds_y'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ds_lagged_1yr'] = df['ds'] - one_year_lag


In [13]:
lagged_df.corr()

  lagged_df.corr()


Unnamed: 0,y,absolute_humidity_2m:gm3,air_density_2m:kgm3,clear_sky_energy_1h:J,clear_sky_rad:W,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,direct_rad:W,...,sun_elevation:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,y_lagged_1yr
y,1.0,0.237477,-0.367847,0.798949,0.811664,-0.059754,0.247368,0.708496,0.701035,0.867413,...,0.68804,-0.119872,0.348635,-0.193119,0.245593,-0.08186,-0.057263,-0.273952,-0.005179,0.674802
absolute_humidity_2m:gm3,0.237477,1.0,-0.811902,0.366492,0.363609,0.134031,0.974455,0.295421,0.301467,0.244952,...,0.435043,0.143552,0.907406,0.057273,-0.157999,-0.137928,0.085977,-0.250425,0.006652,0.27403
air_density_2m:kgm3,-0.367847,-0.811902,1.0,-0.474561,-0.464314,-0.068119,-0.845149,-0.403717,-0.41645,-0.384466,...,-0.498803,-0.092952,-0.914967,-0.124434,-0.004826,-0.036096,-0.011786,0.140529,-0.002249,-0.36041
clear_sky_energy_1h:J,0.798949,0.366492,-0.474561,1.0,0.992466,-0.07242,0.370643,0.912896,0.923219,0.725529,...,0.837526,0.025676,0.443913,0.001641,0.081559,0.010348,0.075354,-0.341231,-0.004584,0.806224
clear_sky_rad:W,0.811664,0.363609,-0.464314,0.992466,1.0,-0.071797,0.367717,0.918973,0.915726,0.732751,...,0.841102,0.026736,0.430447,0.001741,0.072618,0.000892,0.070268,-0.326764,-0.00652,0.818656
dew_or_rime:idx,-0.059754,0.134031,-0.068119,-0.07242,-0.071797,1.0,0.134111,-0.073139,-0.074761,-0.050493,...,-0.063692,-0.027722,0.119601,-0.024626,-0.033868,-0.103045,-0.031543,0.002082,-0.001627,-0.060534
dew_point_2m:K,0.247368,0.974455,-0.845149,0.370643,0.367717,0.134111,1.0,0.303208,0.309298,0.246306,...,0.439539,0.175667,0.923649,0.093788,-0.182689,-0.091685,0.12719,-0.235942,0.00851,0.278184
diffuse_rad:W,0.708496,0.295421,-0.403717,0.912896,0.918973,-0.073139,0.303208,1.0,0.988092,0.563216,...,0.806737,-0.053111,0.35753,0.085366,0.095721,-0.001287,0.068402,-0.298511,-0.004533,0.758452
diffuse_rad_1h:J,0.701035,0.301467,-0.41645,0.923219,0.915726,-0.074761,0.309298,0.988092,1.0,0.564083,...,0.807722,-0.049323,0.372564,0.080312,0.103443,0.006169,0.072707,-0.314099,-0.002331,0.749349
direct_rad:W,0.867413,0.244952,-0.384466,0.725529,0.732751,-0.050493,0.246306,0.563216,0.564083,1.0,...,0.606938,-0.182468,0.380728,-0.292501,0.309994,-0.095268,-0.094143,-0.276244,-0.003078,0.591861


In [19]:
df = ml.data.get_training_cleaned()

lagged_dfs = []  # A list to store lagged DataFrames for each location

# Calculate the lag based on one year
one_year_lag = pd.DateOffset(years=1)

# Iterate over unique locations
for location in df['location'].unique():
    location_df = df[df['location'] == location]  # Filter by location
    
    # Create a one-year lagged feature for this location
    location_df['ds_lagged_1yr'] = location_df['ds'] - one_year_lag
    
    # Merge the lagged feature back to the original DataFrame
    lagged_location_df = location_df.merge(location_df[['ds', 'y']], left_on='ds_lagged_1yr', right_on='ds', how='left')
    lagged_location_df = lagged_location_df.rename(columns={'y_x': 'y', 'y_y': 'y_lagged_1yr', 'ds_x': 'ds'})
    lagged_location_df = lagged_location_df.drop(columns=['ds_y', 'ds_lagged_1yr'])
    
    lagged_dfs.append(lagged_location_df)

# Combine all the lagged DataFrames for different locations
result_df = pd.concat(lagged_dfs)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  location_df['ds_lagged_1yr'] = location_df['ds'] - one_year_lag
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  location_df['ds_lagged_1yr'] = location_df['ds'] - one_year_lag
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  location_df['ds_lagged_1yr'] = location_df['ds'] - one_year_lag


In [20]:
result_df

Unnamed: 0,location,ds,y,weather_data_type,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,...,sun_elevation:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,y_lagged_1yr
0,A,2019-06-02 22:00:00,0.00,observed,7.700,1.22825,1728.950012,0.000000,0.000,1728.950012,...,-3.77425,0.000,286.224998,100.000000,40386.475586,3.600,-3.575,-0.500,0.0,
1,A,2019-06-02 23:00:00,0.00,observed,7.700,1.22350,1689.824982,0.000000,0.000,1689.824982,...,-4.35725,0.000,286.899994,100.000000,33770.649414,3.350,-3.350,0.275,0.0,
2,A,2019-06-03 00:00:00,0.00,observed,7.875,1.21975,1563.225006,0.000000,0.000,1563.225006,...,-3.30950,0.000,286.949997,100.000000,13595.500000,3.050,-2.950,0.750,0.0,
3,A,2019-06-03 01:00:00,0.00,observed,8.425,1.21800,1283.425018,208.649994,0.750,1283.425018,...,-0.82250,0.000,286.750000,100.000000,2321.850037,2.725,-2.600,0.875,0.0,
4,A,2019-06-03 02:00:00,19.36,observed,8.950,1.21800,1003.500000,32468.150269,23.100,1003.500000,...,3.05125,0.000,286.449997,99.225000,11634.799683,2.550,-2.350,0.925,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26021,C,2023-04-30 19:00:00,50.96,estimated,4.400,1.27550,1456.574982,84010.151367,4.175,551.224991,...,-0.20150,0.000,274.925003,97.725000,25027.999512,4.075,3.600,1.875,0.0,
26022,C,2023-04-30 20:00:00,2.94,estimated,4.400,1.27850,1476.350006,2206.800018,0.000,564.099991,...,-5.20400,0.000,274.575005,95.850002,23995.599609,3.600,2.950,2.125,0.0,
26023,C,2023-04-30 21:00:00,0.00,estimated,4.400,1.27900,1516.299988,0.000000,0.000,578.699997,...,-8.98450,0.025,274.399994,95.925001,23068.600098,3.600,2.625,2.400,0.0,
26024,C,2023-04-30 22:00:00,0.00,estimated,4.400,1.27975,1240.600006,0.000000,0.000,551.500000,...,-11.27050,0.125,274.225006,99.425001,11856.700195,3.275,2.325,2.325,0.0,
