In [23]:
import pandas as pd
from darts import TimeSeries
import numpy as np



import warnings
warnings.filterwarnings("ignore")

def fix_datetime(X, name):
    """
    Function to fix and standardize datetime in the given DataFrame.
    
    Parameters:
    - X: DataFrame to be modified.
    - name: String representing the name of the DataFrame, used for logging.
    
    Returns:
    - Modified DataFrame with standardized datetime.
    """

    # Convert 'date_forecast' to datetime format and replace original column with 'ds'
    X['ds'] = pd.to_datetime(X['date_forecast'])
    X.drop(columns=['date_forecast'], inplace=True, errors='ignore')

    # Sort DataFrame by the new datetime column ('ds') and set it as the index
    X.sort_values(by='ds', inplace=True)
    X.set_index('ds', inplace=True)

    # Log the shape of the DataFrame before dropping rows with in-between minutes
    print(f"Shape of {name} before dropping in-between hour rows: ", X.shape)

    # Identify and log gaps in the date sequence
    print(f"HEIHEI: {name} gaps in dates: ", X.index.to_series().diff().dt.total_seconds().gt(60*15).sum())
    print(f"HEIHEI: {name} first gap in dates: ", X[X.index.to_series().diff().dt.total_seconds().gt(60*15)==True].index[:1])

    # Calculate and log the size of each gap in the date sequence
    temp = X.index.to_series().diff().dt.total_seconds()
    if temp.shape[0] > 0:
        print(f"HEIHEI: {name} list of size (in days) of each gap: ", temp[temp.gt(60*15)].values / (60*60*24))
    
    # temporarily transform into darts time series to fill missing dates
    # get date_calc if date_calc is column in X
    temp_calc = None
    if "date_calc" in X.columns:
        temp_calc = X["date_calc"]
        X.drop(columns=['date_calc'], inplace=True)
    X = TimeSeries.from_dataframe(df=X, freq="15T", fill_missing_dates=True, fillna_value=None).pd_dataframe()
    if temp_calc is not None:
        X["date_calc"] = temp_calc

    print(f"HEIHEI: {name} gaps in dates after filling missing dates: ", X.index.to_series().diff().dt.total_seconds().gt(60*15).sum())


    # Drop rows where the minute part of the time is not 0
    X = X[X.index.minute == 0]

    # Log the shape of the DataFrame after dropping rows with in-between minutes
    print(f"Shape of {name} after dropping in-between hour rows: ", X.shape)

    return X



def convert_to_datetime(X_train_observed, X_train_estimated, X_test, y_train):
    X_train_observed = fix_datetime(X_train_observed, "X_train_observed")
    X_train_estimated = fix_datetime(X_train_estimated, "X_train_estimated")
    X_test = fix_datetime(X_test, "X_test")


    X_train_observed["estimated_diff_hours"] = 0
    X_train_estimated["estimated_diff_hours"] = (X_train_estimated.index - pd.to_datetime(X_train_estimated["date_calc"])).dt.total_seconds() / 3600.0
    X_test["estimated_diff_hours"] = (X_test.index - pd.to_datetime(X_test["date_calc"])).dt.total_seconds() / 3600.0

    X_train_estimated.drop(columns=['date_calc'], inplace=True)
    X_test.drop(columns=['date_calc'], inplace=True)

    y_train['ds'] = pd.to_datetime(y_train['time'])
    y_train.drop(columns=['time'], inplace=True)
    y_train.sort_values(by='ds', inplace=True)
    y_train.set_index('ds', inplace=True)

    return X_train_observed, X_train_estimated, X_test, y_train




# location_map = {
#     "A": 0,
#     "B": 1,
#     "C": 2
# }


def preprocess_data(X_train_observed, X_train_estimated, X_test, y_train, location):
    # convert to datetime
    X_train_observed, X_train_estimated, X_test, y_train = convert_to_datetime(X_train_observed, X_train_estimated, X_test, y_train)


    # # cast all columns to float64
    # X_train = X_train.astype('float64')
    # X_test = X_test.astype('float64')


    print(f"X_train_observed shape: {X_train_observed.shape}")
    print(f"X_train_estimated shape: {X_train_estimated.shape}")
    print(f"X_test shape: {X_test.shape}")
    print(f"y_train shape: {y_train.shape}")



    y_train["y"] = y_train["pv_measurement"].astype('float64')
    y_train.drop(columns=['pv_measurement'], inplace=True)
    print("y_train columns: ", y_train.columns)

    # temporarily transform into darts time series to fill missing dates
    print("Shape of y_train before filling missing dates: ", y_train.shape)
    y_train = TimeSeries.from_dataframe(df=y_train, freq="H", fill_missing_dates=True, fillna_value=None).pd_dataframe()
    print("Shape of y_train after filling missing dates: ", y_train.shape)


    # number of gaps in X_train_observed + X_train_estimated before
    print(f"LOOK: Number of gaps in X_train_observed plus number of gaps in X_train_estimated before: ", X_train_observed.index.to_series().diff().dt.total_seconds().gt(3600).sum() + X_train_estimated.index.to_series().diff().dt.total_seconds().gt(3600).sum())
    X_train = pd.concat([X_train_observed, X_train_estimated])
    print(f"LOOK: Number of gaps in X_train_observed plus number of gaps in X_train_estimated after: ", X_train.index.to_series().diff().dt.total_seconds().gt(3600).sum())
    # print size of gaps in X_train
    temp = X_train.index.to_series().diff().dt.total_seconds()
    if temp.shape[0] > 0:
        print("LOOK: list of size (in days) of each gap: ", temp[temp.gt(3600)].values / (60*60*24))
    print("if the number is bigger after than before that means there is a gap in time between the observed and estimated training sets")

    # print info on dates in X_train, and if there are any missing dates
    print("X_train dates info: ", X_train.index.min(), X_train.index.max(), X_train.index.max() - X_train.index.min())
    print("X_test dates info: ", X_test.index.min(), X_test.index.max(), X_test.index.max() - X_test.index.min())
    print("y_train dates info: ", y_train.index.min(), y_train.index.max(), y_train.index.max() - y_train.index.min())

    # any gaps in dates?
    print("X_train gaps in dates: ", X_train.index.to_series().diff().dt.total_seconds().gt(3600).sum())
    print("X_test gaps in dates: ", X_test.index.to_series().diff().dt.total_seconds().gt(3600).sum())
    print("y_train gaps in dates: ", y_train.index.to_series().diff().dt.total_seconds().gt(3600).sum())

    # temporarily transform into darts time series to fill missing dates
    X_train = TimeSeries.from_dataframe(df=X_train, freq="H", fill_missing_dates=True, fillna_value=None).pd_dataframe()
    X_test = TimeSeries.from_dataframe(df=X_test, freq="H", fill_missing_dates=True, fillna_value=None).pd_dataframe()
    print("X_train gaps in dates after filling missing dates: ", X_train.index.to_series().diff().dt.total_seconds().gt(3600).sum())
    print("X_test gaps in dates after filling missing dates: ", X_test.index.to_series().diff().dt.total_seconds().gt(3600).sum())

    

    # clip all y values to 0 if negative
    y_train["y"] = y_train["y"].clip(lower=0)
    
    # print Number of missing values in X train
    print("Number of missing values in X_train: ", X_train.isnull().sum().sum())
    print("Number of missing values in X_test: ", X_test.isnull().sum().sum())
    # y_train missing values
    print("Number of missing values in y_train: ", y_train.isnull().sum().sum())
    X_train = pd.merge(X_train, y_train, how="outer", left_index=True, right_index=True)
    print("Number of missing values in X_train after merging with y_train: ", X_train.drop(columns=['y']).isnull().sum().sum())



    X_train["location"] = location
    X_test["location"] = location
    
    return X_train, X_test
    


# Define locations
locations = ['A', 'B', 'C']

X_trains = []
X_tests = []
y_trains = []
# Loop through locations
for loc in locations:
    print("\n\n")
    print(f"Processing location {loc}...")
    # Read target training data
    y_train = pd.read_parquet(f'{loc}/train_targets.parquet')
    
    # Read estimated training data and add location feature
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    
    # Read observed training data and add location feature
    X_train_observed= pd.read_parquet(f'{loc}/X_train_observed.parquet')

    # Read estimated test data and add location feature
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')
    
    # Concatenate observed and estimated datasets for each location
    #X_train = pd.concat([X_train_estimated, X_train_observed])
    



    # Preprocess data
    X_train, X_test = preprocess_data(X_train_observed, X_train_estimated, X_test_estimated, y_train, loc)

    print(f"Final shape of X_train for location {loc}: ", X_train.shape)
    print(f"Final shape of X_test for location {loc}: ", X_test.shape)

    # print(y_train.head(), y_train.shape)
    # print(X_train.head(), X_train.shape)
    # print(X_train.head(), X_train.shape)
    # print(type(X_train['y']))

    # Save data to csv
    X_train.to_csv(f'{loc}/X_train.csv', index=True)
    X_test.to_csv(f'{loc}/X_test.csv', index=True)


    X_trains.append(X_train)
    X_tests.append(X_test)

# Concatenate all data and save to csv
X_train = pd.concat(X_trains)
X_test = pd.concat(X_tests)

print(f"Final shape of X_train: ", X_train.shape)
print(f"Final shape of X_test: ", X_test.shape)

X_train.to_csv('X_train_raw.csv', index=True)
X_test.to_csv('X_test_raw.csv', index=True)


# save where nan y values are dropped
X_train_non_nan = X_train.dropna(subset=['y'])
X_train_non_nan.to_csv('X_train_non_nan.csv', index=True)




Processing location A...
Shape of X_train_observed before dropping in-between hour rows:  (118669, 45)
HEIHEI: X_train_observed gaps in dates:  0
HEIHEI: X_train_observed first gap in dates:  DatetimeIndex([], dtype='datetime64[us]', name='ds', freq=None)
HEIHEI: X_train_observed list of size (in days) of each gap:  []
HEIHEI: X_train_observed gaps in dates after filling missing dates:  0
Shape of X_train_observed after dropping in-between hour rows:  (29668, 45)
Shape of X_train_estimated before dropping in-between hour rows:  (17576, 46)
HEIHEI: X_train_estimated gaps in dates:  1
HEIHEI: X_train_estimated first gap in dates:  DatetimeIndex(['2023-01-27'], dtype='datetime64[us]', name='ds', freq=None)
HEIHEI: X_train_estimated list of size (in days) of each gap:  [1.01041667]
HEIHEI: X_train_estimated gaps in dates after filling missing dates:  0
Shape of X_train_estimated after dropping in-between hour rows:  (4418, 46)
Shape of X_test before dropping in-between hour rows:  (2880

In [24]:
import pandas as pd

df = X_train_non_nan.copy()
test_df = X_test.copy()


# add sin and cos of sun_elevation:d and sun_azimuth:d
df['sin_sun_elevation'] = np.sin(np.deg2rad(df['sun_elevation:d']))

test_df['sin_sun_elevation'] = np.sin(np.deg2rad(test_df['sun_elevation:d']))

# add global_rad_1h:J = diffuse_rad_1h:J + direct_rad_1h:J
df['global_rad_1h:J'] = df['diffuse_rad_1h:J'] + df['direct_rad_1h:J']
test_df['global_rad_1h:J'] = test_df['diffuse_rad_1h:J'] + test_df['direct_rad_1h:J']

# dew_or_rime:idx, Change this to one variable for is_dew and one variable for is_rime (dew:1, rime:-1)
df['is_dew'] = df['dew_or_rime:idx'].apply(lambda x: 1 if x == 1 else 0)
df['is_rime'] = df['dew_or_rime:idx'].apply(lambda x: 1 if x == -1 else 0)

test_df['is_dew'] = test_df['dew_or_rime:idx'].apply(lambda x: 1 if x == 1 else 0)
test_df['is_rime'] = test_df['dew_or_rime:idx'].apply(lambda x: 1 if x == -1 else 0)


EXOGENOUS = [
    'estimated_diff_hours',
    "absolute_humidity_2m:gm3",
    "air_density_2m:kgm3",
    "dew_point_2m:K",
    "diffuse_rad_1h:J",
    "direct_rad_1h:J",
    "effective_cloud_cover:p",
    "fresh_snow_1h:cm",
    "snow_depth:cm",
    "sun_elevation:d",
    "sun_azimuth:d",
    "t_1000hPa:K",
    "visibility:m",
    "wind_speed_10m:ms",
    "is_dew",
    "is_rime",
    "sin_sun_elevation",
    "global_rad_1h:J",
    ]
#additional_features_for_testing = 

df = df[EXOGENOUS + ["y", "location"]]
test_df = test_df[EXOGENOUS+ ["location"]]

# save to X_train_feature_engineered.csv
df.to_csv('X_train_feature_engineered.csv', index=True)
test_df.to_csv('X_test_feature_engineered.csv', index=True)



# Starting

In [None]:
# Get the last submission number
last_submission_number = int(max([int(filename.split('_')[1].split('.')[0]) for filename in os.listdir('submissions') if "submission" in filename]))
print("Last submission number:", last_submission_number)

# Create the new filename
new_filename = f'submission_{last_submission_number + 1}'

In [25]:
from autogluon.tabular import TabularDataset, TabularPredictor
train_data = TabularDataset('X_train_feature_engineered.csv')
test_data = TabularDataset('X_test_feature_engineered.csv')
label = 'y'
metric = 'mean_absolute_error'
time_limit = 60*60

predictors = [None, None, None]

loc = "A"
print(f"Training model for location {loc}...")
predictor = TabularPredictor(label=label, eval_metric=metric, path=f"AutogluonModels/{new_filename}_{loc}").fit(train_data[train_data["location"] == loc], time_limit=time_limit)
predictors[0] = predictor

Loaded data from: X_train_feature_engineered.csv | Columns = 21 / 21 | Rows = 93024 -> 93024
Loaded data from: X_test_feature_engineered.csv | Columns = 20 / 20 | Rows = 4608 -> 4608
No path specified. Models will be saved in: "AutogluonModels/ag-20231005_111152"
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutogluonModels/ag-20231005_111152"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 22.1.0: Sun Oct  9 20:15:09 PDT 2022; root:xnu-8792.41.9~2/RELEASE_ARM64_T6000
Disk Space Avail:   24.20 GB / 494.38 GB (4.9%)
Train Data Rows:    34085
Train Data Columns: 20
Label Column: y
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 630.59471, 1165.90242)
	If 'regression' is not the correct pro

Training model for location A...


	-218.6492	 = Validation score   (-mean_absolute_error)
	0.02s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: KNeighborsDist ... Training model for up to 3599.76s of the 3599.76s of remaining time.
	-176.0068	 = Validation score   (-mean_absolute_error)
	0.02s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: LightGBMXT ... Training model for up to 3599.68s of the 3599.68s of remaining time.


[1000]	valid_set's l1: 179.324
[2000]	valid_set's l1: 171.801
[3000]	valid_set's l1: 168.546
[4000]	valid_set's l1: 166.446
[5000]	valid_set's l1: 164.636
[6000]	valid_set's l1: 163.417
[7000]	valid_set's l1: 162.131
[8000]	valid_set's l1: 161.328
[9000]	valid_set's l1: 160.881
[10000]	valid_set's l1: 160.377


	-160.3755	 = Validation score   (-mean_absolute_error)
	128.34s	 = Training   runtime
	0.37s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 3470.62s of the 3470.62s of remaining time.


[1000]	valid_set's l1: 174.814
[2000]	valid_set's l1: 170.872
[3000]	valid_set's l1: 169.441
[4000]	valid_set's l1: 168.934
[5000]	valid_set's l1: 168.648
[6000]	valid_set's l1: 168.338
[7000]	valid_set's l1: 168.111
[8000]	valid_set's l1: 168.002
[9000]	valid_set's l1: 167.867
[10000]	valid_set's l1: 167.753


	-167.7487	 = Validation score   (-mean_absolute_error)
	43.28s	 = Training   runtime
	0.41s	 = Validation runtime
Fitting model: RandomForestMSE ... Training model for up to 3426.59s of the 3426.58s of remaining time.
	-189.013	 = Validation score   (-mean_absolute_error)
	13.96s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 3411.83s of the 3411.83s of remaining time.
	-178.1038	 = Validation score   (-mean_absolute_error)
	106.9s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ... Training model for up to 3304.92s of the 3304.92s of remaining time.
	-191.0823	 = Validation score   (-mean_absolute_error)
	2.73s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: NeuralNetFastAI ... Training model for up to 3301.89s of the 3301.89s of remaining time.
	-199.1758	 = Validation score   (-mean_absolute_error)
	43.22s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: XGBoo

In [None]:

loc = "B"
print(f"Training model for location {loc}...")
predictor = TabularPredictor(label=label, eval_metric=metric, path=f"AutogluonModels/{new_filename}_{loc}").fit(train_data[train_data["location"] == loc], time_limit=time_limit)
predictors[1] = predictor

No path specified. Models will be saved in: "AutogluonModels/ag-20231005_103118"
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutogluonModels/ag-20231005_103118"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 22.1.0: Sun Oct  9 20:15:09 PDT 2022; root:xnu-8792.41.9~2/RELEASE_ARM64_T6000
Disk Space Avail:   25.99 GB / 494.38 GB (5.3%)
Train Data Rows:    32844
Train Data Columns: 48
Label Column: y
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (1152.3, -0.0, 96.82478, 193.94649)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Genera

Training model for location B...


	Train Data (Processed) Memory Usage: 12.94 MB (0.2% of available memory)
Data preprocessing and feature engineering runtime = 0.19s ...
AutoGluon will gauge predictive performance using evaluation metric: 'mean_absolute_error'
	This metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.07611740348313238, Train Rows: 30344, Val Rows: 2500
User-specified model hyperparameters to be fit:
{
	'NN_TORCH': {},
	'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
	'CAT': {},
	'XGB': {},
	'FASTAI': {},
	'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args':

[1000]	valid_set's l1: 28.2905
[2000]	valid_set's l1: 26.315
[3000]	valid_set's l1: 25.2222
[4000]	valid_set's l1: 24.5407
[5000]	valid_set's l1: 24.0041
[6000]	valid_set's l1: 23.601
[7000]	valid_set's l1: 23.3111
[8000]	valid_set's l1: 23.0534
[9000]	valid_set's l1: 22.8287
[10000]	valid_set's l1: 22.6604


	-22.6588	 = Validation score   (-mean_absolute_error)
	43.16s	 = Training   runtime
	0.34s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 3555.69s of the 3555.69s of remaining time.


[1000]	valid_set's l1: 25.4515
[2000]	valid_set's l1: 24.2397
[3000]	valid_set's l1: 23.5963
[4000]	valid_set's l1: 23.2499
[5000]	valid_set's l1: 23.1186
[6000]	valid_set's l1: 23.0154
[7000]	valid_set's l1: 22.9528
[8000]	valid_set's l1: 22.9163
[9000]	valid_set's l1: 22.8923
[10000]	valid_set's l1: 22.8727


	-22.8726	 = Validation score   (-mean_absolute_error)
	66.22s	 = Training   runtime
	0.38s	 = Validation runtime
Fitting model: RandomForestMSE ... Training model for up to 3488.74s of the 3488.74s of remaining time.
	-26.9273	 = Validation score   (-mean_absolute_error)
	24.64s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 3463.61s of the 3463.61s of remaining time.
	-24.5292	 = Validation score   (-mean_absolute_error)
	113.13s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ... Training model for up to 3350.46s of the 3350.46s of remaining time.
	-28.4244	 = Validation score   (-mean_absolute_error)
	4.37s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: NeuralNetFastAI ... Training model for up to 3345.5s of the 3345.5s of remaining time.
	-29.732	 = Validation score   (-mean_absolute_error)
	42.23s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: XGBoost ...

[1000]	valid_set's l1: 22.5161
[2000]	valid_set's l1: 21.8756
[3000]	valid_set's l1: 21.7026
[4000]	valid_set's l1: 21.6445
[5000]	valid_set's l1: 21.6268
[6000]	valid_set's l1: 21.617
[7000]	valid_set's l1: 21.6136
[8000]	valid_set's l1: 21.6118
[9000]	valid_set's l1: 21.6108
[10000]	valid_set's l1: 21.6102


	-21.6102	 = Validation score   (-mean_absolute_error)
	161.72s	 = Training   runtime
	1.17s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.0s of the 2709.76s of remaining time.
	-20.1984	 = Validation score   (-mean_absolute_error)
	0.16s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 890.44s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231005_103118")


In [None]:
loc = "C"
print(f"Training model for location {loc}...")
predictor = TabularPredictor(label=label, eval_metric=metric, path=f"AutogluonModels/{new_filename}_{loc}").fit(train_data[train_data["location"] == loc], time_limit=time_limit)
predictors[2] = predictor

No path specified. Models will be saved in: "AutogluonModels/ag-20231005_104608"
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutogluonModels/ag-20231005_104608"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 22.1.0: Sun Oct  9 20:15:09 PDT 2022; root:xnu-8792.41.9~2/RELEASE_ARM64_T6000
Disk Space Avail:   24.99 GB / 494.38 GB (5.1%)
Train Data Rows:    26095
Train Data Columns: 48
Label Column: y
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and label-values can't be converted to int).
	Label info (max, min, mean, stddev): (999.6, -0.0, 77.63106, 165.81688)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Ge

Training model for location C...


	Useless Original Features (Count: 1): ['location']
		These features carry no predictive signal and should be manually investigated.
		This is typically a feature which has the same value for all rows.
		These features do not need to be present at inference time.
	Unused Original Features (Count: 1): ['snow_drift:idx']
		These features were not used to generate any of the output features. Add a feature generator compatible with these features to utilize them.
		Features can also be unused if they carry very little information, such as being categorical but having almost entirely unique values or being duplicates of other features.
		These features do not need to be present at inference time.
		('float', []) : 1 | ['snow_drift:idx']
	Types of features in original data (raw dtype, special dtypes):
		('float', [])                      : 45 | ['absolute_humidity_2m:gm3', 'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J', 'clear_sky_rad:W', ...]
		('object', ['datetime_

[1000]	valid_set's l1: 16.5623
[2000]	valid_set's l1: 15.9561
[3000]	valid_set's l1: 15.6743
[4000]	valid_set's l1: 15.5235
[5000]	valid_set's l1: 15.4624
[6000]	valid_set's l1: 15.4409
[7000]	valid_set's l1: 15.4113
[8000]	valid_set's l1: 15.3992
[9000]	valid_set's l1: 15.3877
[10000]	valid_set's l1: 15.3767


	-15.375	 = Validation score   (-mean_absolute_error)
	38.96s	 = Training   runtime
	0.37s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 3559.75s of the 3559.75s of remaining time.


[1000]	valid_set's l1: 16.8096
[2000]	valid_set's l1: 16.3727
[3000]	valid_set's l1: 16.2273
[4000]	valid_set's l1: 16.1867
[5000]	valid_set's l1: 16.1669
[6000]	valid_set's l1: 16.1593
[7000]	valid_set's l1: 16.1505
[8000]	valid_set's l1: 16.1433
[9000]	valid_set's l1: 16.1437
[10000]	valid_set's l1: 16.1435


	-16.1424	 = Validation score   (-mean_absolute_error)
	64.69s	 = Training   runtime
	0.35s	 = Validation runtime
Fitting model: RandomForestMSE ... Training model for up to 3494.41s of the 3494.41s of remaining time.
	-18.8102	 = Validation score   (-mean_absolute_error)
	15.91s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 3478.13s of the 3478.13s of remaining time.
	-16.9078	 = Validation score   (-mean_absolute_error)
	107.43s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ... Training model for up to 3370.67s of the 3370.67s of remaining time.
	-19.0049	 = Validation score   (-mean_absolute_error)
	2.37s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: NeuralNetFastAI ... Training model for up to 3367.92s of the 3367.92s of remaining time.
	-18.5567	 = Validation score   (-mean_absolute_error)
	28.28s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: XGBoost 

[1000]	valid_set's l1: 16.1692
[2000]	valid_set's l1: 16.0585
[3000]	valid_set's l1: 16.0355
[4000]	valid_set's l1: 16.0285
[5000]	valid_set's l1: 16.0261
[6000]	valid_set's l1: 16.0251
[7000]	valid_set's l1: 16.0249
[8000]	valid_set's l1: 16.0248
[9000]	valid_set's l1: 16.0247
[10000]	valid_set's l1: 16.0247


	-16.0247	 = Validation score   (-mean_absolute_error)
	154.53s	 = Training   runtime
	1.05s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.0s of the 2959.71s of remaining time.
	-14.7569	 = Validation score   (-mean_absolute_error)
	0.15s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 640.47s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231005_104608")


# Submit

In [None]:
# pull in test.csv and create submission.csv
submission_ids_df = pd.read_csv('test.csv', index_col=0)
# submission_ids_df has Id, Time, Location
# X_test has location_A, location_B, location_C
# we have to make sure that submission_ids_df has same dates as X_test, then predict y for each row in submission_ids_df, then save as submission.csv (using correct Id from submission_ids_df)

# convert index to datetime
submission_ids_df["id"] = submission_ids_df.index
submission_ids_df.set_index('time', inplace=True)
submission_ids_df.index = pd.to_datetime(submission_ids_df.index).astype('datetime64[ns]')
submission_ids_df

# split submission_ids_df into each location
print(submission_ids_df.groupby('location').count())


          prediction   id
location                 
A                720  720
B                720  720
C                720  720


In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Assuming X_test and model are already defined
# Assuming 'time' is also a datetime64[ns] column in X_test


test_data = TabularDataset('X_test_raw.csv')
test_ids = TabularDataset('test.csv')
# merge test_data with test_ids
test_data = pd.merge(test_data, test_ids, how="inner", right_on=["time", "location"], left_on=["ds", "location"])

test_data

Loaded data from: X_test_raw.csv | Columns = 48 / 48 | Rows = 4608 -> 4608
Loaded data from: test.csv | Columns = 4 / 4 | Rows = 2160 -> 2160


Unnamed: 0,ds,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,...,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,estimated_diff_hours,location,id,time,prediction
0,2023-05-01 00:00:00,4.4,1.286,912.700012,0.000000,0.000000,1041.199951,0.0,271.700012,0.000000,...,30210.699219,4.0,2.2,3.4,-0.0,16.998889,A,0,2023-05-01 00:00:00,0
1,2023-05-01 01:00:00,4.3,1.287,,0.000000,0.000000,1095.400024,0.0,271.600006,0.000000,...,29507.500000,3.9,2.0,3.3,-0.0,17.998889,A,1,2023-05-01 01:00:00,0
2,2023-05-01 02:00:00,4.2,1.284,1482.099976,0.000000,0.000000,1041.300049,0.0,271.200012,0.000000,...,29463.099609,3.7,1.8,3.2,-0.0,18.998889,A,2,2023-05-01 02:00:00,0
3,2023-05-01 03:00:00,4.1,1.282,2306.699951,0.000000,0.000000,1465.599976,0.0,270.799988,0.000000,...,33727.101562,3.6,1.6,3.2,-0.0,19.998889,A,3,2023-05-01 03:00:00,0
4,2023-05-01 04:00:00,3.9,1.282,2323.199951,59774.500000,43.000000,703.599976,0.0,270.299988,31.400000,...,35927.601562,3.4,1.3,3.1,-0.0,20.998889,A,4,2023-05-01 04:00:00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2155,2023-07-03 19:00:00,8.3,1.196,3532.399902,615338.812500,117.199997,2239.600098,0.0,281.600006,41.299999,...,41536.398438,2.2,1.9,-1.2,0.0,35.991389,C,2155,2023-07-03 19:00:00,0
2156,2023-07-03 20:00:00,8.5,1.199,3429.000000,269582.406250,40.000000,1513.699951,0.0,281.899994,19.700001,...,40136.500000,2.1,1.9,-0.9,0.0,36.991389,C,2156,2023-07-03 20:00:00,0
2157,2023-07-03 21:00:00,8.8,1.202,2495.000000,71999.601562,4.900000,1342.500000,0.0,282.200012,5.000000,...,43266.101562,2.4,2.2,-1.0,0.0,37.991389,C,2157,2023-07-03 21:00:00,0
2158,2023-07-03 22:00:00,9.0,1.206,1997.400024,1378.300049,0.000000,1878.900024,0.0,282.600006,0.000000,...,39017.898438,2.0,1.8,-0.8,0.0,38.991389,C,2158,2023-07-03 22:00:00,0


In [None]:
# predict, grouped by location
predictions = []
location_map = {
    "A": 0,
    "B": 1,
    "C": 2
}
for loc, group in test_data.groupby('location'):
    predictions.append(predictors[location_map[loc]].predict(test_data[test_data["location"] == loc]))

In [None]:
# concatenate predictions
predictions = pd.concat(predictions)
predictions

0         2.006019
1         5.289860
2         7.965479
3        43.964508
4       372.254974
           ...    
2155     77.931915
2156     51.792599
2157     31.644035
2158     12.108985
2159     11.182805
Name: y, Length: 2160, dtype: float32

In [None]:
# save predictions series to csv as "id", "prediction"
predictions_df = pd.DataFrame(predictions)
predictions_df.rename(columns={"y": "prediction"}, inplace=True)
predictions_df["id"] = predictions_df.index
predictions_df


Unnamed: 0,prediction,id
0,2.006019,0
1,5.289860,1
2,7.965479,2
3,43.964508,3
4,372.254974,4
...,...,...
2155,77.931915,2155
2156,51.792599,2156
2157,31.644035,2157
2158,12.108985,2158


In [None]:



# Save the submission DataFrame to submissions folder, create new name based on last submission, format is submission_<last_submission_number + 1>.csv

# Save the submission
predictions_df.to_csv(os.path.join('submissions', f"{new_filename}.csv"), index=False)

Last submission number: 62
