In [1]:
import pandas as pd
from darts import TimeSeries
import numpy as np



import warnings
warnings.filterwarnings("ignore")

def fix_datetime(X, name):
    """
    Standardize the timestamps of a DataFrame and keep only its full-hour rows.

    The 'date_forecast' column is parsed into a datetime index named 'ds', the
    rows are sorted by it, missing 15-minute timestamps are inserted through a
    darts TimeSeries round trip, and finally only rows whose minute is 0 are
    kept. Diagnostics about gaps in the index are printed along the way.

    The input DataFrame is left untouched: all work happens on a copy, so the
    function can safely be called again on the same frame.

    Parameters:
    - X: DataFrame with a 'date_forecast' column and, optionally, a 'date_calc' column.
    - name: String representing the name of the DataFrame, used for logging.

    Returns:
    - New DataFrame indexed by 'ds' that contains only the full-hour rows.
    """
    # Copy first so the caller's frame is not modified.
    X = X.copy()

    # Replace 'date_forecast' by a sorted datetime index called 'ds'.
    X['ds'] = pd.to_datetime(X['date_forecast'])
    X = X.drop(columns=['date_forecast']).sort_values(by='ds').set_index('ds')

    # Log the shape of the DataFrame before dropping rows with in-between minutes
    print(f"Shape of {name} before dropping in-between hour rows: ", X.shape)

    # Seconds between consecutive timestamps; a step above 15 minutes is a gap.
    step_seconds = X.index.to_series().diff().dt.total_seconds()
    is_gap = step_seconds.gt(60*15)
    print(f"HEIHEI: {name} gaps in dates: ", is_gap.sum())
    print(f"HEIHEI: {name} first gap in dates: ", X[is_gap].index[:1])
    if step_seconds.shape[0] > 0:
        print(f"HEIHEI: {name} list of size (in days) of each gap: ", step_seconds[is_gap].values / (60*60*24))

    # 'date_calc' is taken out before the darts round trip and re-attached
    # afterwards by index alignment, so inserted rows get a missing date_calc.
    # NOTE(review): presumably darts cannot carry this non-numeric column — confirm.
    temp_calc = X.pop("date_calc") if "date_calc" in X.columns else None
    X = TimeSeries.from_dataframe(df=X, freq="15T", fill_missing_dates=True, fillna_value=None).pd_dataframe()
    if temp_calc is not None:
        X["date_calc"] = temp_calc

    print(f"HEIHEI: {name} gaps in dates after filling missing dates: ", X.index.to_series().diff().dt.total_seconds().gt(60*15).sum())

    # Keep the full-hour rows only. The explicit copy makes the result an
    # independent frame, so callers can add columns to it without touching a slice.
    X = X[X.index.minute == 0].copy()

    # Log the shape of the DataFrame after dropping rows with in-between minutes
    print(f"Shape of {name} after dropping in-between hour rows: ", X.shape)

    return X



def convert_to_datetime(X_train_observed, X_train_estimated, X_test, y_train):
    """
    Put all four tables on a datetime index named 'ds'.

    The three feature tables go through fix_datetime. Each of them then gets an
    'estimated_diff_hours' column: 0 for the observed data, and for the
    estimated/test data the number of hours between the row's timestamp (the
    index) and its 'date_calc' value. 'date_calc' is dropped afterwards.

    The target table is returned as a new frame indexed by the parsed 'time'
    column (renamed to 'ds') and sorted by it; the `y_train` object passed in
    is not modified.

    Returns:
    - (X_train_observed, X_train_estimated, X_test, y_train) after conversion.
    """

    def forecast_lead_hours(frame):
        # Hours elapsed between 'date_calc' and the row timestamp in the index.
        return (frame.index - pd.to_datetime(frame["date_calc"])).dt.total_seconds() / 3600.0

    X_train_observed = fix_datetime(X_train_observed, "X_train_observed")
    X_train_estimated = fix_datetime(X_train_estimated, "X_train_estimated")
    X_test = fix_datetime(X_test, "X_test")

    # assign/drop build new frames instead of writing into whatever
    # fix_datetime returned (which may be a slice of a larger frame).
    X_train_observed = X_train_observed.assign(estimated_diff_hours=0)
    X_train_estimated = X_train_estimated.assign(
        estimated_diff_hours=forecast_lead_hours(X_train_estimated)
    ).drop(columns=['date_calc'])
    X_test = X_test.assign(
        estimated_diff_hours=forecast_lead_hours(X_test)
    ).drop(columns=['date_calc'])

    # Non-mutating equivalent of: add 'ds', drop 'time', sort, set index.
    y_train = (
        y_train.assign(ds=pd.to_datetime(y_train['time']))
        .drop(columns=['time'])
        .sort_values(by='ds')
        .set_index('ds')
    )

    return X_train_observed, X_train_estimated, X_test, y_train




# location_map = {
#     "A": 0,
#     "B": 1,
#     "C": 2
# }


def preprocess_data(X_train_observed, X_train_estimated, X_test, y_train, location):
    """
    Build the hourly training and test tables for one location.

    Steps: convert all inputs to a datetime index via convert_to_datetime,
    rename the target column 'pv_measurement' to 'y' (float64), insert missing
    hourly timestamps in every table through a darts TimeSeries round trip,
    clip negative targets to 0, outer-join the target onto the stacked
    observed + estimated training features, and add a 'location' column to
    both outputs. Diagnostics are printed at every stage.

    Returns:
    - (X_train, X_test): training features including 'y', and test features.
    """

    def hourly_gap_count(frame):
        # Number of consecutive index steps that are longer than one hour.
        return frame.index.to_series().diff().dt.total_seconds().gt(3600).sum()

    def fill_missing_hours(frame):
        # temporarily transform into darts time series to fill missing dates
        return TimeSeries.from_dataframe(df=frame, freq="H", fill_missing_dates=True, fillna_value=None).pd_dataframe()

    # convert to datetime
    X_train_observed, X_train_estimated, X_test, y_train = convert_to_datetime(
        X_train_observed, X_train_estimated, X_test, y_train
    )

    for label, frame in (
        ("X_train_observed", X_train_observed),
        ("X_train_estimated", X_train_estimated),
        ("X_test", X_test),
        ("y_train", y_train),
    ):
        print(f"{label} shape: {frame.shape}")

    # Swap the raw target column for a float64 column named 'y' (in place).
    y_train["y"] = y_train.pop("pv_measurement").astype('float64')
    print("y_train columns: ", y_train.columns)

    print("Shape of y_train before filling missing dates: ", y_train.shape)
    y_train = fill_missing_hours(y_train)
    print("Shape of y_train after filling missing dates: ", y_train.shape)

    # Compare the gap count of the two training parts with that of their
    # concatenation; an extra gap means the two parts are not contiguous.
    gaps_in_parts = hourly_gap_count(X_train_observed) + hourly_gap_count(X_train_estimated)
    print("LOOK: Number of gaps in X_train_observed plus number of gaps in X_train_estimated before: ", gaps_in_parts)
    X_train = pd.concat([X_train_observed, X_train_estimated])
    print("LOOK: Number of gaps in X_train_observed plus number of gaps in X_train_estimated after: ", hourly_gap_count(X_train))

    step_seconds = X_train.index.to_series().diff().dt.total_seconds()
    if step_seconds.shape[0] > 0:
        print("LOOK: list of size (in days) of each gap: ", step_seconds[step_seconds.gt(3600)].values / (60*60*24))
    print("if the number is bigger after than before that means there is a gap in time between the observed and estimated training sets")

    tables = (("X_train", X_train), ("X_test", X_test), ("y_train", y_train))

    # Date range covered by each table.
    for label, frame in tables:
        first, last = frame.index.min(), frame.index.max()
        print(f"{label} dates info: ", first, last, last - first)

    # Remaining gaps before the feature tables are filled.
    for label, frame in tables:
        print(f"{label} gaps in dates: ", hourly_gap_count(frame))

    X_train = fill_missing_hours(X_train)
    X_test = fill_missing_hours(X_test)
    print("X_train gaps in dates after filling missing dates: ", hourly_gap_count(X_train))
    print("X_test gaps in dates after filling missing dates: ", hourly_gap_count(X_test))

    # clip all y values to 0 if negative
    y_train["y"] = y_train["y"].clip(lower=0)

    print("Number of missing values in X_train: ", X_train.isnull().sum().sum())
    print("Number of missing values in X_test: ", X_test.isnull().sum().sum())
    print("Number of missing values in y_train: ", y_train.isnull().sum().sum())

    # Outer join on the index keeps timestamps present in only one of the tables.
    X_train = X_train.merge(y_train, how="outer", left_index=True, right_index=True)
    print("Number of missing values in X_train after merging with y_train: ", X_train.drop(columns=['y']).isnull().sum().sum())

    X_train["location"] = location
    X_test["location"] = location

    return X_train, X_test
    


# Locations to process; the raw parquet files are read from a folder named after each one.
locations = ['A', 'B', 'C']

# Per-location results, concatenated once the loop has finished.
# (y_trains is initialised here but nothing in this cell appends to it.)
X_trains, X_tests, y_trains = [], [], []

for loc in locations:
    print("\n\n")
    print(f"Processing location {loc}...")

    # Raw inputs for this location: targets, estimated and observed training
    # features, and the estimated test features.
    y_train = pd.read_parquet(f'{loc}/train_targets.parquet')
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

    # preprocess_data stacks observed + estimated features, merges the target
    # in as column 'y' and adds the 'location' column to both outputs.
    X_train, X_test = preprocess_data(
        X_train_observed,
        X_train_estimated,
        X_test_estimated,
        y_train,
        loc,
    )

    print(f"Final shape of X_train for location {loc}: ", X_train.shape)
    print(f"Final shape of X_test for location {loc}: ", X_test.shape)

    # Keep a per-location copy on disk next to the raw files.
    X_train.to_csv(f'{loc}/X_train.csv', index=True)
    X_test.to_csv(f'{loc}/X_test.csv', index=True)

    X_trains.append(X_train)
    X_tests.append(X_test)

# Stack all locations and save the combined tables.
X_train = pd.concat(X_trains)
X_test = pd.concat(X_tests)

print("Final shape of X_train: ", X_train.shape)
print("Final shape of X_test: ", X_test.shape)

X_train.to_csv('X_train_raw.csv', index=True)
X_test.to_csv('X_test_raw.csv', index=True)

# Training rows whose target 'y' is present.
X_train_non_nan = X_train.dropna(subset=['y'])
X_train_non_nan.to_csv('X_train_non_nan.csv', index=True)




Processing location A...
Shape of X_train_observed before dropping in-between hour rows:  (118669, 45)
HEIHEI: X_train_observed gaps in dates:  0
HEIHEI: X_train_observed first gap in dates:  DatetimeIndex([], dtype='datetime64[us]', name='ds', freq=None)
HEIHEI: X_train_observed list of size (in days) of each gap:  []
HEIHEI: X_train_observed gaps in dates after filling missing dates:  0
Shape of X_train_observed after dropping in-between hour rows:  (29668, 45)
Shape of X_train_estimated before dropping in-between hour rows:  (17576, 46)
HEIHEI: X_train_estimated gaps in dates:  1
HEIHEI: X_train_estimated first gap in dates:  DatetimeIndex(['2023-01-27'], dtype='datetime64[us]', name='ds', freq=None)
HEIHEI: X_train_estimated list of size (in days) of each gap:  [1.01041667]
HEIHEI: X_train_estimated gaps in dates after filling missing dates:  0
Shape of X_train_estimated after dropping in-between hour rows:  (4418, 46)
Shape of X_test before dropping in-between hour rows:  (2880

In [2]:
import pandas as pd

def add_engineered_features(frame):
    """Return a copy of `frame` with the derived feature columns added.

    Using one function for both the training and the test table guarantees
    that the two receive exactly the same transformations.

    Added columns:
    - 'sin_sun_elevation': sine of 'sun_elevation:d' (degrees converted to radians).
    - 'global_rad_1h:J': 'diffuse_rad_1h:J' + 'direct_rad_1h:J'.
    - 'is_dew' / 'is_rime': 0/1 flags for 'dew_or_rime:idx' equal to 1 / -1.
    """
    out = frame.copy()

    out['sin_sun_elevation'] = np.sin(np.deg2rad(out['sun_elevation:d']))
    out['global_rad_1h:J'] = out['diffuse_rad_1h:J'] + out['direct_rad_1h:J']

    # Vectorised comparison instead of a Python-level lambda per row;
    # missing values compare as False and therefore map to 0, as before.
    out['is_dew'] = out['dew_or_rime:idx'].eq(1).astype('int64')
    out['is_rime'] = out['dew_or_rime:idx'].eq(-1).astype('int64')

    return out


# Feature columns written to the engineered CSV files, next to 'location'
# (and the target 'y' for the training table).
EXOGENOUS = [
    'estimated_diff_hours',
    "absolute_humidity_2m:gm3",
    "air_density_2m:kgm3",
    "dew_point_2m:K",
    "diffuse_rad_1h:J",
    "direct_rad_1h:J",
    "effective_cloud_cover:p",
    "fresh_snow_1h:cm",
    "snow_depth:cm",
    "sun_elevation:d",
    "sun_azimuth:d",
    "t_1000hPa:K",
    "visibility:m",
    "wind_speed_10m:ms",
    "is_dew",
    "is_rime",
    "sin_sun_elevation",
    "global_rad_1h:J",
    ]

df = add_engineered_features(X_train_non_nan)[EXOGENOUS + ["y", "location"]]
test_df = add_engineered_features(X_test)[EXOGENOUS + ["location"]]

# The index is written as well, so the timestamps end up as the first CSV column.
df.to_csv('X_train_feature_engineered.csv', index=True)
test_df.to_csv('X_test_feature_engineered.csv', index=True)



# Training: one AutoGluon predictor per location

In [4]:
import os
import re


def find_last_submission_number(directory='submissions'):
    """Return the highest N among entries of `directory` named like 'submission_N...'.

    Entries without a numeric submission id (for example 'sample_submission.csv')
    are skipped instead of raising. Returns 0 when the directory does not exist
    or holds no numbered submission, so the first submission becomes number 1.
    """
    try:
        entries = os.listdir(directory)
    except FileNotFoundError:
        return 0

    matches = (re.search(r'submission_(\d+)', entry) for entry in entries)
    return max((int(found.group(1)) for found in matches if found), default=0)


# Get the last submission number
last_submission_number = find_last_submission_number('submissions')
print("Last submission number:", last_submission_number)
print("Now creating submission number:", last_submission_number + 1)

# Create the new filename
new_filename = f'submission_{last_submission_number + 1}'

Last submission number: 63


In [5]:
from autogluon.tabular import TabularDataset, TabularPredictor
# Engineered feature tables, read back from the CSV files on disk.
train_data = TabularDataset('X_train_feature_engineered.csv')
test_data = TabularDataset('X_test_feature_engineered.csv')

# Training configuration shared by the per-location training cells.
label = 'y'
metric = 'mean_absolute_error'
time_limit = 3600  # seconds of training time allowed per location

# One slot per location; this cell fills slot 0 (location A).
predictors = [None] * 3

loc = "A"
print(f"Training model for location {loc}...")
predictor = TabularPredictor(
    label=label,
    eval_metric=metric,
    path=f"AutogluonModels/{new_filename}_{loc}",
).fit(
    train_data.loc[train_data["location"] == loc],
    time_limit=time_limit,
)
predictors[0] = predictor

Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutogluonModels/submission_64_A"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 22.1.0: Sun Oct  9 20:15:09 PDT 2022; root:xnu-8792.41.9~2/RELEASE_ARM64_T6000
Disk Space Avail:   23.13 GB / 494.38 GB (4.7%)
Train Data Rows:    34085
Train Data Columns: 20
Label Column: y
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 630.59471, 1165.90242)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Availab

Training model for location A...


	-218.6492	 = Validation score   (-mean_absolute_error)
	0.97s	 = Training   runtime
	0.11s	 = Validation runtime
Fitting model: KNeighborsDist ... Training model for up to 3598.8s of the 3598.8s of remaining time.
	-176.0068	 = Validation score   (-mean_absolute_error)
	0.02s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: LightGBMXT ... Training model for up to 3598.69s of the 3598.69s of remaining time.


[1000]	valid_set's l1: 179.324
[2000]	valid_set's l1: 171.801
[3000]	valid_set's l1: 168.546
[4000]	valid_set's l1: 166.446
[5000]	valid_set's l1: 164.636
[6000]	valid_set's l1: 163.417
[7000]	valid_set's l1: 162.131
[8000]	valid_set's l1: 161.328
[9000]	valid_set's l1: 160.881
[10000]	valid_set's l1: 160.377


	-160.3755	 = Validation score   (-mean_absolute_error)
	42.14s	 = Training   runtime
	0.32s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 3555.88s of the 3555.88s of remaining time.


[1000]	valid_set's l1: 174.814
[2000]	valid_set's l1: 170.872
[3000]	valid_set's l1: 169.441
[4000]	valid_set's l1: 168.934
[5000]	valid_set's l1: 168.648
[6000]	valid_set's l1: 168.338
[7000]	valid_set's l1: 168.111
[8000]	valid_set's l1: 168.002
[9000]	valid_set's l1: 167.867
[10000]	valid_set's l1: 167.753


	-167.7487	 = Validation score   (-mean_absolute_error)
	43.52s	 = Training   runtime
	0.44s	 = Validation runtime
Fitting model: RandomForestMSE ... Training model for up to 3511.58s of the 3511.58s of remaining time.
	-189.013	 = Validation score   (-mean_absolute_error)
	14.79s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 3496.29s of the 3496.29s of remaining time.
	-178.1038	 = Validation score   (-mean_absolute_error)
	102.11s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ... Training model for up to 3394.15s of the 3394.15s of remaining time.
	-191.0823	 = Validation score   (-mean_absolute_error)
	2.96s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: NeuralNetFastAI ... Training model for up to 3390.56s of the 3390.56s of remaining time.
	-199.1758	 = Validation score   (-mean_absolute_error)
	46.7s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: XGBoo

[1000]	valid_set's l1: 166.066
[2000]	valid_set's l1: 163.316
[3000]	valid_set's l1: 162.862
[4000]	valid_set's l1: 162.625
[5000]	valid_set's l1: 162.473
[6000]	valid_set's l1: 162.437
[7000]	valid_set's l1: 162.409
[8000]	valid_set's l1: 162.395
[9000]	valid_set's l1: 162.394
[10000]	valid_set's l1: 162.393


	-162.393	 = Validation score   (-mean_absolute_error)
	136.25s	 = Training   runtime
	0.97s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.0s of the 2919.26s of remaining time.
	-150.9047	 = Validation score   (-mean_absolute_error)
	0.15s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 680.92s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/submission_64_A")


In [6]:

# Train on the rows of location B only and store the result in slot 1.
loc = "B"
print(f"Training model for location {loc}...")
predictor = TabularPredictor(
    label=label,
    eval_metric=metric,
    path=f"AutogluonModels/{new_filename}_{loc}",
).fit(
    train_data.loc[train_data["location"] == loc],
    time_limit=time_limit,
)
predictors[1] = predictor

Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutogluonModels/submission_64_B"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 22.1.0: Sun Oct  9 20:15:09 PDT 2022; root:xnu-8792.41.9~2/RELEASE_ARM64_T6000
Disk Space Avail:   21.88 GB / 494.38 GB (4.4%)
Train Data Rows:    32844
Train Data Columns: 20
Label Column: y
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).


	Label info (max, min, mean, stddev): (1152.3, -0.0, 96.82478, 193.94649)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    6311.48 MB
	Train Data (Original)  Memory Usage: 9.13 MB (0.1% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 2 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting DatetimeFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Ge

Training model for location B...


	-27.5392	 = Validation score   (-mean_absolute_error)
	0.02s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: KNeighborsDist ... Training model for up to 3599.78s of the 3599.78s of remaining time.
	-22.1104	 = Validation score   (-mean_absolute_error)
	0.02s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: LightGBMXT ... Training model for up to 3599.7s of the 3599.7s of remaining time.


[1000]	valid_set's l1: 29.4902
[2000]	valid_set's l1: 27.2899
[3000]	valid_set's l1: 26.1052
[4000]	valid_set's l1: 25.1608
[5000]	valid_set's l1: 24.5642
[6000]	valid_set's l1: 24.0926
[7000]	valid_set's l1: 23.75
[8000]	valid_set's l1: 23.4855
[9000]	valid_set's l1: 23.2792
[10000]	valid_set's l1: 23.1076


	-23.1072	 = Validation score   (-mean_absolute_error)
	36.7s	 = Training   runtime
	0.33s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 3562.29s of the 3562.29s of remaining time.


[1000]	valid_set's l1: 25.2403
[2000]	valid_set's l1: 24.037
[3000]	valid_set's l1: 23.5399
[4000]	valid_set's l1: 23.2674
[5000]	valid_set's l1: 23.1749
[6000]	valid_set's l1: 23.0914
[7000]	valid_set's l1: 23.0135
[8000]	valid_set's l1: 22.9799
[9000]	valid_set's l1: 22.9526
[10000]	valid_set's l1: 22.9348


	-22.9348	 = Validation score   (-mean_absolute_error)
	40.64s	 = Training   runtime
	0.43s	 = Validation runtime
Fitting model: RandomForestMSE ... Training model for up to 3520.8s of the 3520.79s of remaining time.
	-27.0486	 = Validation score   (-mean_absolute_error)
	15.52s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 3504.74s of the 3504.74s of remaining time.
	-24.0731	 = Validation score   (-mean_absolute_error)
	102.09s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ... Training model for up to 3402.62s of the 3402.62s of remaining time.
	-28.6908	 = Validation score   (-mean_absolute_error)
	2.81s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: NeuralNetFastAI ... Training model for up to 3399.23s of the 3399.23s of remaining time.
	-30.6298	 = Validation score   (-mean_absolute_error)
	43.78s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: XGBoost .

[1000]	valid_set's l1: 22.4058
[2000]	valid_set's l1: 21.6491
[3000]	valid_set's l1: 21.4479
[4000]	valid_set's l1: 21.3798
[5000]	valid_set's l1: 21.3509
[6000]	valid_set's l1: 21.3361
[7000]	valid_set's l1: 21.3288
[8000]	valid_set's l1: 21.3247
[9000]	valid_set's l1: 21.3233
[10000]	valid_set's l1: 21.3226


	-21.3226	 = Validation score   (-mean_absolute_error)
	135.73s	 = Training   runtime
	0.91s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.0s of the 2717.18s of remaining time.
	-20.1221	 = Validation score   (-mean_absolute_error)
	0.15s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 883.0s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/submission_64_B")


In [7]:
# Train on the rows of location C only and store the result in slot 2.
loc = "C"
print(f"Training model for location {loc}...")
predictor = TabularPredictor(
    label=label,
    eval_metric=metric,
    path=f"AutogluonModels/{new_filename}_{loc}",
).fit(
    train_data.loc[train_data["location"] == loc],
    time_limit=time_limit,
)
predictors[2] = predictor

Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutogluonModels/submission_64_C"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 22.1.0: Sun Oct  9 20:15:09 PDT 2022; root:xnu-8792.41.9~2/RELEASE_ARM64_T6000
Disk Space Avail:   20.94 GB / 494.38 GB (4.2%)
Train Data Rows:    26095
Train Data Columns: 20
Label Column: y
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and label-values can't be converted to int).
	Label info (max, min, mean, stddev): (999.6, -0.0, 77.63106, 165.81688)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Avail

Training model for location C...


	-25.4907	 = Validation score   (-mean_absolute_error)
	0.01s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: KNeighborsDist ... Training model for up to 3599.82s of the 3599.82s of remaining time.
	-20.1032	 = Validation score   (-mean_absolute_error)
	0.02s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: LightGBMXT ... Training model for up to 3599.75s of the 3599.75s of remaining time.


[1000]	valid_set's l1: 17.6805
[2000]	valid_set's l1: 17.0186
[3000]	valid_set's l1: 16.7036
[4000]	valid_set's l1: 16.5355
[5000]	valid_set's l1: 16.4673
[6000]	valid_set's l1: 16.4088
[7000]	valid_set's l1: 16.3316
[8000]	valid_set's l1: 16.3049
[9000]	valid_set's l1: 16.2906
[10000]	valid_set's l1: 16.2459


	-16.2434	 = Validation score   (-mean_absolute_error)
	37.04s	 = Training   runtime
	0.36s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 3561.95s of the 3561.95s of remaining time.


[1000]	valid_set's l1: 17.7442
[2000]	valid_set's l1: 17.4383
[3000]	valid_set's l1: 17.3238
[4000]	valid_set's l1: 17.2773
[5000]	valid_set's l1: 17.256
[6000]	valid_set's l1: 17.2457
[7000]	valid_set's l1: 17.2444
[8000]	valid_set's l1: 17.2481


	-17.2411	 = Validation score   (-mean_absolute_error)
	52.57s	 = Training   runtime
	0.25s	 = Validation runtime
Fitting model: RandomForestMSE ... Training model for up to 3508.88s of the 3508.88s of remaining time.
	-19.2543	 = Validation score   (-mean_absolute_error)
	9.41s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 3499.02s of the 3499.02s of remaining time.
	-17.4701	 = Validation score   (-mean_absolute_error)
	99.78s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ... Training model for up to 3399.23s of the 3399.22s of remaining time.
	-19.4452	 = Validation score   (-mean_absolute_error)
	1.8s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: NeuralNetFastAI ... Training model for up to 3397.09s of the 3397.09s of remaining time.
	-19.9486	 = Validation score   (-mean_absolute_error)
	28.24s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: XGBoost ...

[1000]	valid_set's l1: 17.6079
[2000]	valid_set's l1: 17.5059
[3000]	valid_set's l1: 17.4819
[4000]	valid_set's l1: 17.4752
[5000]	valid_set's l1: 17.4736
[6000]	valid_set's l1: 17.4721
[7000]	valid_set's l1: 17.4719
[8000]	valid_set's l1: 17.4717
[9000]	valid_set's l1: 17.4717
[10000]	valid_set's l1: 17.4717


	-17.4716	 = Validation score   (-mean_absolute_error)
	136.44s	 = Training   runtime
	1.13s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.0s of the 2977.03s of remaining time.
	-15.3717	 = Validation score   (-mean_absolute_error)
	0.15s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 623.14s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/submission_64_C")


# Submit

In [None]:

test_ids = TabularDataset('test.csv')

# Attach the rows of test.csv to the engineered test features by matching
# (ds, location) in test_data against (time, location) in test_ids.
# The merge adds the right-hand key 'time' as a column, so its presence means
# this cell has already run: merging again would silently create suffixed
# duplicate columns (e.g. *_x / *_y), hence the guard.
if "time" not in test_data.columns:
    test_data = pd.merge(test_data, test_ids, how="inner", right_on=["time", "location"], left_on=["ds", "location"])

test_data

In [60]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the feature-engineered test rows from disk.
test_features = TabularDataset('X_test_feature_engineered.csv')

# The CSV holds features only. The cells below read `test_data["id"]`, so the
# id/time/prediction columns from 'test.csv' must be attached right after loading;
# otherwise reloading here silently discards them and the submission cells fail.
test_ids = TabularDataset('test.csv')
test_data = pd.merge(
    test_features,
    test_ids,
    how="inner",
    left_on=["ds", "location"],
    right_on=["time", "location"],
)

test_data

Loaded data from: X_test_feature_engineered.csv | Columns = 20 / 20 | Rows = 4608 -> 4608
Loaded data from: test.csv | Columns = 4 / 4 | Rows = 2160 -> 2160


Unnamed: 0,ds,estimated_diff_hours,absolute_humidity_2m:gm3,air_density_2m:kgm3,dew_point_2m:K,diffuse_rad_1h:J,direct_rad_1h:J,effective_cloud_cover:p,fresh_snow_1h:cm,snow_depth:cm,...,visibility:m,wind_speed_10m:ms,is_dew,is_rime,sin_sun_elevation,global_rad_1h:J,location,id,time,prediction
0,2023-05-01 00:00:00,16.998889,4.4,1.286,271.700012,0.000000,0.000000,80.699997,0.0,0.0,...,30210.699219,4.0,0,0,-0.193978,0.000000,A,0,2023-05-01 00:00:00,0
1,2023-05-01 01:00:00,17.998889,4.3,1.287,271.600006,0.000000,0.000000,64.500000,0.0,0.0,...,29507.500000,3.9,0,0,-0.158641,0.000000,A,1,2023-05-01 01:00:00,0
2,2023-05-01 02:00:00,18.998889,4.2,1.284,271.200012,0.000000,0.000000,94.400002,0.0,0.0,...,29463.099609,3.7,0,0,-0.096645,0.000000,A,2,2023-05-01 02:00:00,0
3,2023-05-01 03:00:00,19.998889,4.1,1.282,270.799988,0.000000,0.000000,75.000000,0.0,0.0,...,33727.101562,3.6,0,0,-0.012217,0.000000,A,3,2023-05-01 03:00:00,0
4,2023-05-01 04:00:00,20.998889,3.9,1.282,270.299988,56574.300781,19781.400391,58.599998,0.0,0.0,...,35927.601562,3.4,0,0,0.088842,76355.701172,A,4,2023-05-01 04:00:00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2155,2023-07-03 19:00:00,35.991389,8.3,1.196,281.600006,199266.906250,57163.699219,89.599998,0.0,0.0,...,41536.398438,2.2,0,0,0.176776,256430.605469,C,2155,2023-07-03 19:00:00,0
2156,2023-07-03 20:00:00,36.991389,8.5,1.199,281.899994,109878.000000,39874.800781,84.400002,0.0,0.0,...,40136.500000,2.1,0,0,0.085556,149752.800781,C,2156,2023-07-03 20:00:00,0
2157,2023-07-03 21:00:00,37.991389,8.8,1.202,282.200012,44498.898438,10678.299805,68.800003,0.0,0.0,...,43266.101562,2.4,0,0,0.012409,55177.198242,C,2157,2023-07-03 21:00:00,0
2158,2023-07-03 22:00:00,38.991389,9.0,1.206,282.600006,8968.599609,0.000000,100.000000,0.0,0.0,...,39017.898438,2.0,0,0,-0.037638,8968.599609,C,2158,2023-07-03 22:00:00,0


In [61]:
# Display the id frame that was loaded from 'test.csv'.
test_ids

Unnamed: 0,id,time,prediction,location
0,0,2023-05-01 00:00:00,0,A
1,1,2023-05-01 01:00:00,0,A
2,2,2023-05-01 02:00:00,0,A
3,3,2023-05-01 03:00:00,0,A
4,4,2023-05-01 04:00:00,0,A
...,...,...,...,...
2155,2155,2023-07-03 19:00:00,0,C
2156,2156,2023-07-03 20:00:00,0,C
2157,2157,2023-07-03 21:00:00,0,C
2158,2158,2023-07-03 22:00:00,0,C


In [62]:
# Sanity check: number of test rows that belong to location C.
# Only a cell's last expression is displayed, so the count is printed explicitly
# instead of being computed and thrown away.
print("Rows for location C:", (test_data["location"] == "C").sum())

# Display the rows that belong to location B.
test_data[test_data["location"] == "B"]

Unnamed: 0,ds,estimated_diff_hours,absolute_humidity_2m:gm3,air_density_2m:kgm3,dew_point_2m:K,diffuse_rad_1h:J,direct_rad_1h:J,effective_cloud_cover:p,fresh_snow_1h:cm,snow_depth:cm,...,visibility:m,wind_speed_10m:ms,is_dew,is_rime,sin_sun_elevation,global_rad_1h:J,location,id,time,prediction
720,2023-05-01 00:00:00,16.998889,4.3,1.283,271.700012,0.000000,0.000000,80.699997,0.0,0.0,...,31329.500000,4.0,0,0,-0.193960,0.000000,B,720,2023-05-01 00:00:00,0
721,2023-05-01 01:00:00,17.998889,4.3,1.283,271.600006,0.000000,0.000000,64.599998,0.0,0.0,...,30737.800781,3.9,0,0,-0.158623,0.000000,B,721,2023-05-01 01:00:00,0
722,2023-05-01 02:00:00,18.998889,4.2,1.283,271.200012,0.000000,0.000000,94.300003,0.0,0.0,...,29863.199219,3.7,0,0,-0.096628,0.000000,B,722,2023-05-01 02:00:00,0
723,2023-05-01 03:00:00,19.998889,4.1,1.282,270.799988,0.000000,0.000000,75.099998,0.0,0.0,...,33809.000000,3.6,0,0,-0.012200,0.000000,B,723,2023-05-01 03:00:00,0
724,2023-05-01 04:00:00,20.998889,3.9,1.283,270.299988,56571.500000,19743.400391,58.700001,0.0,0.0,...,35603.398438,3.4,0,0,0.088860,76314.900391,B,724,2023-05-01 04:00:00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1435,2023-07-03 19:00:00,35.991389,8.3,1.197,281.500000,206778.500000,55664.000000,88.400002,0.0,0.0,...,43766.398438,2.6,0,0,0.177102,262442.500000,B,1435,2023-07-03 19:00:00,0
1436,2023-07-03 20:00:00,36.991389,8.4,1.200,281.700012,114277.000000,39711.699219,81.300003,0.0,0.0,...,43369.000000,2.3,0,0,0.086008,153988.699219,B,1436,2023-07-03 20:00:00,0
1437,2023-07-03 21:00:00,37.991389,8.7,1.203,282.100006,46271.101562,11528.700195,64.500000,0.0,0.0,...,44597.601562,2.7,0,0,0.012967,57799.801758,B,1437,2023-07-03 21:00:00,0
1438,2023-07-03 22:00:00,38.991389,9.0,1.206,282.600006,9413.900391,0.000000,97.000000,0.0,0.0,...,41415.699219,2.4,0,0,-0.037010,9413.900391,B,1438,2023-07-03 22:00:00,0


In [63]:
# predict, grouped by location
# `predictors` is defined earlier in the notebook and is indexed by position;
# `location_map` translates a location label into that position.
predictions = []
location_map = {
    "A": 0,
    "B": 1,
    "C": 2
}
for loc, group in test_data.groupby('location'):
    i = location_map[loc]
    # `group` already contains exactly the rows of `loc` in their original order,
    # so there is no need to filter the full frame again on every iteration.
    # reset_index returns a new frame, so adding a column below does not touch test_data.
    subset = group.reset_index(drop=True)
    pred = predictors[i].predict(subset)
    # Overwrite the placeholder `prediction` column with the model output.
    subset["prediction"] = pred
    predictions.append(subset)


# Show the second per-location frame (location B, given the A/B/C group order).
predictions[1]

Unnamed: 0,ds,estimated_diff_hours,absolute_humidity_2m:gm3,air_density_2m:kgm3,dew_point_2m:K,diffuse_rad_1h:J,direct_rad_1h:J,effective_cloud_cover:p,fresh_snow_1h:cm,snow_depth:cm,...,visibility:m,wind_speed_10m:ms,is_dew,is_rime,sin_sun_elevation,global_rad_1h:J,location,id,time,prediction
0,2023-05-01 00:00:00,16.998889,4.3,1.283,271.700012,0.000000,0.000000,80.699997,0.0,0.0,...,31329.500000,4.0,0,0,-0.193960,0.000000,B,720,2023-05-01 00:00:00,2.213565
1,2023-05-01 01:00:00,17.998889,4.3,1.283,271.600006,0.000000,0.000000,64.599998,0.0,0.0,...,30737.800781,3.9,0,0,-0.158623,0.000000,B,721,2023-05-01 01:00:00,2.303815
2,2023-05-01 02:00:00,18.998889,4.2,1.283,271.200012,0.000000,0.000000,94.300003,0.0,0.0,...,29863.199219,3.7,0,0,-0.096628,0.000000,B,722,2023-05-01 02:00:00,2.387730
3,2023-05-01 03:00:00,19.998889,4.1,1.282,270.799988,0.000000,0.000000,75.099998,0.0,0.0,...,33809.000000,3.6,0,0,-0.012200,0.000000,B,723,2023-05-01 03:00:00,6.071991
4,2023-05-01 04:00:00,20.998889,3.9,1.283,270.299988,56571.500000,19743.400391,58.700001,0.0,0.0,...,35603.398438,3.4,0,0,0.088860,76314.900391,B,724,2023-05-01 04:00:00,46.061150
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2023-07-03 19:00:00,35.991389,8.3,1.197,281.500000,206778.500000,55664.000000,88.400002,0.0,0.0,...,43766.398438,2.6,0,0,0.177102,262442.500000,B,1435,2023-07-03 19:00:00,31.109562
716,2023-07-03 20:00:00,36.991389,8.4,1.200,281.700012,114277.000000,39711.699219,81.300003,0.0,0.0,...,43369.000000,2.3,0,0,0.086008,153988.699219,B,1436,2023-07-03 20:00:00,14.503678
717,2023-07-03 21:00:00,37.991389,8.7,1.203,282.100006,46271.101562,11528.700195,64.500000,0.0,0.0,...,44597.601562,2.7,0,0,0.012967,57799.801758,B,1437,2023-07-03 21:00:00,11.221365
718,2023-07-03 22:00:00,38.991389,9.0,1.206,282.600006,9413.900391,0.000000,97.000000,0.0,0.0,...,41415.699219,2.4,0,0,-0.037010,9413.900391,B,1438,2023-07-03 22:00:00,10.734447


In [64]:
# Stack the per-location frames into a single frame and use the `id` values as
# its index; drop=False keeps `id` available as a regular column as well.
predictions = pd.concat(predictions).set_index("id", drop=False)
predictions

Unnamed: 0_level_0,ds,estimated_diff_hours,absolute_humidity_2m:gm3,air_density_2m:kgm3,dew_point_2m:K,diffuse_rad_1h:J,direct_rad_1h:J,effective_cloud_cover:p,fresh_snow_1h:cm,snow_depth:cm,...,visibility:m,wind_speed_10m:ms,is_dew,is_rime,sin_sun_elevation,global_rad_1h:J,location,id,time,prediction
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2023-05-01 00:00:00,16.998889,4.4,1.286,271.700012,0.000000,0.000000,80.699997,0.0,0.0,...,30210.699219,4.0,0,0,-0.193978,0.000000,A,0,2023-05-01 00:00:00,24.753420
1,2023-05-01 01:00:00,17.998889,4.3,1.287,271.600006,0.000000,0.000000,64.500000,0.0,0.0,...,29507.500000,3.9,0,0,-0.158641,0.000000,A,1,2023-05-01 01:00:00,18.146330
2,2023-05-01 02:00:00,18.998889,4.2,1.284,271.200012,0.000000,0.000000,94.400002,0.0,0.0,...,29463.099609,3.7,0,0,-0.096645,0.000000,A,2,2023-05-01 02:00:00,25.271877
3,2023-05-01 03:00:00,19.998889,4.1,1.282,270.799988,0.000000,0.000000,75.000000,0.0,0.0,...,33727.101562,3.6,0,0,-0.012217,0.000000,A,3,2023-05-01 03:00:00,47.294090
4,2023-05-01 04:00:00,20.998889,3.9,1.282,270.299988,56574.300781,19781.400391,58.599998,0.0,0.0,...,35927.601562,3.4,0,0,0.088842,76355.701172,A,4,2023-05-01 04:00:00,308.338074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2155,2023-07-03 19:00:00,35.991389,8.3,1.196,281.600006,199266.906250,57163.699219,89.599998,0.0,0.0,...,41536.398438,2.2,0,0,0.176776,256430.605469,C,2155,2023-07-03 19:00:00,79.219185
2156,2023-07-03 20:00:00,36.991389,8.5,1.199,281.899994,109878.000000,39874.800781,84.400002,0.0,0.0,...,40136.500000,2.1,0,0,0.085556,149752.800781,C,2156,2023-07-03 20:00:00,45.387287
2157,2023-07-03 21:00:00,37.991389,8.8,1.202,282.200012,44498.898438,10678.299805,68.800003,0.0,0.0,...,43266.101562,2.4,0,0,0.012409,55177.198242,C,2157,2023-07-03 21:00:00,24.156162
2158,2023-07-03 22:00:00,38.991389,9.0,1.206,282.600006,8968.599609,0.000000,100.000000,0.0,0.0,...,39017.898438,2.0,0,0,-0.037638,8968.599609,C,2158,2023-07-03 22:00:00,12.095822


In [65]:



# Save the submission DataFrame to the submissions folder as <new_filename>.csv.
# NOTE(review): `new_filename` is not computed in this cell; it must already be defined
# by an earlier cell (per the original note, named submission_<last_submission_number + 1>).

# Keep only the two columns that are written to the submission file
predictions = predictions[['id', 'prediction']]

# Create the output folder if it is missing so to_csv does not fail on a fresh checkout
os.makedirs('submissions', exist_ok=True)

# Save the submission (index=False: `id` is already present as a column)
predictions.to_csv(os.path.join('submissions', f"{new_filename}.csv"), index=False)
predictions

Unnamed: 0_level_0,id,prediction
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,24.753420
1,1,18.146330
2,2,25.271877
3,3,47.294090
4,4,308.338074
...,...,...
2155,2155,79.219185
2156,2156,45.387287
2157,2157,24.156162
2158,2158,12.095822
