## Short note - group 159


In [4]:
# Enable IPython's autoreload extension so edited modules can be re-imported without restarting the kernel.
%load_ext autoreload

In [5]:
# With no argument this triggers a one-off reload of already-imported modules;
# `%autoreload 2` would be needed to reload automatically before every cell.
%autoreload

# load libraries
import h2o
import matplotlib
import numpy as np
import pandas as pd
import sklearn

from h2o.automl import H2OAutoML
from sklearn.model_selection import train_test_split



This code produced our best score (159.03). If you run it on your machine, the results may differ: we ran it several times and, even with the same seed, could not reproduce the same result. This may be caused by the models that are used; we have noticed that they change from day to day.

We also noticed that any kind of basic data preprocessing (removing columns, removing NaNs, etc.) made the MAE even worse.

In the long note we even tried using the two best models as predictors, but the results were better without them.

One of the model interpretations we used is generated by H2O itself.

In [4]:
# H2O AutoML: train one model per location (A, B, C), predict on that location's
# test features, and write the stacked predictions to 'AutoML_H2O.csv'.

# NOTE: the seed alone does not make AutoML runs repeatable. H2O's AutoML docs state
# that a time budget (max_runtime_secs) is resource-limited, so different machines /
# loads can train a different number of models; max_models is required for
# reproducible leaderboards.
AUTOML_SEED = 7213712285
# scikit-learn only accepts seeds in [0, 2**32 - 1]; fold the AutoML seed into that
# range so the train/validation split is the same on every run.
SPLIT_SEED = AUTOML_SEED % 2**32

# Per-location prediction frames, concatenated after every location.
prediction_parts = []

# Init of H2O
h2o.init(max_mem_size = "16G") # Raise this if more RAM is available

for letter in ['A', 'B', 'C']:
    print(f"dataset {letter}")
    dataset_dir = f"../../dataset/{letter}"

    # Training features: the "observed" and "estimated" files stacked into one frame.
    X = pd.concat([
        pd.read_parquet(f"{dataset_dir}/X_train_observed.parquet"),
        pd.read_parquet(f"{dataset_dir}/X_train_estimated.parquet")
    ], ignore_index=True)
    targets = pd.read_parquet(f"{dataset_dir}/train_targets.parquet")

    # Inner join: keep only feature rows whose "date_forecast" matches a target "time".
    # Because the key names differ, both key columns survive in the merged frame.
    X_y_bacon = X.merge(targets, left_on = "date_forecast", right_on = "time")

    # Test features: keep every 4th row.
    # NOTE(review): presumably this down-samples the test features to the resolution
    # of the targets - confirm against the dataset description.
    X_test = pd.read_parquet(f"{dataset_dir}/X_test_estimated.parquet")
    X_test = X_test.iloc[::4]
    x_test = h2o.H2OFrame(X_test)

    # Split data into training and validation frames (80:20), seeded for repeatability.
    train, test = train_test_split(X_y_bacon,
                                   test_size = 0.20,
                                   random_state = SPLIT_SEED)

    train_frame = h2o.H2OFrame(train)
    test_frame = h2o.H2OFrame(test)

    # The target is the last column of the merged frame (it comes from the right-hand
    # side of the merge).
    y = train_frame.columns[-1]
    # Train only on columns that also exist in the test frame. The merge above adds the
    # target table's "time" key to the training data; the test features are not merged,
    # so they presumably lack it (TODO confirm) and a model relying on it would only
    # see missing values at prediction time.
    available_at_test = set(x_test.columns)
    x = [col for col in train_frame.columns[:-1] if col in available_at_test]

    # Create an AutoML model
    aml = H2OAutoML(max_runtime_secs = 60,
                    sort_metric = "MAE",
                    stopping_metric = "MAE",
                    seed = AUTOML_SEED)
    # NOTE(review): H2O's AutoML docs say validation_frame is ignored unless nfolds == 0
    # (cross-validation metrics are used by default). If the hold-out set is meant to
    # rank the models, pass it as leaderboard_frame instead - left unchanged here so the
    # modelling setup stays as submitted.
    aml.train(x = x,
              y = y,
              training_frame = train_frame,
              validation_frame = test_frame)

    print(aml.leaderboard)
    best_model = aml.get_best_model(criterion='MAE')

    # Predict on the test features; negative predictions are clamped to 0.
    prediction1 = best_model.predict(x_test)
    predictions1_df = h2o.as_list(prediction1)
    predictions1_df = predictions1_df.clip(lower = 0.)

    # Rewrite the CSV after every location so partial results survive a later failure.
    prediction_parts.append(predictions1_df)
    all_predictions = pd.concat(prediction_parts, ignore_index=True)
    all_predictions.to_csv('AutoML_H2O.csv', index_label='id')
    print("CSV file updated")

# Explain the best model on the validation frame for model interpretation.
# Only the last location's model and validation frame are still in scope here.
best_model.explain(test_frame)

# Shut down H2O (h2o.shutdown() is deprecated in favour of the cluster handle)
h2o.cluster().shutdown()
print("Done")