## Preprocessing notebook, remove bad points

In [1]:
import sys
import os

# Get the absolute path of parent folder
current_dir = os.path.abspath("")
parent_dir = os.path.join(current_dir, os.pardir)

# Add to sys.path
sys.path.append(parent_dir)

In [2]:
import ml_combat as ml
from ml_combat import data
from ml_combat.MetaModel import MetaModel

#### Imports

In [13]:
from prophet import Prophet
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import plotly.express as px
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.model_selection import train_test_split


In [4]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [15]:


class AutoGluonJacob(MetaModel):
    
    def __init__(self):
        super().__init__("AutoGluon Jacob")

        # autogluon features
        # TabularPredictor (usage : **params_TabularPredictor)
        self.params_TabularPredictor = \
            {
                'label': 'y',
                'problem_type': 'regression', 
                'eval_metric': 'mean_absolute_error',
                'verbosity': 1,
            } 
        # TabularPredictor.fit
        self.params_TabularPredictor_fit = \
            {
                'time_limit': 60*5,
                'presets': 'high_quality', # [‘best_quality’, ‘high_quality’, ‘good_quality’, ‘medium_quality’, ‘optimize_for_deployment’, ‘interpretable’, ‘ignore_text’]
                'hyperparameters': 'default',
                # 'auto_stack': False,
                # 'num_bag_folds': None, # set automatically by auto_stack True
                # 'num_bag_sets': None, # set to 20 because of auto_stack
                # 'num_stack_levels': 3, # set automatically by auto_stack True
                'hyperparameter_tune_kwargs': 'auto', # None to disable
                # 'refit_full': True,
                # 'feature_prune_kwargs': {}, # If None, do not perform feature pruning. If empty dictionary, perform feature pruning with default configurations.
            }

        self.use_tuning_data = True # 'sample_weight', 'random'
        self.use_sample_weight = False

        if self.use_sample_weight: # auto_weight a feature that exists
            self.params_TabularPredictor['sample_weight'] = 'sample_importance'
        

    def preprocess(self, df: pd.DataFrame):
        """
        """
        temp_df = df.copy()

        temp_df['total_rad_1h:J'] = temp_df['diffuse_rad_1h:J'] + temp_df['direct_rad_1h:J']    
        
        # Extracting hour-of-day and month, and making them cyclical
        temp_df['hour'] = temp_df['ds'].dt.hour
        temp_df['hour'] = (np.sin(2 * np.pi * (temp_df['hour'] - 4)/ 24) + 1) / 2

        temp_df['dayofyear'] = temp_df['ds'].dt.day_of_year
        temp_df['dayofyear'] = np.sin(2 * np.pi * (temp_df['dayofyear'] - 80)/ 365)

        # temp_df['year'] = temp_df['ds'].dt.hour
        temp_df['month'] = temp_df['ds'].dt.month
        # temp_df['day'] = temp_df['ds'].dt.day
        # temp_df['dayofweek'] = temp_df['ds'].dt.dayofweek

        if self.use_sample_weight:
            # Emphasize test start-end: Starting date: 2023-05-01 00:00:00 Ending data 2023-07-03 23:00:00
            temp_df['sample_importance'] = 1
            temp_df.loc[(temp_df['ds'].dt.month >= 5) & 
                        (temp_df['ds'].dt.month < 7), 'sample_importance'] = 2
            
            # temp_df.loc[(temp_df['ds'].dt.month == 7) &
            #             (temp_df['ds'].dt.day <= 4), 'sample_importance'] = 2



        return temp_df.drop(columns=['ds'])

    def train(self, df):
        """
        """
        temp_df = self.preprocess(df)

        if self.use_tuning_data:

            # tuning_data = temp_df[(temp_df['month'] == 5) | (temp_df['month'] == 6)].sample(frac=0.5, random_state=42)
            # train_data = TabularDataset(temp_df[~temp_df.isin(tuning_data.to_dict(orient='list')).all(1)])
            train_data, tuning_data = train_test_split(df, test_size=0.1, random_state=42)
            train_data = TabularDataset(train_data)

            self.model = TabularPredictor(**self.params_TabularPredictor).fit(train_data, tuning_data=tuning_data, use_bag_holdout=True, **self.params_TabularPredictor_fit)
        else:
            train_data = TabularDataset(temp_df)

            self.model = TabularPredictor(**self.params_TabularPredictor).fit(train_data, **self.params_TabularPredictor_fit)

    def predict(self, df):
        """
        """
        df = self.preprocess(df)

        features = [col for col in df.columns if col != 'y']
        X = df[features]



        y_preds = self.model.predict(X)
        print("AUTOGLUON MODEL OVERVIEW:")
        print(self.model.leaderboard())
       

        out_df = pd.DataFrame(data={'y_pred': y_preds})

        return out_df
    


In [6]:


# df = ml.data.get_training_cleaned()

# for location in ['A']:#, 'B', 'C']:
#     print("###########################################")
#     print(f"###############  LOCATION {location} ###############")
#     print("###########################################")
#     df_location = df[df['location'] == location]

#     agh = AutoGluonJacob()
#     agh.test(df_location, n_splits=2)



## Results

    self.params_TabularPredictor = \
        {
            'label': 'y',
            'problem_type': 'regression', 
            'eval_metric': 'mean_absolute_error',
            'verbosity': 0,
            'sample_weight': None
        } 
    # TabularPredictor.fit
    self.params_TabularPredictor_fit = \
        {
            'time_limit': 60,
            'presets': 'good_quality', # [‘best_quality’, ‘high_quality’, ‘good_quality’, ‘medium_quality’, ‘optimize_for_deployment’, ‘interpretable’, ‘ignore_text’]
            'hyperparameters': 'default',
            # 'auto_stack': False,
            # 'num_bag_folds': None, # set automatically by auto_stack True
            # 'num_bag_sets': None, # set to 20 because of auto_stack
            # 'num_stack_levels': None, # set automatically by auto_stack True
            # 'hyperparameter_tune_kwargs': 'random', # None to disable
            # 'refit_full': True,
            # 'feature_prune_kwargs': None#{}, # If None, do not perform feature pruning. If empty dictionary, perform feature pruning with default configurations.
        }

    self.use_tuning_data = False
MAE Vals: MEAN: 164.62250084174644 ALL: [164.31157948281893, 164.933422200674]


    self.use_tuning_data = True
    tuning_data = temp_df.sample(frac=0.05, random_state=42)
MAE Vals: MEAN: 166.73566744849495 ALL: [168.50105388270137, 164.97028101428853]

    sample seed 142
MAE Vals: MEAN: 165.69652975339554 ALL: [167.9125409043826, 163.48051860240847]

    sample seed 42
    self.use_tuning_data = False
    sample_weight = True
MAE Vals: MEAN: 166.19670395329996 ALL: [167.63860398559584, 164.75480392100408]

    self.use_tuning_data = True
    tuning_data = temp_df[(temp_df['month'] == 5) | (temp_df['month'] == 6)].sample(frac=0.05, random_state=142)
    sample_weight = True
MAE Vals: MEAN: 167.18372152573014 ALL: [170.1052116231583, 164.26223142830196]

    'feature_prune_kwargs': {}
MAE Vals: MEAN: 166.19423848616486 ALL: [168.88409245902324, 163.50438451330652]

#### Switch to seeded KFold

    'hyperparameter_tune_kwargs': 'random'
MAE Vals: MEAN: 175.01220468483484 ALL: [177.53382030663508, 172.49058906303463]

    'feature_prune_kwargs': None
    self.use_tuning_data = False
    self.use_sample_weight = False
MAE Vals: MEAN: 172.1411154741047 ALL: [172.74787350507623, 171.5343574431332]


    'hyperparameter_tune_kwargs': None
MAE Vals: MEAN: 165.01777919495962 ALL: [165.55820916266555, 164.47734922725368]

    'feature_prune_kwargs': {}
MAE Vals: MEAN: 165.0372890394286 ALL: [165.5598301875853, 164.5147478912719]

    'time_limit': 180,
MAE Vals: MEAN: 165.37766162641057 ALL: [164.87359981161345, 165.8817234412077]

In [16]:
ml.utils.make_submittable("GluonW_HPO_and_tuning_data_train_test_split.csv", model=AutoGluonJacob())

# df = pd.read_csv(ml.module_dir + '/../submissions/GluonW_HPO_and_tuning_data.csv').merge(pd.read_csv(ml.module_dir + '/../submissions/XGBoostComposite_laptop.csv'), left_index=True, right_index=True)
# abs(df.prediction_x - df.prediction_y).mean()

No model was trained during hyperparameter tuning NeuralNetTorch_BAG_L2... Skipping this model.


KeyError: "1 required columns are missing from the provided dataset to transform using AutoMLPipelineFeatureGenerator. 1 missing columns: ['ds'] | 51 available columns: ['location', 'weather_data_type', 'absolute_humidity_2m:gm3', 'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J', 'clear_sky_rad:W', 'cloud_base_agl:m', 'dew_or_rime:idx', 'dew_point_2m:K', 'diffuse_rad:W', 'diffuse_rad_1h:J', 'direct_rad:W', 'direct_rad_1h:J', 'effective_cloud_cover:p', 'elevation:m', 'fresh_snow_12h:cm', 'fresh_snow_1h:cm', 'fresh_snow_24h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'is_day:idx', 'is_in_shadow:idx', 'msl_pressure:hPa', 'precip_5min:mm', 'precip_type_5min:idx', 'pressure_100m:hPa', 'pressure_50m:hPa', 'prob_rime:p', 'rain_water:kgm2', 'relative_humidity_1000hPa:p', 'sfc_pressure:hPa', 'snow_density:kgm3', 'snow_depth:cm', 'snow_drift:idx', 'snow_melt_10min:mm', 'snow_water:kgm2', 'sun_azimuth:d', 'sun_elevation:d', 'super_cooled_liquid_water:kgm2', 't_1000hPa:K', 'total_cloud_cover:p', 'visibility:m', 'wind_speed_10m:ms', 'wind_speed_u_10m:ms', 'wind_speed_v_10m:ms', 'wind_speed_w_1000hPa:ms', 'total_rad_1h:J', 'hour', 'dayofyear', 'month']"

In [10]:
df = ml.data.get_training_cleaned()
df = df[df.location == 'A']

In [11]:
from sklearn.model_selection import train_test_split

# Now, you can use train_test_split to split the filtered training data
train_data, test_data = train_test_split(df, test_size=0.5, random_state=42)

In [14]:
test_data

Unnamed: 0,location,ds,y,weather_data_type,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,...,sun_azimuth:d,sun_elevation:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms
26243,A,2022-05-31 09:00:00,5052.30,observed,9.325,1.21300,3777.574951,2.563408e+06,750.824997,1758.649994,...,141.700752,44.165000,0.0,287.050003,72.299999,32652.650391,1.725,-1.500,-0.850,0.0
5784,A,2020-01-29 22:00:00,0.00,observed,4.000,1.25550,1512.200012,0.000000e+00,0.000000,1512.200012,...,338.362495,-43.226500,0.0,276.000000,99.400002,47307.076172,2.750,-2.750,0.100,0.0
13540,A,2020-12-18 02:00:00,0.00,observed,4.725,1.27900,,0.000000e+00,0.000000,1835.199982,...,60.043750,-39.467251,0.0,277.250000,18.724999,49143.423828,1.750,-1.600,0.600,0.0
28716,A,2022-09-11 10:00:00,1382.48,observed,8.450,1.23650,1017.899979,1.760569e+06,512.499992,369.474998,...,164.511250,30.331500,0.0,282.500000,78.075003,36580.974609,0.500,-0.350,-0.375,0.0
5377,A,2020-01-12 23:00:00,0.00,observed,3.550,1.26425,1862.250031,0.000000e+00,0.000000,1862.250031,...,179.783257,-48.229250,0.0,276.675011,97.875000,54268.250000,3.400,1.000,3.250,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18285,A,2021-07-03 19:00:00,338.80,observed,11.475,1.20250,,4.869624e+05,87.324999,,...,306.243996,8.297000,0.0,290.300003,3.550000,47196.700195,4.325,4.300,-0.400,0.0
18841,A,2021-07-26 23:00:00,0.00,observed,9.750,1.20000,,0.000000e+00,0.000000,,...,179.517504,-7.056750,0.0,295.550003,0.175000,53892.149414,2.750,-2.100,1.800,0.0
25198,A,2022-04-17 20:00:00,0.00,observed,5.975,1.26950,7216.400024,0.000000e+00,0.000000,5583.500000,...,316.535500,-8.953500,0.0,284.150002,87.400002,32406.649902,0.600,-0.025,0.600,0.0
22465,A,2021-12-24 23:00:00,0.00,observed,3.425,1.29450,3423.950012,0.000000e+00,0.000000,350.750000,...,181.709994,-49.875501,0.0,270.799995,92.000000,25039.450195,4.450,4.275,1.250,0.0
