## Preprocessing notebook, remove bad points

In [1]:
import sys
import os

# Resolve the folder one level above the notebook's working directory; the
# project package (`ml_combat`) is imported from there in the next cell.
current_dir = os.path.abspath("")
parent_dir = os.path.join(current_dir, os.pardir)

# Add to sys.path only once so that re-running this cell stays idempotent
# and does not pile up duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
import ml_combat as ml
from ml_combat import data
from ml_combat.MetaModel import MetaModel

#### Imports

In [3]:
from prophet import Prophet
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import plotly.express as px
from statsmodels.tsa.seasonal import seasonal_decompose


In [4]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [22]:


class AutoGluonJacob(MetaModel):
    """AutoGluon ``TabularPredictor`` regressor exposed through ``MetaModel``.

    The model predicts the ``y`` column, optionally up-weights May/June rows
    through a ``sample_importance`` column and optionally holds out half of
    the May/June rows as AutoGluon tuning data.

    Attributes:
        params_TabularPredictor: keyword arguments for ``TabularPredictor(...)``.
        params_TabularPredictor_fit: keyword arguments for ``TabularPredictor.fit``.
        use_tuning_data: when True, ``train`` passes a May/June hold-out as
            ``tuning_data``.
        use_sample_weight: when True, ``preprocess`` adds ``sample_importance``
            and the predictor is told to use it as the sample weight column.
        model: the fitted predictor; only set once ``train`` has run.
    """

    def __init__(self):
        super().__init__("AutoGluon Jacob")

        # autogluon features
        # TabularPredictor (usage : **params_TabularPredictor)
        self.params_TabularPredictor = {
            'label': 'y',
            'problem_type': 'regression',
            'eval_metric': 'mean_absolute_error',
            'verbosity': 0,
        }
        # TabularPredictor.fit
        self.params_TabularPredictor_fit = {
            'time_limit': 60*2.5,
            # presets: 'best_quality', 'high_quality', 'good_quality', 'medium_quality',
            #          'optimize_for_deployment', 'interpretable', 'ignore_text'
            'presets': 'high_quality',
            'hyperparameters': 'default',
            # 'auto_stack': False,
            # 'num_bag_folds': None, # set automatically by auto_stack True
            # 'num_bag_sets': None, # set to 20 because of auto_stack
            # 'num_stack_levels': None, # set automatically by auto_stack True
            'hyperparameter_tune_kwargs': 'auto', # None to disable
            # 'refit_full': True,
            # 'feature_prune_kwargs': {}, # If None, do not perform feature pruning. If empty dictionary, perform feature pruning with default configurations.
        }

        # Hold out part of the May/June rows as tuning data in train().
        self.use_tuning_data = True
        # Weight rows through the 'sample_importance' column built in preprocess().
        self.use_sample_weight = True

        if self.use_sample_weight:
            # Name of the weight column; preprocess() must create it.
            self.params_TabularPredictor['sample_weight'] = 'sample_importance'

    def preprocess(self, df: pd.DataFrame):
        """Build the feature frame used for both training and prediction.

        Works on a copy, so the caller's frame is left untouched.

        Args:
            df: frame with a datetime column ``ds`` and the columns
                ``diffuse_rad_1h:J`` and ``direct_rad_1h:J``.

        Returns:
            A new frame without ``ds`` and with these added/overwritten columns:
            ``total_rad_1h:J`` (diffuse + direct), ``hour`` (sine of the hour of
            day scaled to [0, 1], maximal at 10:00), ``dayofyear`` (sine of the
            day of year in [-1, 1], zero at day 80), ``month`` (1-12) and, when
            ``use_sample_weight`` is set, ``sample_importance``.
        """
        temp_df = df.copy()

        temp_df['total_rad_1h:J'] = temp_df['diffuse_rad_1h:J'] + temp_df['direct_rad_1h:J']

        # Cyclical encodings of hour-of-day and day-of-year.
        temp_df['hour'] = temp_df['ds'].dt.hour
        temp_df['hour'] = (np.sin(2 * np.pi * (temp_df['hour'] - 4) / 24) + 1) / 2

        temp_df['dayofyear'] = temp_df['ds'].dt.day_of_year
        temp_df['dayofyear'] = np.sin(2 * np.pi * (temp_df['dayofyear'] - 80) / 365)

        # Plain month number; train() also uses it to select the tuning rows.
        temp_df['month'] = temp_df['ds'].dt.month
        # temp_df['day'] = temp_df['ds'].dt.day
        # temp_df['dayofweek'] = temp_df['ds'].dt.dayofweek

        if self.use_sample_weight:
            # Emphasize the test window: starting date 2023-05-01 00:00:00,
            # ending date 2023-07-03 23:00:00.
            # NOTE(review): only May and June are up-weighted; the first days of
            # July belong to the test window but keep weight 1 (see the disabled
            # lines below) - confirm this is intended.
            temp_df['sample_importance'] = 1
            in_may_or_june = (temp_df['ds'].dt.month >= 5) & (temp_df['ds'].dt.month < 7)
            temp_df.loc[in_may_or_june, 'sample_importance'] = 2

            # temp_df.loc[(temp_df['ds'].dt.month == 7) &
            #             (temp_df['ds'].dt.day <= 4), 'sample_importance'] = 2

        return temp_df.drop(columns=['ds'])

    def train(self, df):
        """Fit a new ``TabularPredictor`` on ``df`` and store it in ``self.model``.

        When ``use_tuning_data`` is set, a seeded 50% sample of the May/June
        rows is removed from the training set and handed to AutoGluon as
        ``tuning_data`` (with ``use_bag_holdout=True``). If the frame has no
        May/June rows, the model is fitted without tuning data.

        Args:
            df: raw training frame accepted by ``preprocess``; must also
                contain the label column ``y``.
        """
        # A fresh 0..n-1 index guarantees unique labels, so the sampled rows
        # can be removed from the training set by label below.
        temp_df = self.preprocess(df).reset_index(drop=True)

        fit_kwargs = dict(self.params_TabularPredictor_fit)
        train_df = temp_df

        if self.use_tuning_data:
            tuning_data = temp_df[temp_df['month'].isin([5, 6])].sample(frac=0.5, random_state=42)

            if not tuning_data.empty:
                # Drop exactly the sampled rows. The earlier per-column
                # `isin(...).all(1)` test could also discard rows that were
                # never sampled but whose values each occur in the tuning set.
                train_df = temp_df.drop(index=tuning_data.index)
                fit_kwargs.update(tuning_data=tuning_data, use_bag_holdout=True)

        train_data = TabularDataset(train_df)
        self.model = TabularPredictor(**self.params_TabularPredictor).fit(train_data, **fit_kwargs)

    def predict(self, df):
        """Predict with the fitted model.

        Also prints the names of the models held by the fitted predictor.

        Args:
            df: raw frame accepted by ``preprocess``; a ``y`` column, if
                present, is ignored.

        Returns:
            A DataFrame with a single column ``y_pred``.
        """
        df = self.preprocess(df)

        features = [col for col in df.columns if col != 'y']
        X = df[features]

        y_preds = self.model.predict(X)
        print("AUTOGLUON MODEL OVERVIEW:")
        print(self.model.get_model_names())

        out_df = pd.DataFrame(data={'y_pred': y_preds})

        return out_df
    


In [16]:


# Load the cleaned training set once, then evaluate the model per location.
df = ml.data.get_training_cleaned()

banner_rule = "###########################################"

# Only location 'A' is evaluated; extend the tuple with 'B' and 'C' to cover the rest.
for location in ('A',):
    print(
        banner_rule,
        f"###############  LOCATION {location} ###############",
        banner_rule,
        sep="\n",
    )

    # Restrict the data to the rows of the current location.
    df_location = df.loc[df['location'] == location]

    # A fresh model per location, evaluated with two splits.
    agh = AutoGluonJacob()
    agh.test(df_location, n_splits=2)



###########################################
###############  LOCATION A ###############
###########################################
Testing AutoGluon Jacob


KeyboardInterrupt: 

## Results

    self.params_TabularPredictor = \
        {
            'label': 'y',
            'problem_type': 'regression', 
            'eval_metric': 'mean_absolute_error',
            'verbosity': 0,
            'sample_weight': None
        } 
    # TabularPredictor.fit
    self.params_TabularPredictor_fit = \
        {
            'time_limit': 60,
            'presets': 'good_quality', # [‘best_quality’, ‘high_quality’, ‘good_quality’, ‘medium_quality’, ‘optimize_for_deployment’, ‘interpretable’, ‘ignore_text’]
            'hyperparameters': 'default',
            # 'auto_stack': False,
            # 'num_bag_folds': None, # set automatically by auto_stack True
            # 'num_bag_sets': None, # set to 20 because of auto_stack
            # 'num_stack_levels': None, # set automatically by auto_stack True
            # 'hyperparameter_tune_kwargs': 'random', # None to disable
            # 'refit_full': True,
            # 'feature_prune_kwargs': None#{}, # If None, do not perform feature pruning. If empty dictionary, perform feature pruning with default configurations.
        }

    self.use_tuning_data = False
MAE Vals: MEAN: 164.62250084174644 ALL: [164.31157948281893, 164.933422200674]


    self.use_tuning_data = True
    tuning_data = temp_df.sample(frac=0.05, random_state=42)
MAE Vals: MEAN: 166.73566744849495 ALL: [168.50105388270137, 164.97028101428853]

    sample seed 142
MAE Vals: MEAN: 165.69652975339554 ALL: [167.9125409043826, 163.48051860240847]

    sample seed 42
    self.use_tuning_data = False
    sample_weight = True
MAE Vals: MEAN: 166.19670395329996 ALL: [167.63860398559584, 164.75480392100408]

    self.use_tuning_data = True
    tuning_data = temp_df[(temp_df['month'] == 5) | (temp_df['month'] == 6)].sample(frac=0.05, random_state=142)
    sample_weight = True
MAE Vals: MEAN: 167.18372152573014 ALL: [170.1052116231583, 164.26223142830196]

    'feature_prune_kwargs': {}
MAE Vals: MEAN: 166.19423848616486 ALL: [168.88409245902324, 163.50438451330652]

#### Switch to seeded KFold

    'hyperparameter_tune_kwargs': 'random'
MAE Vals: MEAN: 175.01220468483484 ALL: [177.53382030663508, 172.49058906303463]

    'feature_prune_kwargs': None
    self.use_tuning_data = False
    self.use_sample_weight = False
MAE Vals: MEAN: 172.1411154741047 ALL: [172.74787350507623, 171.5343574431332]


    'hyperparameter_tune_kwargs': None
MAE Vals: MEAN: 165.01777919495962 ALL: [165.55820916266555, 164.47734922725368]

    'feature_prune_kwargs': {}
MAE Vals: MEAN: 165.0372890394286 ALL: [165.5598301875853, 164.5147478912719]

    'time_limit': 180,
MAE Vals: MEAN: 165.37766162641057 ALL: [164.87359981161345, 165.8817234412077]

In [23]:
# Hand a fresh, untrained AutoGluonJacob to the project helper together with
# the target file name (presumably it trains the model and writes the
# submission CSV - confirm in ml.utils).
submission_file = "GluonW_HPO_and_tuning_data_and_sample_weighting_2x.csv"
ml.utils.make_submittable(submission_file, model=AutoGluonJacob())

# df = pd.read_csv(ml.module_dir + '/../submissions/GluonW_HPO_and_tuning_data.csv').merge(pd.read_csv(ml.module_dir + '/../submissions/XGBoostComposite_laptop.csv'), left_index=True, right_index=True)
# abs(df.prediction_x - df.prediction_y).mean()

AUTOGLUON MODEL OVERVIEW:
['KNeighborsUnif_BAG_L1', 'KNeighborsDist_BAG_L1', 'LightGBMXT_BAG_L1/T1', 'LightGBM_BAG_L1/T1', 'RandomForestMSE_BAG_L1', 'CatBoost_BAG_L1/T1', 'ExtraTreesMSE_BAG_L1', 'XGBoost_BAG_L1/T1', 'LightGBMLarge_BAG_L1', 'WeightedEnsemble_L2', 'LightGBMXT_BAG_L2/T1', 'LightGBM_BAG_L2/T1', 'CatBoost_BAG_L2/T1', 'ExtraTreesMSE_BAG_L2', 'XGBoost_BAG_L2/T1', 'LightGBMLarge_BAG_L2', 'WeightedEnsemble_L3', 'KNeighborsUnif_BAG_L1_FULL', 'KNeighborsDist_BAG_L1_FULL', 'LightGBMXT_BAG_L1/T1_FULL', 'LightGBM_BAG_L1/T1_FULL', 'RandomForestMSE_BAG_L1_FULL', 'CatBoost_BAG_L1/T1_FULL', 'ExtraTreesMSE_BAG_L1_FULL', 'XGBoost_BAG_L1/T1_FULL', 'LightGBMLarge_BAG_L1_FULL', 'WeightedEnsemble_L2_FULL', 'LightGBMXT_BAG_L2/T1_FULL', 'LightGBM_BAG_L2/T1_FULL', 'CatBoost_BAG_L2/T1_FULL', 'ExtraTreesMSE_BAG_L2_FULL', 'XGBoost_BAG_L2/T1_FULL', 'LightGBMLarge_BAG_L2_FULL', 'WeightedEnsemble_L3_FULL']
