In [None]:
# Temporarily suppress FutureWarning
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=FutureWarning)

import pandas as pd
import numpy as np

# In module1.py
import sys
import os

# Locate the parent of this file's folder so that the `ml_combat` package
# found there can be imported.
# `__file__` only exists when the code runs as a script/module; inside a
# Jupyter kernel it is undefined, so fall back to the working directory
# (assumed to be the notebook's own folder -- adjust if the kernel is started elsewhere).
try:
    current_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
folder2_dir = os.path.join(parent_dir, 'ml_combat')

# Make `import ml_combat` resolvable; skip when a previous run of this cell
# already registered the path, so re-running does not grow sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from ml_combat.MetaModel import MetaModel
import ml_combat as ml

import random
import catboost as cb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, KFold, TimeSeriesSplit, train_test_split

import pandas as pd
from abc import ABC, abstractmethod
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.metrics import mean_absolute_error
import numpy as np
import statistics

In [None]:
class MetaModel(ABC):
    """
        Common interface for the models in this notebook.

        Subclasses implement `preprocess`, `train` and `predict`; `test` then
        cross-validates any such model on a dataframe that contains the observed target `y`.
    """

    def __init__(self, model_name):
        """
            model_name: label used in the log lines printed by `test`
        """
        self.model_name = model_name
        # Placeholder for the fitted estimator; subclasses assign it in `train`
        self.model = None

    def test(self, df: pd.DataFrame, n_splits=5):
        """
            K-fold cross-validation, df must have y in it for testing against predictions.

            The folds are shuffled (fixed random_state=42), so every fold mixes
            rows from the whole dataframe rather than keeping them in order.

            For each fold the model is re-trained on the training partition and
            evaluated on the held-out partition. MAE and the mean signed error
            (prediction minus observation) are printed per run.

            Parameters:
                df: dataframe containing the observed target column `y`
                n_splits: number of folds

            Returns:
                list with the MAE of every fold

            Raises:
                ValueError: if `df` has no `y` column
        """
        print(f"Testing {self.model_name}")
        column_names = df.columns.tolist()
        if 'y' not in column_names:
            raise ValueError(f"Missing observed y in columns. Available are {column_names}")

        # Rows without an observed target cannot be scored
        df = df.dropna(subset=['y'])

        MAE_values = []
        mean_signed_errors = []

        kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

        for train_index, test_index in kf.split(df):
            train_partition = df.iloc[train_index]
            valid_partition = df.iloc[test_index]

            self.train(train_partition)
            predictions = self.predict(valid_partition)

            # Compare by position, not by index label: predictions carry their
            # own index while the targets keep the index of `df`, and pandas
            # arithmetic between Series aligns on labels (producing NaN or
            # mismatched pairs when the indexes differ).
            y_true = valid_partition['y'].to_numpy()
            y_pred = np.asarray(predictions['y_pred'])

            MAE = mean_absolute_error(y_true, y_pred)
            MAE_values.append(MAE)

            # Positive value = the model over-predicts on average
            mean_signed_errors.append(float(np.mean(y_pred - y_true)))

            print(f"Run {len(MAE_values)} MAE =", MAE)

        print("Mean Signed Error vals", mean_signed_errors)
        average_mae = statistics.mean(MAE_values)
        print("MAE Vals: MEAN:", average_mae, 'ALL:' , MAE_values)

        return MAE_values

    @abstractmethod
    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """
            Takes in single-index (datetime as index) df, and returns df with only desired features
        """
        pass

    @abstractmethod
    def train(self, df: pd.DataFrame):
        """
            Fits the model on df; df must contain the observed target `y`
        """
        pass

    @abstractmethod
    def predict(self, df: pd.DataFrame):
        """
            Runs trained model on input df, preprocessing the df first and then returns
            a dataframe with the predictions in column `y_pred`
        """
        pass

In [None]:
class CatBoostHenrik(MetaModel):
    """
        Single CatBoost regressor (MAE objective) on engineered time/weather features.

        Input dataframes must provide a datetime column `ds`, a
        `weather_data_type` column, `diffuse_rad_1h:J` and `direct_rad_1h:J`,
        plus every raw column named in `self.features`.
    """

    # Feature set used when the caller does not pass one. Order is kept
    # because it defines the column order of the matrix given to CatBoost.
    DEFAULT_FEATURES = (
        'sample_importance',
        'dayofyear',
        'hour',
        'total_rad_1h:J',
        'is_day:idx',
        'absolute_humidity_2m:gm3',
        'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J',
        'clear_sky_rad:W', 'cloud_base_agl:m', 'dew_or_rime:idx',
        'dew_point_2m:K', 'effective_cloud_cover:p', 'elevation:m',
        'fresh_snow_12h:cm', 'fresh_snow_1h:cm', 'fresh_snow_24h:cm',
        'fresh_snow_3h:cm', 'fresh_snow_6h:cm',
        'is_in_shadow:idx', 'msl_pressure:hPa', 'precip_5min:mm',
        'precip_type_5min:idx', 'pressure_100m:hPa', 'pressure_50m:hPa',
        'prob_rime:p', 'rain_water:kgm2', 'relative_humidity_1000hPa:p',
        'sfc_pressure:hPa', 'snow_density:kgm3', 'snow_depth:cm',
        'snow_drift:idx', 'snow_melt_10min:mm', 'snow_water:kgm2',
        'sun_azimuth:d', 'sun_elevation:d', 'super_cooled_liquid_water:kgm2',
        't_1000hPa:K', 'total_cloud_cover:p', 'visibility:m',
        'wind_speed_10m:ms', 'wind_speed_u_10m:ms', 'wind_speed_v_10m:ms',
        'wind_speed_w_1000hPa:ms',
    )

    def __init__(self, features=None, random_state=42):
        """
            features: column names to train on; None or empty selects DEFAULT_FEATURES
            random_state: seed for the internal train/eval split
        """
        super().__init__("CatBoost")

        self.random_state = random_state

        # Copy so later changes to the caller's list cannot alter this model
        if features:
            self.features = list(features)
        else:
            self.features = list(self.DEFAULT_FEATURES)

    def preprocess(self, df: pd.DataFrame):
        """
            Adds the engineered columns, replaces NaN with 0 and returns only
            `self.features` (plus `y` when the input has it). The input is not modified.
        """
        temp_df = df.copy()

        has_target = 'y' in temp_df.columns

        #####################################################################################
        # FEATURE ENGINEERING
        #####################################################################################
        timestamps = temp_df['ds'].dt
        month = timestamps.month

        # Emphasize the test period's time of year: rows in May, June and the
        # first four days of July get importance 2, all others 1.
        # Note: this column is consumed as an ordinary input feature (when it
        # is listed in self.features); it is not passed to CatBoost as a sample weight.
        in_may_or_june = (month >= 5) & (month < 7)
        in_early_july = (month == 7) & (timestamps.day <= 4)
        temp_df['sample_importance'] = 1
        temp_df.loc[in_may_or_june | in_early_july, 'sample_importance'] = 2

        # 1 when the weather data is flagged as estimated, else 0
        temp_df['is_estimated'] = (temp_df['weather_data_type'] == 'estimated').astype(int)

        temp_df['total_rad_1h:J'] = temp_df['diffuse_rad_1h:J'] + temp_df['direct_rad_1h:J']

        # Cyclical encodings: hour and month are scaled to [0, 1],
        # day-of-year stays in [-1, 1] with its zero crossing at day 80
        temp_df['hour'] = (np.sin(2 * np.pi * (timestamps.hour - 4) / 24) + 1) / 2
        temp_df['month'] = (np.sin(2 * np.pi * month / 12) + 1) / 2
        temp_df['dayofyear'] = np.sin(2 * np.pi * (timestamps.day_of_year - 80) / 365)

        # SETTING NAN TO 0 CONFORMING TO XGBOOST
        temp_df = temp_df.fillna(0)
        #####################################################################################

        # DROPPING UNNEEDED FEATURES
        wanted_columns = self.features + ['y'] if has_target else self.features

        return temp_df[wanted_columns]

    def train(self, df):
        """
            Fits a CatBoostRegressor on df (which must contain `y`).

            Rows without an observed `y` are dropped first: `preprocess`
            replaces every NaN with 0, which would otherwise turn missing
            targets into fabricated zero labels.
            90% of the rows are used for fitting, the remaining 10% are given
            to CatBoost as eval_set.
        """
        df = df.dropna(subset=['y'])
        temp_df = self.preprocess(df)

        # Separate features and target
        X = temp_df.drop(columns='y').values
        y = temp_df['y'].values

        # Train test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=self.random_state)

        # Parameters found through grid-search and some manual tuning (CatBoostRegressor.grid_search())
        params = {
            'objective': "MAE",
            'learning_rate': 0.02,
            'depth': 6,
            'iterations': 8000,
            'logging_level': 'Silent',
            'l2_leaf_reg': 5
        }

        self.model = cb.CatBoostRegressor(**params)

        # NOTE(review): logging_level='Silent' above and verbose=True here ask
        # for opposite things -- confirm which one CatBoost honours.
        self.model.fit(
             X_train,
             y_train,
             verbose=True,
             eval_set=(X_test, y_test),
        )

    def predict(self, df):
        """
            Preprocesses df and returns a dataframe with one column `y_pred`
            (negative predictions clipped to 0). The result has a fresh
            0..n-1 index, not the index of df.
        """
        df = self.preprocess(df)

        # `y` may be present when scoring a validation partition; never feed it to the model
        features = [col for col in df.columns if col != 'y']
        X = df[features].values
        y_preds = self.model.predict(X)

        # Set all negative predictions to 0
        y_preds = np.maximum(y_preds, 0)

        return pd.DataFrame(data={'y_pred': y_preds})

In [None]:
class CatCompositeHenrik(MetaModel):
    """
        Equally weighted ensemble of `num_models` CatBoostHenrik models.

        Every sub-model sees all `common_features` plus its own random 90%
        subset of `random_features`, and gets its own seed.
    """

    def __init__(self, num_models=20):
        """
            num_models: number of CatBoostHenrik sub-models to train and average
        """
        super().__init__("CatComposite Henrik")

        self.num_models = num_models
        # Filled by `train`; maps 'CATBOOST_<i>' -> fitted CatBoostHenrik
        self.models = dict()

        # Features every sub-model receives
        self.common_features = ['sample_importance', 'is_estimated','dayofyear',
                                'is_day:idx',
                             'hour', 'month',
                            'total_rad_1h:J',
                            'sun_elevation:d',
                            'sun_azimuth:d',
                            'is_in_shadow:idx',
                            'effective_cloud_cover:p']

        # Pool from which each sub-model draws its random subset
        self.random_features = ['absolute_humidity_2m:gm3',
                                'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J',
                                'clear_sky_rad:W', 'cloud_base_agl:m', 'dew_or_rime:idx',
                                'dew_point_2m:K', 'diffuse_rad:W', 'diffuse_rad_1h:J', 'direct_rad:W',
                                'direct_rad_1h:J', 'fresh_snow_3h:cm',
                                'precip_5min:mm','precip_type_5min:idx', 'rain_water:kgm2', 'relative_humidity_1000hPa:p',
                                'sfc_pressure:hPa','snow_water:kgm2',
                                'super_cooled_liquid_water:kgm2',
                                't_1000hPa:K', 'total_cloud_cover:p', 'visibility:m',
                                'wind_speed_10m:ms', 'wind_speed_u_10m:ms', 'wind_speed_v_10m:ms']

    def preprocess(self, df: pd.DataFrame):
        """
            Returns a copy of df; the sub-models do their own preprocessing.
        """
        return df.copy()

    def train(self, df: pd.DataFrame):
        """
            Builds and trains all sub-models on df (needs `ds`, `y` and the raw feature columns).

            Rows from May, June and July are appended a second time so those
            months count double during training. The input df is not modified.
        """
        num_rand_features = round(len(self.random_features) * 0.9)

        # DUPLICATE MONTHS WE PREDICT
        in_target_months = df['ds'].dt.month.isin([5, 6, 7])
        train_df = pd.concat([df, df[in_target_months]], ignore_index=True)

        self.models = dict()

        for i in range(self.num_models):
            seed = 33 + i
            # A private generator gives the same draw as seeding the global
            # `random` module, without disturbing other users of that module.
            rng = random.Random(seed)
            sampled_features = rng.sample(self.random_features, num_rand_features)
            self.models[f'CATBOOST_{i}'] = CatBoostHenrik(
                features=self.common_features + sampled_features,
                random_state=seed,
            )

        for key, model in self.models.items():
            print("Training model", key)
            model.train(train_df)

    def predict(self, df):
        """
            Returns a dataframe with one column `y_pred`: the row-wise mean of
            all sub-model predictions, clipped at 0.

            Raises:
                RuntimeError: if no sub-model has been trained yet
        """
        if not self.models:
            raise RuntimeError("CatCompositeHenrik has no trained sub-models; call train() first")

        # Get predictions from all the sub-models, one column per model
        all_preds = pd.DataFrame({
            key: model.predict(df)['y_pred'].to_numpy()
            for key, model in self.models.items()
        })

        # Take equally weighted average of models
        averaged = all_preds.mean(axis=1)

        # Make negative values 0
        averaged = np.maximum(averaged, 0)

        return averaged.to_frame(name='y_pred')


In [None]:

# Cross-validate a freshly constructed composite model on each location separately.
df = ml.data.get_training_cleaned()

for location in ('A', 'B', 'C'):
    print(f"###############  LOCATION {location} ###############")
    # Only the rows belonging to the current location
    df_location = df.loc[df['location'] == location]

    # New model per location so nothing is shared between locations
    cch = CatCompositeHenrik()
    cch.test(df_location)

In [None]:
# Generate submittable
# NOTE(review): presumably make_submittable trains the given model and writes
# its predictions to this CSV -- confirm in ml.utils.
# NOTE(review): the file name says lr0.03, while CatBoostHenrik.train uses
# learning_rate 0.02 -- confirm which one is intended.
ml.utils.make_submittable(
    "CatComposite_new_features_x20_lr0.03_iters8000_nothing_random_CHECK.csv",
    model=CatCompositeHenrik(num_models=20),
)