In [97]:
from pathlib import Path
from typing import Union

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

from sklearn.experimental import enable_iterative_imputer  # noqa, isort: skip
from sklearn.impute import IterativeImputer  # isort: skip

In [98]:
SAVE_MODEL_DIR = Path(Path.cwd().parent, "models", "saved_model")
SAVED_FORECAST_DIR = Path(
    Path.cwd().parent,
    "data.nosync",
    "outputs",
    "weather_forecast",
    "forecast_2023_07_23.parquet",
)

In [99]:
model = tf.keras.models.load_model(SAVE_MODEL_DIR)

In [100]:
forecast_df = pd.read_parquet(SAVED_FORECAST_DIR)
forecast_df.head()

Unnamed: 0,Area,Lat,Lon,Grid Zone,PTID,Lat_Lon,Time Stamp,temp,Year,Month,Day,Minute,Hour,Max_Temp,Min_Temp
0,ALB,42.65258,-73.756233,CAPITL,61757.0,"[42.652580, -73.756233]",2023-07-23 00:00:00,19.9,2023,7,23,0,0,28.7,15.8
1,ALB,42.65258,-73.756233,CAPITL,61757.0,"[42.652580, -73.756233]",2023-07-23 01:00:00,19.0,2023,7,23,5,1,28.7,15.8
2,ALB,42.65258,-73.756233,CAPITL,61757.0,"[42.652580, -73.756233]",2023-07-23 02:00:00,18.2,2023,7,23,10,2,28.7,15.8
3,ALB,42.65258,-73.756233,CAPITL,61757.0,"[42.652580, -73.756233]",2023-07-23 03:00:00,17.4,2023,7,23,15,3,28.7,15.8
4,ALB,42.65258,-73.756233,CAPITL,61757.0,"[42.652580, -73.756233]",2023-07-23 04:00:00,16.7,2023,7,23,20,4,28.7,15.8


# Model preprocessing

In [101]:
class PreprocessingTransformer:
    """
    A custom transformer for X_train and X_test data,
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df

    # Rounding data

    def round_data(self):
        """Round the Max, Min  temperatures to 2 decimal places.
        The original values have 6 sig figs which are inaccurate, unecessary, and may slow
        down future calculations.
        """
        self.df["Max_Temp"] = self.df["Max_Temp"].round(2)
        self.df["Min_Temp"] = self.df["Min_Temp"].round(2)

    def drop_unused_cols(self):
        """Drops the 'Name' column from the dataframe.
        Each name also has a corresponding numerical PTID which will be used for identification instead.
        """
        self.df = self.df.drop(
            ["Area", "Lat", "Lon", "Grid Zone", "Lat_Lon", "temp", "index"],
            axis=1,
            errors="ignore",
        ).reset_index()  # Each name has a unique PTID

    # Cyclical Transformations
    def encode_cyclical(self):
        def encode_sin_cos(
            data: pd.DataFrame, col: str, max_val: Union[int, float]
        ) -> pd.DataFrame:
            """Create two new columns within a given dataframe to encode specified cols with sin and cos transformations

            Args:
                data (pd.DataFrame): Dataframe containing cols to encode
                col (str): column to encode (month, yeah, minute etc)
                max_val (Union[int, float]): maximum value of the given column

            Returns:
                pd.DataFrame: original dataframe with additional columns
            """
            data[col + "_sin"] = np.sin(2 * np.pi * data[col] / max_val)
            data[col + "_cos"] = np.cos(2 * np.pi * data[col] / max_val)
            return data

        self.df = encode_sin_cos(self.df, "Month", self.df["Month"].max())
        self.df = encode_sin_cos(self.df, "Day", self.df["Day"].max())
        self.df = encode_sin_cos(self.df, "Minute", self.df["Minute"].max())
        self.df = encode_sin_cos(self.df, "Hour", self.df["Hour"].max())

    # Handling dates
    def convert_dates_to_int(self, date_col: str = "Time Stamp"):
        """Converts a given timestamp column to integers

        Args:
            date_col (str, optional): Name of timestamp column in df. Defaults to "Time Stamp".
        """
        self.df[date_col] = self.df[date_col].astype(int)

    def convert_int_to_date(self, date_col: str = "Time Stamp"):
        """
        Converts a given ineteger column to timestamps
        Args:
            date_col (str, optional): Name of timestamp column in df. Defaults to "Time Stamp".
        """
        self.df[date_col] = pd.to_datetime(self.df[date_col])

    # Imputing
    def impute_missing_vals(self):
        """Imputes missing values using the temperature columns.
        Only missing rows in this dataset are in the min wet bulb column, therefore
        the data is only imputed using the other temperature columns.
        """
        imp = IterativeImputer(max_iter=5, random_state=0)
        df_temp = self.df[["Min_Temp", "Max_Temp", "Min Wet Bulb", "Max Wet Bulb"]]
        df_non_temp_cols = [x for x in self.df.columns.to_list() if x not in df_temp]
        df_temp = df_temp.reset_index(drop=True)
        df_non_temp = self.df[df_non_temp_cols]
        imputed = imp.fit_transform(df_temp)

        df_imputed = pd.DataFrame(imputed, columns=df_temp.columns)
        self.df = pd.concat([df_imputed, df_non_temp], axis=1)
        del df_imputed
        del df_non_temp
        assert self.df.isna().sum().sum() == 0

    # Scaling
    def scale_vals(self):
        """
        Scales all data with the standardscaler trnasformer.
        """
        transformer = StandardScaler()
        df_scaled = transformer.fit_transform(self.df)
        self.df = pd.DataFrame(df_scaled, columns=self.df.columns)
        del df_scaled

In [102]:
preprocessing = PreprocessingTransformer(forecast_df)
preprocessing.round_data()
preprocessing.encode_cyclical()
preprocessing.convert_dates_to_int()
preprocessing.drop_unused_cols()
preprocessing.scale_vals()

In [103]:
preprocessed_df = preprocessing.df
preprocessed_df = preprocessed_df.drop("index", axis=1, errors="ignore")

In [109]:
reorder_cols = [
    "Min_Temp",
    "Max_Temp",
    "Time Stamp",
    "PTID",
    "Year",
    "Month",
    "Day",
    "Minute",
    "Hour",
    "Month_sin",
    "Month_cos",
    "Day_sin",
    "Day_cos",
    "Minute_sin",
    "Minute_cos",
    "Hour_sin",
    "Hour_cos",
]
preprocessed_df = preprocessed_df[reorder_cols]

In [110]:
preprocessed_df.head()

Unnamed: 0,Min_Temp,Max_Temp,Time Stamp,PTID,Year,Month,Day,Minute,Hour,Month_sin,Month_cos,Day_sin,Day_cos,Minute_sin,Minute_cos,Hour_sin,Hour_cos
0,-0.206176,-0.696813,-1.708159,0.122169,0.0,0.0,-1.224745,-1.593255,-1.661325,0.0,0.0,-1.211291,-1.371354,-2.3044460000000002e-17,1.253566,5.031723e-17,1.330124
1,-0.206176,-0.696813,-1.660042,0.122169,0.0,0.0,-1.224745,-1.303572,-1.516862,0.0,0.0,-1.211291,-1.371354,0.7985794,1.036476,0.3897566,1.278655
2,-0.206176,-0.696813,-1.611925,0.122169,0.0,0.0,-1.224745,-1.01389,-1.372399,0.0,0.0,-1.211291,-1.371354,1.343616,0.454131,0.7506067,1.128065
3,-0.206176,-0.696813,-1.563808,0.122169,0.0,0.0,-1.224745,-0.724207,-1.227936,0.0,0.0,-1.211291,-1.371354,1.462063,-0.30858,1.055788,0.889522
4,-0.206176,-0.696813,-1.515691,0.122169,0.0,0.0,-1.224745,-0.434524,-1.083473,0.0,0.0,-1.211291,-1.371354,1.116316,-1.0095,1.282666,0.580718


In [112]:
preds = model.predict(preprocessed_df)



In [113]:
preprocessed_df["pred_load"] = preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preprocessed_df['pred_load'] = preds


In [114]:
preprocessed_df

Unnamed: 0,Min_Temp,Max_Temp,Time Stamp,PTID,Year,Month,Day,Minute,Hour,Month_sin,Month_cos,Day_sin,Day_cos,Minute_sin,Minute_cos,Hour_sin,Hour_cos,pred_load
0,-0.206176,-0.696813,-1.708159,0.122169,0.0,0.0,-1.224745,-1.593255,-1.661325,0.0,0.0,-1.211291,-1.371354,-2.304446e-17,1.253566,5.031723e-17,1.330124,951.806641
1,-0.206176,-0.696813,-1.660042,0.122169,0.0,0.0,-1.224745,-1.303572,-1.516862,0.0,0.0,-1.211291,-1.371354,7.985794e-01,1.036476,3.897566e-01,1.278655,939.396729
2,-0.206176,-0.696813,-1.611925,0.122169,0.0,0.0,-1.224745,-1.013890,-1.372399,0.0,0.0,-1.211291,-1.371354,1.343616e+00,0.454131,7.506067e-01,1.128065,923.488037
3,-0.206176,-0.696813,-1.563808,0.122169,0.0,0.0,-1.224745,-0.724207,-1.227936,0.0,0.0,-1.211291,-1.371354,1.462063e+00,-0.308580,1.055788e+00,0.889522,911.917236
4,-0.206176,-0.696813,-1.515691,0.122169,0.0,0.0,-1.224745,-0.434524,-1.083473,0.0,0.0,-1.211291,-1.371354,1.116316e+00,-1.009500,1.282666e+00,0.580718,886.243652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1219,-0.559621,-0.448299,1.515691,-0.223977,0.0,0.0,1.224745,0.434524,1.083473,0.0,0.0,1.237770,0.984911,-1.116316e+00,-1.009500,-1.282666e+00,0.580718,1085.471436
1220,-0.559621,-0.448299,1.563808,-0.223977,0.0,0.0,1.224745,0.724207,1.227936,0.0,0.0,1.237770,0.984911,-1.462063e+00,-0.308580,-1.055788e+00,0.889522,1059.813232
1221,-0.559621,-0.448299,1.611925,-0.223977,0.0,0.0,1.224745,1.013890,1.372399,0.0,0.0,1.237770,0.984911,-1.343616e+00,0.454131,-7.506067e-01,1.128065,1014.822754
1222,-0.559621,-0.448299,1.660042,-0.223977,0.0,0.0,1.224745,1.303572,1.516862,0.0,0.0,1.237770,0.984911,-7.985794e-01,1.036476,-3.897566e-01,1.278655,954.227539
