In [1]:
import os

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Location of the raw household Excel export. Set the HOUSEHOLD_DATA_PATH
# environment variable to point at the file on another machine; when it is
# unset the original local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "HOUSEHOLD_DATA_PATH",
    "C:/Users/SK-TECH/Downloads/household_new.xlsx",
)
df = pd.read_excel(DATA_PATH)

In [2]:
# Columns removed from the raw frame by the "drop_columns" pipeline step.
columns_to_drop = [
    "cet_cest_timestamp",
    "area_offices",
    "area_room_1",
    "area_room_2",
    "area_room_3",
    "area_room_4",
    "compressor",
    "cooling_aggregate",
    "cooling_pumps",
    "dishwasher",
    "ev",
    "grid_import",
    "pv_facade",
    "pv_roof",
    "refrigerator",
    "ventilation",
]

# NOTE(review): presumably the column names for the encoded output
# (timestamp, power_diff, one indicator per machine) - not referenced in the
# visible cells, confirm against the consuming script.
training_columns = [
    "utc_timestamp",
    "power_diff",
    "machine_1",
    "machine_2",
    "machine_3",
    "machine_4",
    "machine_5",
]

# NOTE(review): presumably the column names of the long-format frame before
# encoding - not referenced in the visible cells, confirm against the caller.
normal_column = [
    "utc_timestamp",
    "machine_col",
    "power",
    "machine_id",
    "power_diff",
]

In [3]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed set of columns.

    Parameters
    ----------
    columns_to_drop : label or list of labels
        Column names removed from the frame in ``transform``.
    """

    def __init__(self, columns_to_drop):
        # Stored verbatim under the same name as the constructor argument.
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame without the configured columns."""
        unwanted = self.columns_to_drop
        return X.drop(unwanted, axis=1)

In [4]:
class Drop_na(BaseEstimator, TransformerMixin):
    """Stateless transformer that discards rows containing missing values."""

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` keeping only rows without any missing value."""
        snapshot = X.copy()
        complete_rows = snapshot.dropna()
        return complete_rows

In [5]:
class Melt_data(BaseEstimator, TransformerMixin):
    """Reshape wide per-machine columns into long format with ``DataFrame.melt``.

    With the default arguments the five ``machine_1`` .. ``machine_5`` columns
    are unpivoted against ``utc_timestamp``: the source column name goes into
    ``machine_col`` and its value into ``power``.

    Parameters
    ----------
    id_vars : label or list of labels, optional
        Identifier column(s) kept on every output row.
        ``None`` (default) means ``["utc_timestamp"]``.
    value_vars : label or list of labels, optional
        Columns to unpivot. ``None`` (default) means
        ``["machine_1", ..., "machine_5"]``.
    var_name : str, default "machine_col"
        Name of the output column holding the source column names.
    value_name : str, default "power"
        Name of the output column holding the values.
    """

    def __init__(self, id_vars=None, value_vars=None,
                 var_name="machine_col", value_name="power"):
        # Arguments are stored untouched (scikit-learn estimator convention);
        # the None defaults are resolved lazily in transform().
        self.id_vars = id_vars
        self.value_vars = value_vars
        self.var_name = var_name
        self.value_name = value_name

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Return the long-format frame; ``X`` itself is not modified."""
        id_vars = ["utc_timestamp"] if self.id_vars is None else self.id_vars
        if self.value_vars is None:
            value_vars = ["machine_1", "machine_2", "machine_3", "machine_4", "machine_5"]
        else:
            value_vars = self.value_vars
        # melt() builds a new frame, so no defensive copy of X is needed.
        return X.melt(
            id_vars=id_vars,
            value_vars=value_vars,
            var_name=self.var_name,
            value_name=self.value_name,
        )

In [6]:
class Sort(BaseEstimator, TransformerMixin):
    """Order rows chronologically by ``utc_timestamp`` and renumber the index."""

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame sorted by ``utc_timestamp`` with a fresh 0..n-1 index.

        A stable sort is requested explicitly: pandas' default for a
        single-column sort is quicksort, which does not guarantee the relative
        order of rows sharing the same timestamp. With ``kind="stable"`` such
        rows keep the order they arrived in, making the output reproducible.
        """
        # sort_values() returns a new frame, so X is left untouched without a copy.
        return X.sort_values("utc_timestamp", kind="stable").reset_index(drop=True)

In [7]:
class Sort_For_Machine(BaseEstimator, TransformerMixin):
    """Order rows by ``machine_id`` first, then by ``utc_timestamp``."""

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a sorted copy of ``X`` with a fresh 0..n-1 index."""
        sort_keys = ["machine_id", "utc_timestamp"]
        ordered = X.copy().sort_values(by=sort_keys)
        return ordered.reset_index(drop=True)

In [8]:
class Extract_machine_id(BaseEstimator, TransformerMixin):
    """Add an integer ``machine_id`` column derived from ``machine_col``."""

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``machine_id`` added.

        The text ``"machine_"`` is removed from each ``machine_col`` value and
        the remainder is cast to ``int``.
        """
        result = X.copy()
        labels = result["machine_col"]
        numeric_part = labels.str.replace("machine_", "")
        result["machine_id"] = numeric_part.astype(int)
        return result

In [9]:
class Calculate_power_diff(BaseEstimator, TransformerMixin):
    """Add ``power_diff``: the change in ``power`` between consecutive rows of a machine."""

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``power_diff`` column added.

        The difference is taken in the current row order within each
        ``machine_id`` group, so the caller decides what "previous" means by
        sorting beforehand. The first row of every group has no predecessor
        and therefore gets NaN.
        """
        result = X.copy()
        power_by_machine = result.groupby("machine_id")["power"]
        result["power_diff"] = power_by_machine.diff()
        return result

In [10]:
class Parse_data(BaseEstimator, TransformerMixin):
    """Convert the ``utc_timestamp`` column to pandas datetimes."""

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ``utc_timestamp`` went through ``pd.to_datetime``."""
        parsed = pd.to_datetime(X["utc_timestamp"])
        # assign() works on a copy and replaces the existing column in place.
        return X.assign(utc_timestamp=parsed)

In [None]:
# Time-of-day features: the hour of each timestamp is encoded cyclically as a
# cosine/sine pair (values in [-1, 1], so hour 23 sits next to hour 0), plus a
# binary night-time flag.
import numpy as np
class Time_Scaling(BaseEstimator, TransformerMixin):
    """Add ``hour``, ``hour_cos``, ``hour_sin`` and ``is_night`` columns.

    * ``hour``     - hour of day taken from ``utc_timestamp`` (0-23).
    * ``hour_cos`` / ``hour_sin`` - cos/sin of ``2*pi*hour/24``, rounded to
      4 decimals.
    * ``is_night`` - 1 when ``hour >= 20`` or ``hour < 6``, else 0.
    """

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the time-of-day feature columns added."""
        X = X.copy()
        # to_datetime leaves datetime columns as they are and additionally
        # lets this step accept timestamps that have not been parsed yet.
        X["hour"] = pd.to_datetime(X["utc_timestamp"]).dt.hour
        # Keep the angle exact and round the *features*. Rounding the angle
        # first (as was done before) shifts points such as hour 6 or 12 off
        # their exact value (about -3.7e-6 / -7.3e-6 instead of 0) and leaves
        # the outputs themselves unrounded.
        angle = 2 * np.pi * X["hour"] / 24
        X["hour_cos"] = np.cos(angle).round(4)
        X["hour_sin"] = np.sin(angle).round(4)
        X["is_night"] = ((X["hour"] >= 20) | (X["hour"] < 6)).astype(int)
        return X


In [11]:
ct = ColumnTransformer(
    transformers=[
        ('passthrough', 'passthrough', ['utc_timestamp', 'power_diff']),
        ('machine', OneHotEncoder(
            handle_unknown='ignore', sparse_output=False  # Dense for sanity
        ), ['machine_id'])
    ],
    remainder='drop'
)

In [12]:
# DataFrame-level preprocessing bundled into one Pipeline so the same steps
# can be reused from other scripts. Step order matters: machine_id must exist
# before the per-machine sort, and power_diff is computed on rows that are
# already ordered by machine and time.
# NOTE(review): drop_na runs before calculate_power_diff, so the NaN that
# diff() yields for the first row of each machine is still present in the
# output - confirm this is intended.
My_Pipeline = Pipeline(
    steps=[
        # Remove the columns listed in columns_to_drop.
        ("drop_columns", ColumnDropper(columns_to_drop)),
        # Convert utc_timestamp to datetime.
        ("parse_dates", Parse_data()),
        # Wide -> long: one row per (timestamp, machine column).
        ("melt_data", Melt_data()),
        # Discard rows that contain missing values.
        ("drop_na", Drop_na()),
        # Derive the integer machine_id (required by the next sort).
        ("extract_machine_id", Extract_machine_id()),
        # Order rows by machine, then by time.
        ("sort_for_machine", Sort_For_Machine()),
        # Per-machine difference between consecutive power readings.
        ("calculate_power_diff", Calculate_power_diff()),
        # Final chronological sort, done while the data is still a DataFrame.
        ("final_sort", Sort()),
    ]
)

In [13]:
# Encoding stage used by Final_Pipeline: keep utc_timestamp and power_diff
# as they are, one-hot encode machine_id, drop all remaining columns.
# NOTE(review): this repeats the `ct` definition from an earlier cell.
ct = ColumnTransformer(
    remainder="drop",
    transformers=[
        ("passthrough", "passthrough", ["utc_timestamp", "power_diff"]),
        (
            "machine",
            OneHotEncoder(
                sparse_output=False,  # dense array instead of a sparse matrix
                handle_unknown="ignore",
            ),
            ["machine_id"],
        ),
    ],
)

In [14]:
# End-to-end preprocessing: the DataFrame steps of My_Pipeline followed by
# the column-wise encoding defined in ct.
Final_Pipeline = Pipeline(
    steps=[
        ("first_pipeline", My_Pipeline),
        ("column_transformer", ct),
    ]
)

In [15]:
# Persist the preprocessing pipeline object itself (not transformed data) so
# other scripts can load and reuse it.
# NOTE(review): no fit() call on Final_Pipeline is visible before this dump -
# confirm that saving it unfitted is intended.
import joblib
joblib.dump(value=Final_Pipeline, filename="preprocessing_pipeline_new.joblib")

['preprocessing_pipeline_new.joblib']