Célula 1 — Configuração inicial
Carregamento das bibliotecas Python

In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
# Redefine regression_metrics to fix the 'squared' argument issue
from xgboost import XGBRegressor
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

Célula 2 — Definição das colunas

In [2]:
# Electrical and harmonic features (as in Sensors-25-04601)
ELECTRICAL_COLS = [
    "irms", "vrms", "power_factor",
    "p_apparente", "p_active"
]

HARMONIC_COLS = [f"h{i}" for i in range(1, 33)]

FEATURE_COLS = ELECTRICAL_COLS + HARMONIC_COLS

Célula 3 — Loader oficial do dataset

In [3]:
def load_nilm_dataset(data_root):
    """
    Loads the NILM dataset organized by acquisition sessions and sensor-phase files.

    Dataset structure:
    data/
      ├── session_1/
      │    ├── S1P1.csv
      │    ├── ...
      ├── session_2/
           ├── ...

    Returns
    -------
    pd.DataFrame
        Unified dataframe with metadata columns.
    """
    data_root = Path(data_root)
    dfs = []

    for session_dir in sorted(data_root.iterdir()):
        if not session_dir.is_dir():
            continue

        session_name = session_dir.name

        for csv_file in sorted(session_dir.glob("*.csv")):
            filename = csv_file.stem  # e.g., S1P1
            sensor = filename.split("P")[0]      # S1
            phase = "P" + filename.split("P")[1] # P1

            df = pd.read_csv(csv_file)

            # Keep only relevant columns
            df = df[["time"] + FEATURE_COLS]

            # Metadata (important for analysis)
            df["session"] = session_name
            df["sensor"] = sensor
            df["phase"] = phase
            df["channel"] = f"{sensor}_{phase}"

            dfs.append(df)

    return pd.concat(dfs, ignore_index=True)

Célula 4 — Carregamento do dataset no Colab

In [4]:
# If using Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

DATA_PATH = "/content/drive/MyDrive/nilm-dataset-main/nilm-dataset-main/data"  # ajuste se necessário

df = load_nilm_dataset(DATA_PATH)

print("Dataset loaded successfully!")
print("Shape:", df.shape)
df.head()


Dataset loaded successfully!
Shape: (1612718, 42)


Unnamed: 0,time,irms,vrms,power_factor,p_apparente,p_active,h1,h2,h3,h4,...,h27,h28,h29,h30,h31,h32,session,sensor,phase,channel
0,124,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,05-12 8h,S1,P1,S1_P1
1,134,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,05-12 8h,S1,P1,S1_P1
2,144,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,05-12 8h,S1,P1,S1_P1
3,146,0.0,224.9,0.21,1,0,0.000816,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,05-12 8h,S1,P1,S1_P1
4,148,0.0,225.84,0.26,1,0,0.000816,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,05-12 8h,S1,P1,S1_P1


Célula 5 — Validação

In [5]:
assert all(col in df.columns for col in FEATURE_COLS), "Missing feature columns"
assert df.isnull().sum().sum() == 0, "Dataset contains missing values"

df.describe()

Unnamed: 0,time,irms,vrms,power_factor,p_apparente,p_active,h1,h2,h3,h4,...,h23,h24,h25,h26,h27,h28,h29,h30,h31,h32
count,1612718.0,1612718.0,1612718.0,1612718.0,1612718.0,1612718.0,1612718.0,1612718.0,1612718.0,1612718.0,...,1612718.0,1612718.0,1612718.0,1612718.0,1612718.0,1612718.0,1612718.0,1612718.0,1612718.0,1612718.0
mean,12858.49,0.2986589,239.1017,0.380029,71.15135,58.95274,0.1282875,0.004102651,0.03699963,0.001654518,...,0.00311622,0.000319259,0.002793487,0.0002743226,0.002315361,0.0002733475,0.001800771,0.0002670634,0.001627056,0.0002722632
std,7746.003,0.9129399,4.105178,0.3549899,213.4926,210.977,0.4519433,0.02587412,0.05607231,0.008418493,...,0.004776471,0.001035078,0.004339745,0.0007642839,0.003710752,0.00076293,0.002936258,0.0007873431,0.002513844,0.0006495557
min,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6256.0,0.0,238.05,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,12492.0,0.04,239.55,0.49,8.0,4.0,0.010613,0.0,0.008804,0.0,...,0.000998,0.0,0.000916,0.0,0.000698,0.0,0.000408,0.0,0.00058,0.0
75%,18935.0,0.19,240.81,0.67,45.0,40.0,0.088988,0.000711,0.028242,0.000704,...,0.003044,0.000408,0.002296,0.000408,0.001534,0.000408,0.001316,0.000407,0.001415,0.000409
max,28924.0,15.15,245.93,1.0,3239.0,3225.0,7.847237,0.225221,0.408677,0.097276,...,0.046089,0.045215,0.043593,0.042538,0.041018,0.040459,0.039192,0.038987,0.037916,0.037357


Célula 6 — Preparação para treino

In [6]:
X = df[FEATURE_COLS].values
y = df["p_active"].values  # or appliance label if available


Célula 7 — Split por sessão

In [7]:
from sklearn.model_selection import train_test_split

train_sessions = df["session"].unique()[:int(0.7 * df["session"].nunique())]

train_idx = df["session"].isin(train_sessions)
test_idx  = ~train_idx

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

Célula 8 — Normalização e scaling

In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

Célula 9 — Métricas padrão NILM

In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np # Import numpy for sqrt

def regression_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred) # Calculate MSE
    rmse = np.sqrt(mse) # Take square root for RMSE
    nrmse = rmse / (y_true.max() - y_true.min())
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, nrmse, r2

1. Linear Regression (baseline)

In [26]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
mae_lr, rmse_lr, nrmse_lr, r2_lr = regression_metrics(y_test, y_pred_lr)

In [27]:
mae_lr, rmse_lr, nrmse_lr, r2_lr


(5.454709424407734e-14,
 np.float64(1.1574391255632856e-13),
 np.float64(4.006365959028334e-17),
 1.0)

2. KNN Regressor

In [22]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=5, weights="distance")
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
mae_knn, rmse_knn, nrmse_knn, r2_knn = regression_metrics(y_test, y_pred_knn)

In [23]:
mae_knn, rmse_knn, nrmse_knn, r2_knn

(0.042942699740298705,
 np.float64(0.40833828482527174),
 np.float64(0.00014134243157676418),
 0.9999961152955421)

3. Decision Tree Regressor

In [20]:
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(max_depth=10, random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
mae_dt, rmse_dt, nrmse_dt, r2_dt = regression_metrics(y_test, y_pred_dt)


In [21]:
mae_dt, rmse_dt, nrmse_dt, r2_dt

(0.05800850500350839,
 np.float64(1.0204319626195448),
 np.float64(0.00035321286348893903),
 0.9999757402740657)

4. Random Forest Regressor

In [18]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
mae_rf, rmse_rf, nrmse_rf, r2_rf = regression_metrics(y_test, y_pred_rf)


In [19]:
mae_rf, rmse_rf, nrmse_rf, r2_rf

(0.013933373837777886,
 np.float64(0.5539807323516417),
 np.float64(0.0001917551859991837),
 0.9999928499825188)

5. Gradient Boosting Regressor

In [16]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, random_state=42)
gbr.fit(X_train, y_train)
y_pred_gbr = gbr.predict(X_test)
mae_gbr, rmse_gbr, nrmse_gbr, r2_gbr = regression_metrics(y_test, y_pred_gbr)


In [17]:
mae_gbr, rmse_gbr, nrmse_gbr, r2_gbr

(0.049121052234671184,
 np.float64(0.33413370367223505),
 np.float64(0.00011565721830122362),
 0.9999973988921923)

6. XGBoost Regressor

In [14]:
from xgboost import XGBRegressor

xgb = XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    objective="reg:squarederror",
    random_state=42
)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
mae_xgb, rmse_xgb, nrmse_xgb, r2_xgb = regression_metrics(y_test, y_pred_xgb)


In [15]:
mae_xgb, rmse_xgb, nrmse_xgb, r2_xgb

(0.5797379016876221,
 np.float64(9.633087723536804),
 np.float64(0.0033344021196042932),
 0.997838020324707)

7. LightGBM Regressor

In [12]:
import lightgbm as lgb

lgbm = lgb.LGBMRegressor(
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42
)
lgbm.fit(X_train, y_train)
y_pred_lgbm = lgbm.predict(X_test)
mae_lgbm, rmse_lgbm, nrmse_lgbm, r2_lgbm = regression_metrics(y_test, y_pred_lgbm)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.282681 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9246
[LightGBM] [Info] Number of data points in the train set: 1064630, number of used features: 37
[LightGBM] [Info] Start training from score 58.951282




In [13]:
mae_lgbm, rmse_lgbm, nrmse_lgbm, r2_lgbm

(0.17686805104438194,
 np.float64(1.8541851573968005),
 np.float64(0.0006418086387666323),
 0.9999199015736179)

8. MLP Regressor

In [10]:
from sklearn.neural_network import MLPRegressor

mlp = MLPRegressor(
    hidden_layer_sizes=(128, 64),
    activation="relu",
    max_iter=300,
    random_state=42
)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)
mae_mlp, rmse_mlp, nrmse_mlp, r2_mlp = regression_metrics(y_test, y_pred_mlp)


In [11]:
mae_mlp, rmse_mlp, nrmse_mlp, r2_mlp

(0.02974392013132596,
 np.float64(0.06796671683879636),
 np.float64(2.3526035596675792e-05),
 0.9999998923756933)

9. 2seqpoint

In [None]:
def create_seq2point_windows(X, y, window_size=99):
    X_seq, y_seq = [], []
    half = window_size // 2

    for i in range(half, len(X) - half):
        X_seq.append(X[i-half:i+half+1])
        y_seq.append(y[i])

    return np.array(X_seq), np.array(y_seq)

WINDOW_SIZE = 49
BATCH_SIZE = 64
X = X.astype(np.float32)
y = y.astype(np.float32)


X_train_seq, y_train_seq = create_seq2point_windows(X_train_scaled, y_train, WINDOW_SIZE)
X_test_seq,  y_test_seq  = create_seq2point_windows(X_test_scaled,  y_test,  WINDOW_SIZE)



class Seq2Point(nn.Module):
    def __init__(self, n_features):
        super().__init__()
        self.conv1 = nn.Conv1d(n_features, 30, kernel_size=10)
        self.conv2 = nn.Conv1d(30, 30, kernel_size=8)
        #self.fc1 = nn.Linear(30 * 82, 1024)
        self.fc1 = nn.Linear(990, 1024)

        self.fc2 = nn.Linear(1024, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = x.permute(0, 2, 1)
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        return self.fc2(x).squeeze()

model = Seq2Point(X_train_seq.shape[2])

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

train_loader = DataLoader(
    TensorDataset(
        torch.tensor(X_train_seq, dtype=torch.float32),
        torch.tensor(y_train_seq, dtype=torch.float32)
    ),
    batch_size=64,
    shuffle=True
)


optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()

EPOCHS = 10

for epoch in range(EPOCHS):
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()

model.eval()
with torch.no_grad():
    y_pred_seq = model(
        torch.tensor(X_test_seq, dtype=torch.float32).to(device)
    ).cpu().numpy()

mae_seq, rmse_seq, nrmse_seq, r2_seq = regression_metrics(y_test_seq, y_pred_seq)

Tabela Final

In [28]:
import pandas as pd

results = pd.DataFrame({
    "MAE": {
        "Linear Regression": mae_lr,
        "KNN": mae_knn,
        "Decision Tree": mae_dt,
        "Random Forest": mae_rf,
        "Gradient Boosting": mae_gbr,
        "XGBoost": mae_xgb,
        "LightGBM": mae_lgbm,
        "MLP": mae_mlp

    },
    "RMSE": {
        "Linear Regression": rmse_lr,
        "KNN": rmse_knn,
        "Decision Tree": rmse_dt,
        "Random Forest": rmse_rf,
        "Gradient Boosting": rmse_gbr,
        "XGBoost": rmse_xgb,
        "LightGBM": rmse_lgbm,
        "MLP": rmse_mlp
    },
    "NRMSE": {
        "Linear Regression": nrmse_lr,
        "KNN": nrmse_knn,
        "Decision Tree": nrmse_dt,
        "Random Forest": nrmse_rf,
        "Gradient Boosting": nrmse_gbr,
        "XGBoost": nrmse_xgb,
        "LightGBM": nrmse_lgbm,
        "MLP": nrmse_mlp

    },
    "R2": {
        "Linear Regression": r2_lr,
        "KNN": r2_knn,
        "Decision Tree": r2_dt,
        "Random Forest": r2_rf,
        "Gradient Boosting": r2_gbr,
        "XGBoost": r2_xgb,
        "LightGBM": r2_lgbm,
        "MLP": r2_mlp

    }
})

results


Unnamed: 0,MAE,RMSE,NRMSE,R2
Linear Regression,5.454709e-14,1.157439e-13,4.0063660000000005e-17,1.0
KNN,0.0429427,0.4083383,0.0001413424,0.999996
Decision Tree,0.05800851,1.020432,0.0003532129,0.999976
Random Forest,0.01393337,0.5539807,0.0001917552,0.999993
Gradient Boosting,0.04912105,0.3341337,0.0001156572,0.999997
XGBoost,0.5797379,9.633088,0.003334402,0.997838
LightGBM,0.1768681,1.854185,0.0006418086,0.99992
MLP,0.02974392,0.06796672,2.352604e-05,1.0
