In [63]:
import numpy as np
import pandas as pd
import sklearn

# Ask scikit-learn transformers to return pandas DataFrames rather than arrays.
sklearn.set_config(transform_output="pandas")

df = pd.read_csv("recs2020_public_v7.csv")

# NOTE(review): no row filter on KWH is applied here; the model below is
# trained on every row of the file.

# Columns used directly from the file; "KWH" is the regression target.
base_cols = ["NCOMBATH", "TOTROOMS", "NHSLDMEM", "ATHOME", "KWH"]

# Columns that are added together into the single NUM_DEVICES feature.
# NOTE(review): "ZFREEZER" looks like an imputation-flag column ("Z" prefix)
# rather than a device count, and some of these columns may carry negative
# survey codes (e.g. "not applicable") that are summed as-is below.
# Confirm both against the dataset codebook.
device_cols = [
    "TVCOLOR",
    "OVEN",
    "COOKTOP",
    "MICRO",
    "ZFREEZER",
    "DESKTOP",
    "NUMLAPTOP",
    "DVD"
]

# Device columns that are absent from the loaded file are skipped silently.
device_cols_in_df = [col for col in device_cols if col in df.columns]

# Missing values are treated as zero before the row-wise sum.
df["NUM_DEVICES"] = df[device_cols_in_df].fillna(0).sum(axis=1)
model_df = df[base_cols + ["NUM_DEVICES"]]

# Predictor names, in the same order as the non-target columns of model_df.
features = ["NCOMBATH", "TOTROOMS", "NHSLDMEM", "ATHOME", "NUM_DEVICES"]

model_df.head()

Unnamed: 0,NCOMBATH,TOTROOMS,NHSLDMEM,ATHOME,KWH,NUM_DEVICES
0,3,8,2,0,12521.48,6
1,1,3,1,5,5243.05,3
2,1,4,1,3,2387.64,0
3,2,9,2,5,9275.07,4
4,1,3,2,5,5869.7,4


In [64]:
from sklearn.model_selection import train_test_split

# The target is the "KWH" column; every remaining column is a predictor.
y = model_df.loc[:, "KWH"]

# NOTE: `df` is rebound here from the raw file contents to the feature matrix.
df = model_df.drop("KWH", axis=1)

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.3,
    random_state=0,
)

In [66]:
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression


def evaluate_model(
    y_train,
    y_test,
    y_pred_train,
    y_pred_test
) -> None:
    """Print RMSE and MAE for the training and the test split.

    Args:
        y_train: True target values of the training split.
        y_test: True target values of the test split.
        y_pred_train: Predictions made for the training split.
        y_pred_test: Predictions made for the test split.

    Returns:
        None. Two lines are written to standard output, one per split.
    """
    # The "Test " label carries a trailing space so both lines align.
    splits = (
        ("Train", y_train, y_pred_train),
        ("Test ", y_test, y_pred_test),
    )

    # All metrics are computed before anything is printed.
    report = [
        (
            label,
            root_mean_squared_error(actual, predicted),
            mean_absolute_error(actual, predicted),
        )
        for label, actual, predicted in splits
    ]

    for label, rmse, mae in report:
        print(f"{label} RMSE {rmse:.2f}, MAE: {mae:.2f}")

# Fit a linear regression on the training split (fit returns the estimator).
reg_linear = LinearRegression().fit(X_train, y_train)

# Predict on both splits so train and test errors can be compared.
y_pred_train, y_pred_test = (
    reg_linear.predict(X_train),
    reg_linear.predict(X_test),
)

evaluate_model(
    y_train=y_train,
    y_test=y_test,
    y_pred_train=y_pred_train,
    y_pred_test=y_pred_test,
)

Train RMSE 5959.49, MAE: 4347.88
Test  RMSE 6369.59, MAE: 4387.47


In [56]:
# One-row frame whose columns match the predictors the model was fitted on.
# NOTE(review): this cell's execution count (56) is lower than the training
# cells (63-66), and these inputs look like normalised values while the
# regression visible here is fitted on unscaled columns - confirm the
# intended scale and re-run this cell after training.
new_data = pd.DataFrame(
    [
        {
            "NCOMBATH": 0.5,
            "TOTROOMS": 0.2,
            "NHSLDMEM": 0.5,
            "ATHOME": 0.5,
            "NUM_DEVICES": 0.14,
        }
    ]
)

predicted_kwh = reg_linear.predict(new_data)
print(f"Przewidywane zużycie energii (KWH): {predicted_kwh[0]:.2f}")

Przewidywane zużycie energii (KWH): 2434.23
