In [7]:
import pandas as pd
import matplotlib.pyplot as plt

In [8]:
# Read the simulated energy dataset (path is relative to the working directory)
csv_path = "simulated_energy_data.csv"
simulated_data = pd.read_csv(csv_path)

# Preview the first rows as a quick sanity check on columns and values
print(simulated_data.head())

         Date  Energy_Consumption_kWh  Temperature_C  Electricity_Price_€/kWh
0  2023-01-01               14.363503      10.444106                 0.165706
1  2023-01-02               28.767858      33.447623                 0.425830
2  2023-01-03               23.299849      31.214026                 0.366079
3  2023-01-04               19.966462       2.831645                 0.309226
4  2023-01-05                8.900466      -2.225548                 0.243532


In [9]:
# Count missing values per column (isna is the same check as isnull)
simulated_data.isna().sum()

Date                       0
Energy_Consumption_kWh     0
Temperature_C              0
Electricity_Price_€/kWh    0
dtype: int64

In [12]:
# Correlation is only defined for numeric data, so keep just the
# float64/int64 columns (this leaves out the 'Date' column).
numeric_dtypes = ["float64", "int64"]
numeric_columns = simulated_data.select_dtypes(include=numeric_dtypes)

# Pairwise correlation between the remaining columns
correlation_matrix = numeric_columns.corr()
print(correlation_matrix)

                         Energy_Consumption_kWh  Temperature_C  \
Energy_Consumption_kWh                 1.000000       0.006111   
Temperature_C                          0.006111       1.000000   
Electricity_Price_€/kWh                0.021659      -0.050211   

                         Electricity_Price_€/kWh  
Energy_Consumption_kWh                  0.021659  
Temperature_C                          -0.050211  
Electricity_Price_€/kWh                 1.000000  


In [13]:
# Parse the 'Date' strings once, then derive the calendar features from the
# parsed values.
parsed_dates = pd.to_datetime(simulated_data["Date"])
simulated_data["Date"] = parsed_dates

# Month is 1-12; Weekday counts from Monday=0 to Sunday=6
simulated_data["Month"] = parsed_dates.dt.month
simulated_data["Weekday"] = parsed_dates.dt.weekday

# Preview the frame with the new columns
print(simulated_data.head())


        Date  Energy_Consumption_kWh  Temperature_C  Electricity_Price_€/kWh  \
0 2023-01-01               14.363503      10.444106                 0.165706   
1 2023-01-02               28.767858      33.447623                 0.425830   
2 2023-01-03               23.299849      31.214026                 0.366079   
3 2023-01-04               19.966462       2.831645                 0.309226   
4 2023-01-05                8.900466      -2.225548                 0.243532   

   Month  Weekday  
0      1        6  
1      1        0  
2      1        1  
3      1        2  
4      1        3  


In [14]:
# Lag feature: the consumption value from the preceding row.
# NOTE(review): this is "previous day" only if rows are in date order with
# one row per day — confirm against the data source.
previous_consumption = simulated_data["Energy_Consumption_kWh"].shift(1)
simulated_data["Lag_1"] = previous_consumption

# Preview the result; the first row has no predecessor, so its Lag_1 is NaN
print(simulated_data.head())


        Date  Energy_Consumption_kWh  Temperature_C  Electricity_Price_€/kWh  \
0 2023-01-01               14.363503      10.444106                 0.165706   
1 2023-01-02               28.767858      33.447623                 0.425830   
2 2023-01-03               23.299849      31.214026                 0.366079   
3 2023-01-04               19.966462       2.831645                 0.309226   
4 2023-01-05                8.900466      -2.225548                 0.243532   

   Month  Weekday      Lag_1  
0      1        6        NaN  
1      1        0  14.363503  
2      1        1  28.767858  
3      1        2  23.299849  
4      1        3  19.966462  


In [15]:
from sklearn.model_selection import train_test_split

# Column to predict, and the columns used as model inputs
target = "Energy_Consumption_kWh"
features = ["Temperature_C", "Electricity_Price_€/kWh", "Month", "Weekday", "Lag_1"]

# Remove rows containing NaN (the lag feature leaves one in the first row)
simulated_data = simulated_data.dropna()

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): train_test_split shuffles rows by default, so this split is
# random rather than chronological — confirm that is intended given Lag_1.
X, y = simulated_data[features], simulated_data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the (rows, columns) shape of each split
print(f"Train set size: {X_train.shape}, Test set size: {X_test.shape}")


Train set size: (291, 5), Test set size: (73, 5)


In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Baseline: ordinary least squares fitted on the training split
# (fit returns the estimator itself, so it can be chained)
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out rows
y_pred = model.predict(X_test)

# Score the predictions against the true test targets
mae, mse = mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")


Mean Absolute Error (MAE): 6.79475326107562
Mean Squared Error (MSE): 57.73000283963881


In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Random Forest with 100 trees; the fixed seed makes the fit repeatable
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# Predict on the same held-out rows used for the linear model
rf_y_pred = rf_model.predict(X_test)

# Score the predictions against the true test targets
rf_mae, rf_mse = (
    mean_absolute_error(y_test, rf_y_pred),
    mean_squared_error(y_test, rf_y_pred),
)
print(f"Random Forest MAE: {rf_mae}")
print(f"Random Forest MSE: {rf_mse}")

Random Forest MAE: 6.7166111026135615
Random Forest MSE: 57.43476586593884


In [18]:
import joblib

# Persist the fitted Random Forest to disk (path is relative to the working directory)
joblib.dump(value=rf_model, filename="energy_consumption_model.pkl")

# Confirmation message; reached only if the dump above did not raise
print("Model saved as 'energy_consumption_model.pkl'")


Model saved as 'energy_consumption_model.pkl'


In [22]:
# One example observation; values are listed in the order of the model's
# feature columns.
new_data = [
    [
        15.0,  # Temperature_C
        0.25,  # Electricity_Price_€/kWh
        7,     # Month
        3,     # Weekday
        20.0,  # Lag_1
    ]
]

In [23]:
import joblib  # repeated here so this cell also runs without the save cell

# No earlier cell defines `loaded_model`, so load it here from the file
# written by joblib.dump. Without this the cell raises NameError on a
# fresh kernel (Restart & Run All).
# NOTE: joblib.load unpickles the file — only load model files you trust.
loaded_model = joblib.load("energy_consumption_model.pkl")

# Wrap the raw input in a DataFrame whose column names and order match the
# features the model was trained on.
new_data_df = pd.DataFrame(
    new_data,
    columns=["Temperature_C", "Electricity_Price_€/kWh", "Month", "Weekday", "Lag_1"],
)

# Predict; the result holds one value per input row, so show the first
prediction = loaded_model.predict(new_data_df)
print(f"Predicted Energy Consumption: {prediction[0]} kWh")


Predicted Energy Consumption: 17.076237105844108 kWh
