In [1]:
# Import packages

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

In [2]:
# Read the CSV at data/df_regression.csv into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='data/df_regression.csv')

In [5]:
# Dimensions of the loaded data as a (rows, columns) tuple.
df.shape

(388, 9)

In [6]:
# Share of missing entries per column, expressed as a percentage:
# count the nulls in each column, divide by the row count, scale to 100.
missing_percentage = df.isna().sum().div(len(df)).mul(100)
missing_percentage

distance        0.000000
consume         0.000000
speed           0.000000
temp_inside     3.092784
temp_outside    0.000000
AC              0.000000
rain            0.000000
sun             0.000000
gas_SP98        0.000000
dtype: float64

In [7]:
# We replace the missing values of the variable temp_inside with its mean value.
# The filled column is assigned back to df rather than calling
# fillna(..., inplace=True) on the df['temp_inside'] selection: that chained
# in-place form raises a FutureWarning in pandas 2.x and does not modify df
# when Copy-on-Write is enabled.
# NOTE(review): the mean is computed on the full dataset, before the
# train/test split, so test rows influence the imputed value.

average_temp_inside = df['temp_inside'].mean()
df['temp_inside'] = df['temp_inside'].fillna(average_temp_inside)

In [11]:
# Split the dataframe into features/target and then into train/test subsets

from sklearn.model_selection import train_test_split

y = df['consume']  # Target variable
X = df.drop(columns='consume')  # Features: every column except the target

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [12]:
# Normalize the features with min-max scaling

from sklearn.preprocessing import MinMaxScaler

# Learn the scaling parameters from the training features only...
scaler = MinMaxScaler()
scaler.fit(X_train)

# ...then apply that same fitted scaling to both subsets
X_train_normalized = scaler.transform(X_train)
X_test_normalized = scaler.transform(X_test)

In [13]:
# Linear regression

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Build the model and train it on the scaled training features
regression_model = LinearRegression().fit(X_train_normalized, y_train)

# Predict the target for the scaled test features
y_pred = regression_model.predict(X_test_normalized)

# Score the predictions against the true test targets
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

Mean Squared Error (MSE): 0.8216359860810027
R-squared (R2): 0.09442245281893835


In [19]:
# Decision tree

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Create a Decision Tree Regressor.
# A fixed random_state makes the fitted tree (and the metrics below)
# reproducible across runs; without it, ties between equally good splits
# are broken randomly and the scores can change from run to run.
tree_regressor = DecisionTreeRegressor(random_state=42)

# Fit the model to the training data
tree_regressor.fit(X_train_normalized, y_train)

# Make predictions on the test data (computed once)
tree_predictions = tree_regressor.predict(X_test_normalized)

# Calculate MSE, RMSE, and R2 score
tree_mse = mean_squared_error(y_test, tree_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_r2 = r2_score(y_test, tree_predictions)

print("Decision Tree Regression Metrics:")
print(f"Mean Squared Error (MSE): {tree_mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {tree_rmse:.2f}")
print(f"R-squared (R2) Score: {tree_r2:.2f}")

Decision Tree Regression Metrics:
Mean Squared Error (MSE): 0.83
Root Mean Squared Error (RMSE): 0.91
R-squared (R2) Score: 0.08


In [36]:
# Random forest

from sklearn.ensemble import RandomForestRegressor

# Create a Random Forest Regressor (1000 trees, seeded for reproducibility)
forest_regressor = RandomForestRegressor(n_estimators=1000, random_state=42)

# Fit the model to the training data
forest_regressor.fit(X_train_normalized, y_train)

# Make predictions on the test data.
# Computed once: the original cell repeated this call, running every test
# row through all 1000 trees a second time for an identical result.
forest_predictions = forest_regressor.predict(X_test_normalized)

# Calculate MSE, RMSE, and R2 score
forest_mse = mean_squared_error(y_test, forest_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_r2 = r2_score(y_test, forest_predictions)

print("Random Forest Regression Metrics:")
print(f"Mean Squared Error (MSE): {forest_mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {forest_rmse:.2f}")
print(f"R-squared (R2) Score: {forest_r2:.2f}")

Random Forest Regression Metrics:
Mean Squared Error (MSE): 0.42
Root Mean Squared Error (RMSE): 0.65
R-squared (R2) Score: 0.53


In [17]:
# XGBoost

import xgboost as xgb

# Create an XGBoost Regressor with a squared-error objective, seeded for
# reproducibility
xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Fit the model to the training data
xgb_regressor.fit(X_train_normalized, y_train)

# Make predictions on the test data (computed once; the original cell
# repeated this call and discarded the first result)
xgb_predictions = xgb_regressor.predict(X_test_normalized)

# Calculate MSE, RMSE, and R2 score
xgb_mse = mean_squared_error(y_test, xgb_predictions)
xgb_rmse = np.sqrt(xgb_mse)
xgb_r2 = r2_score(y_test, xgb_predictions)

print("XGBoost Regression Metrics:")
print(f"Mean Squared Error (MSE): {xgb_mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {xgb_rmse:.2f}")
print(f"R-squared (R2) Score: {xgb_r2:.2f}")

XGBoost Regression Metrics:
Mean Squared Error (MSE): 0.52
Root Mean Squared Error (RMSE): 0.72
R-squared (R2) Score: 0.43


In [18]:
# K-Nearest Neighbors

from sklearn.neighbors import KNeighborsRegressor

# Create a K-Nearest Neighbors Regressor that uses the 5 nearest neighbours
knn_regressor = KNeighborsRegressor(n_neighbors=5)

# Fit the model to the training data
knn_regressor.fit(X_train_normalized, y_train)

# Make predictions on the test data (computed once; the original cell
# repeated this neighbour search and discarded the first result)
knn_predictions = knn_regressor.predict(X_test_normalized)

# Calculate MSE, RMSE, and R2 score
knn_mse = mean_squared_error(y_test, knn_predictions)
knn_rmse = np.sqrt(knn_mse)
knn_r2 = r2_score(y_test, knn_predictions)

print("K-Nearest Neighbors (KNN) Regression Metrics:")
print(f"Mean Squared Error (MSE): {knn_mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {knn_rmse:.2f}")
print(f"R-squared (R2) Score: {knn_r2:.2f}")

K-Nearest Neighbors (KNN) Regression Metrics:
Mean Squared Error (MSE): 0.84
Root Mean Squared Error (RMSE): 0.92
R-squared (R2) Score: 0.07


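Of the five models compared, the random forest is the best performer: it has the lowest MSE (0.42) and RMSE (0.65) and the highest R-squared score (0.53). Even so, its predictive performance is still modest, since it explains only about half of the variance in fuel consumption on the test set.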