In [1]:
# Importing the required libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import pickle

In [2]:
# Read the training data from a CSV file (path is relative to the working directory).
df = pd.read_csv('train_energy_data.csv')
# Bare expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,Building Type,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Day of Week,Energy Consumption
0,Residential,7063,76,10,29.84,Weekday,2713.95
1,Commercial,44372,66,45,16.72,Weekday,5744.99
2,Industrial,19255,37,17,14.30,Weekend,4101.24
3,Residential,13265,14,41,32.82,Weekday,3009.14
4,Commercial,13375,26,18,11.92,Weekday,3279.17
...,...,...,...,...,...,...,...
995,Residential,14419,68,44,23.95,Weekend,3661.21
996,Industrial,12194,7,22,14.67,Weekend,3546.34
997,Commercial,39562,88,20,32.18,Weekday,5147.21
998,Residential,8348,67,37,16.48,Weekend,3244.98


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Energy Consumption
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,25462.388,48.372,25.606,22.61139,4166.25257
std,14294.554,29.061972,14.105166,7.139943,933.313064
min,560.0,1.0,1.0,10.05,1683.95
25%,13169.75,22.0,13.0,16.475,3509.4825
50%,25477.0,47.0,26.0,22.815,4175.73
75%,37446.25,73.25,38.0,28.85,4863.85
max,49997.0,99.0,49.0,34.99,6530.6


In [4]:
# Print each column's non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Building Type        1000 non-null   object 
 1   Square Footage       1000 non-null   int64  
 2   Number of Occupants  1000 non-null   int64  
 3   Appliances Used      1000 non-null   int64  
 4   Average Temperature  1000 non-null   float64
 5   Day of Week          1000 non-null   object 
 6   Energy Consumption   1000 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 54.8+ KB


# Data Preprocessing


In [5]:
# Instantiate scikit-learn's LabelEncoder, which maps each distinct label to an integer code.
label_encoder = LabelEncoder()

In [6]:
# Encode 'Building Type' and 'Day of Week' as integer codes.
#
# Each column gets its own LabelEncoder, stored in `label_encoders`, so the
# label -> code mapping of every column survives this cell and can be reused
# (`transform`) or reversed (`inverse_transform`) later. Refitting a single
# shared encoder would keep only the mapping of the last column fitted.
#
# NOTE(review): LabelEncoder is documented as an encoder for target values and
# assigns codes by sorted label order; a linear model will treat those codes as
# ordered magnitudes. One-hot encoding may be more appropriate here — TODO
# confirm, since switching would change the model's input columns.
label_encoders = {}
for column in ('Building Type', 'Day of Week'):
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder


In [7]:
# Columns used as model inputs: four numeric measurements followed by the
# two integer-encoded categorical columns.
numerical_columns = [
    'Square Footage',
    'Number of Occupants',
    'Appliances Used',
    'Average Temperature',
    'Building Type',
    'Day of Week',
]

# Keep only those columns, in the order listed, as the feature matrix.
data = df[numerical_columns]

In [8]:
# Preview the first five rows of the feature matrix.
data.head()

Unnamed: 0,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Building Type,Day of Week
0,7063,76,10,29.84,2,0
1,44372,66,45,16.72,0,0
2,19255,37,17,14.3,1,1
3,13265,14,41,32.82,2,0
4,13375,26,18,11.92,0,0


In [9]:
# The value to predict: the 'Energy Consumption' column, all rows.
target = df.loc[:, 'Energy Consumption']

In [10]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Linear regression with default settings, fitted on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [12]:
# Predict on the held-out split, then score the predictions against the
# true values of that split.
y_pred = model.predict(X=X_test)

r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)


In [13]:
# Report both evaluation metrics, one per line, in a fixed order.
for metric_label, metric_value in (
    ("Mean Squared Error (MSE):", mse),
    ("R-squared (R2):", r2),
):
    print(metric_label, metric_value)


Mean Squared Error (MSE): 126059.23184229508
R-squared (R2): 0.8451931910306251


In [14]:
# Persist the fitted regressor to 'energy_model.pkl' in the working directory.
# NOTE(review): only the regressor is written; the integer encodings applied
# to 'Building Type' and 'Day of Week' are not stored with it, so any code
# loading this file must reproduce the same encoding — confirm with consumers.
with open('energy_model.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(model))