In [1]:
import pandas as pd

In [2]:
df=pd.read_csv('../notebooks/data/test_energy_data.csv')

In [3]:
df.head()

Unnamed: 0,Building Type,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Day of Week,Energy Consumption
0,Residential,24563,15,4,28.52,Weekday,2865.57
1,Commercial,27583,56,23,23.07,Weekend,4283.8
2,Commercial,45313,4,44,33.56,Weekday,5067.83
3,Residential,41625,84,17,27.39,Weekend,4624.3
4,Residential,36720,58,47,17.08,Weekday,4820.59


In [4]:
df.dtypes

Building Type           object
Square Footage           int64
Number of Occupants      int64
Appliances Used          int64
Average Temperature    float64
Day of Week             object
Energy Consumption     float64
dtype: object

In [109]:
df.isnull().sum()

Building Type          0
Square Footage         0
Number of Occupants    0
Appliances Used        0
Average Temperature    0
Day of Week            0
Energy Consumption     0
dtype: int64

In [110]:
#no null data means the data is clean 
df.describe() 

Unnamed: 0,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Energy Consumption
count,100.0,100.0,100.0,100.0,100.0
mean,25881.92,47.23,26.97,22.0433,4187.5783
std,13711.075264,29.905526,14.237846,6.957951,832.55985
min,1161.0,2.0,1.0,10.4,2351.97
25%,14161.0,21.0,16.75,15.6825,3621.925
50%,27582.5,47.0,27.5,21.97,4249.39
75%,38109.5,73.0,39.25,27.4925,4797.175
max,49354.0,99.0,49.0,34.71,6042.56


In [111]:
df.duplicated()
#no duplicate data means the data is clean

0     False
1     False
2     False
3     False
4     False
      ...  
95    False
96    False
97    False
98    False
99    False
Length: 100, dtype: bool

In [112]:
df['Building Type'].unique()

array(['Residential', 'Commercial', 'Industrial'], dtype=object)

In [113]:
X=df.drop(columns=['Energy Consumption'])
Y=df['Energy Consumption']

In [114]:
X

Unnamed: 0,Building Type,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Day of Week
0,Residential,24563,15,4,28.52,Weekday
1,Commercial,27583,56,23,23.07,Weekend
2,Commercial,45313,4,44,33.56,Weekday
3,Residential,41625,84,17,27.39,Weekend
4,Residential,36720,58,47,17.08,Weekday
...,...,...,...,...,...,...
95,Commercial,1161,81,11,15.45,Weekend
96,Residential,37943,50,23,21.73,Weekend
97,Commercial,1558,27,29,16.86,Weekend
98,Industrial,2145,56,12,11.77,Weekend


In [115]:
num_columns = X.select_dtypes(exclude=['object']).columns
cat_columns = X.select_dtypes(include=['object']).columns

In [136]:
cat_columns

Index(['Building Type', 'Day of Week'], dtype='object')

In [116]:
num_columns

Index(['Square Footage', 'Number of Occupants', 'Appliances Used',
       'Average Temperature'],
      dtype='object')

In [117]:
cat_columns

Index(['Building Type', 'Day of Week'], dtype='object')

In [118]:
build_columns = ['Residential','Commercial','Industrial']
day_columns = ['Weekday','Weekend']

In [119]:
from sklearn.compose import ColumnTransformer   
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer        
from sklearn.preprocessing import StandardScaler,OrdinalEncoder

In [120]:
num_col= Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='mean')),
    ('scaler',StandardScaler())
])
cat_col= Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('encoder',OrdinalEncoder(categories=[build_columns,day_columns])),
    ('scaler',StandardScaler())
])
preprocessor=ColumnTransformer(
    transformers=[
        ('num_col',num_col,num_columns),
        ('cat_col',cat_col,cat_columns)
    ]
)

In [121]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

In [122]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())

X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [123]:
X_train.head()
X_test.head()

Unnamed: 0,num_col__Square Footage,num_col__Number of Occupants,num_col__Appliances Used,num_col__Average Temperature,cat_col__Building Type,cat_col__Day of Week
0,1.608627,0.171442,0.439722,1.837371,1.204321,-1.193416
1,-1.266406,0.50273,0.729251,1.890138,-1.145573,0.837931
2,-1.414575,0.038926,1.67022,1.541872,1.204321,0.837931
3,-1.448677,1.629111,1.163544,-1.064852,-1.145573,0.837931
4,0.844264,-0.391749,0.439722,-1.658865,-1.145573,0.837931


In [124]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [125]:
linear=LinearRegression()
linear.fit(X_train,Y_train)

In [126]:
linear.intercept_

np.float64(4151.752375)

In [127]:
linear.coef_

array([680.30394623, 301.84930387, 276.30889786, -33.16500357,
       425.54819455, -24.61619159])

In [128]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

#Linear Regression
linear=LinearRegression()
linear.fit(X_train,Y_train)
y_pred=linear.predict(X_test)
mae, rmse, r2_square=evaluate_model(Y_test,y_pred)
mae,rmse,r2_square*100


(0.013548239607621327, np.float64(0.015809323127456216), 99.99999996252691)

In [129]:
#Lasso Model Regression
#L1 regularization for feature selection
lasso=Lasso()
lasso.fit(X_train,Y_train)
y_pred=lasso.predict(X_test)
mae, rmse, r2_square=evaluate_model(Y_test,y_pred)  
mae,rmse,r2_square*100

(2.1591062787992996, np.float64(2.514793988890798), 99.9990518047174)

In [130]:
#Ridge Model Regression regularization
#L2 regularization for OverFitting
ridge=Ridge()
ridge.fit(X_train,Y_train)
y_pred=ridge.predict(X_test)
mae, rmse, r2_square=evaluate_model(Y_test,y_pred)  
mae,rmse,r2_square*100

(10.906548622243054, np.float64(12.610050632166127), 99.97615888163655)

In [131]:
#elastic Model Regression both regularization
#L1 and L2
elastic=ElasticNet()
elastic.fit(X_train,Y_train)
y_pred=elastic.predict(X_test)
mae, rmse, r2_square=evaluate_model(Y_test,y_pred)  
mae,rmse,r2_square*100

(271.49820026020365, np.float64(315.01993392914), 85.12116131206248)

In [132]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [133]:
## Train multiple models

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,Y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(Y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')


LinearRegression
Model Training Performance
RMSE: 0.015809323127456216
MAE: 0.013548239607621327
R2 score 99.99999996252691


Lasso
Model Training Performance
RMSE: 2.514793988890798
MAE: 2.1591062787992996
R2 score 99.9990518047174


Ridge
Model Training Performance
RMSE: 12.610050632166127
MAE: 10.906548622243054
R2 score 99.97615888163655


Elasticnet
Model Training Performance
RMSE: 315.01993392914
MAE: 271.49820026020365
R2 score 85.12116131206248




In [134]:
#LinearRegression is the best mmodel for this dataset
#Model Training Performance
#RMSE: 0.015809323127456216
#MAE: 0.013548239607621327
#R2 score 99.99999996252691