# Model for predicting energy consumption

In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OrdinalEncoder
from category_encoders import TargetEncoder


In [10]:
df = pd.read_parquet(r"E:\BA-Project\processed_data\cleaned_data.parquet")
df.head()

Unnamed: 0,registryID,fuelTypeStd,energyMJq0,energyMJq2,energyMJq3,throughputTonneQ0,throughputTonneQ2,throughputTonneQ3,ghgsTonneCO2eQ0,ghgsTonneCO2eQ2,...,weeklyOpHoursLow_q3,weeklyOpHoursLow_q4,weeklyOpHours_q1,weeklyOpHours_q2,weeklyOpHours_q3,weeklyOpHours_q4,weeklyOpHoursHigh_q1,weeklyOpHoursHigh_q2,weeklyOpHoursHigh_q3,weeklyOpHoursHigh_q4
0,110041038098.0,diesel,0.000303,3.132848,5.819856,196.2375,419.0191,588.8033,2.1335e-08,0.00022,...,50.992,51.544,64.2,69.1,67.7,64.3,87.704,91.192,82.444,81.332
1,110070522664.0,naturalGas,0.000325,1.18416,7.86041,196.2375,419.0191,588.8033,1.634686e-08,6e-05,...,90.728,83.288,93.2,103.2,111.7,102.3,109.271996,123.78,132.672,121.312004
2,110000426913.0,naturalGas,0.000332,76.074302,76.522121,5.449402e-09,5.449402e-09,5.449402e-09,1.851187e-06,2e-06,...,43.591995,41.556,78.2,70.6,67.7,78.6,102.7,95.296,91.808,115.644
3,110042051089.0,naturalGas,0.000333,2797.245096,2874.676155,196.2375,419.0191,588.8033,1.674315e-08,0.140813,...,50.992,51.544,64.2,69.1,67.7,64.3,87.704,91.192,82.444,81.332
4,110000427538.0,lpgHGL,0.000346,76.382437,77.648,196.2375,419.0191,588.8033,5.426127e-06,5e-06,...,40.552002,42.796,54.9,52.7,52.9,52.4,69.012,64.264,65.248,62.004


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 574294 entries, 0 to 574295
Data columns (total 32 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   fuelTypeStd           574294 non-null  float64
 1   energyMJq0            574294 non-null  float64
 2   energyMJq2            574294 non-null  float64
 3   energyMJq3            574294 non-null  float64
 4   throughputTonneQ0     574294 non-null  float64
 5   throughputTonneQ2     574294 non-null  float64
 6   throughputTonneQ3     574294 non-null  float64
 7   ghgsTonneCO2eQ0       574294 non-null  float64
 8   ghgsTonneCO2eQ2       574294 non-null  float64
 9   ghgsTonneCO2eQ3       574294 non-null  float64
 10  unitTypeStd           574294 non-null  float64
 11  energyEstimateSource  574294 non-null  float64
 12  ghgsEstimateSource    574294 non-null  float64
 13  siteTypeName          574294 non-null  float64
 14  postalCode            574294 non-null  float64
 15  state

In [11]:
cat_cols = ['registryID', 'fuelTypeStd', 'unitTypeStd', 'energyEstimateSource',
            'ghgsEstimateSource', 'siteTypeName', 'name', 'locationAddress',
            'cityName', 'countyName', 'stateCode', 'naicsCode']

df[cat_cols] = df[cat_cols].astype(str)


In [12]:
cols_to_drop = [
    "registryID", "name", "locationAddress", "cityName", "countyName"
]

df = df.drop(columns=cols_to_drop)

In [13]:
low_cardinality = [
    "fuelTypeStd", "unitTypeStd", "energyEstimateSource",
    "ghgsEstimateSource", "siteTypeName", "stateCode"
]

high_cardinality = ["naicsCode"]


In [15]:
# Ordinal encode low-cardinality columns
ord_enc = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1
)
df[low_cardinality] = ord_enc.fit_transform(df[low_cardinality])

# Target encode NAICS
tgt_enc = TargetEncoder()
df[high_cardinality] = tgt_enc.fit_transform(df[high_cardinality], df["energyMJq2"])


In [17]:
X = df.drop(columns=['energyMJq2', 'energyMJq0', 'energyMJq3', 'ghgsTonneCO2eQ0', 'ghgsTonneCO2eQ2', 'ghgsTonneCO2eQ3'])
y = df['energyMJq2']


In [23]:
import numpy as np

# Check for infinite values
print("Infinite values per column:")
print(np.isinf(df.select_dtypes(include=[float, int])).sum())

# Check extremely large values
print("\nMax values per numeric column:")
print(df.select_dtypes(include=[float, int]).max())


Infinite values per column:
fuelTypeStd             0
energyMJq0              0
energyMJq2              0
energyMJq3              0
throughputTonneQ0       0
throughputTonneQ2       0
throughputTonneQ3       0
ghgsTonneCO2eQ0         0
ghgsTonneCO2eQ2         0
ghgsTonneCO2eQ3         0
unitTypeStd             0
energyEstimateSource    0
ghgsEstimateSource      0
siteTypeName            0
postalCode              0
stateCode               0
countyFIPS              0
latitude                0
longitude               0
naicsCode               0
weeklyOpHoursLow_q1     0
weeklyOpHoursLow_q2     0
weeklyOpHoursLow_q3     0
weeklyOpHoursLow_q4     0
weeklyOpHours_q1        0
weeklyOpHours_q2        0
weeklyOpHours_q3        0
weeklyOpHours_q4        0
weeklyOpHoursHigh_q1    0
weeklyOpHoursHigh_q2    0
weeklyOpHoursHigh_q3    0
weeklyOpHoursHigh_q4    0
dtype: int64

Max values per numeric column:
fuelTypeStd             9.000000e+00
energyMJq0              7.122873e+10
energyMJq2           

In [29]:
for col in num_cols:
    upper = df[col].quantile(0.99)
    df[col] = df[col].clip(upper=upper)


In [25]:
df = df.replace([np.inf, -np.inf], np.nan)
num_cols = df.select_dtypes(include=[float, int]).columns
df = df.dropna(subset=num_cols)


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
df[num_cols] = df[num_cols].astype("float64")


In [34]:
model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(X_train, y_train)

ValueError: Input X contains infinity or a value too large for dtype('float32').