In [47]:
# Import Library
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import os
import pickle

# Read dataset
url = 'https://data.energystar.gov/resource/5xn2-dv4h.csv'
df = pd.read_csv(url)

df

Unnamed: 0,pd_id,brand_name,model_number,additional_model_information,upc,height_inches,width_inches,depth_inches,weight_lbs,cooling_capacity_btu_hour,...,primary_communication_module_device_brand_name_and_model_number,network_security_standards,network_standby_power_w,broadband_connection_needed_for_demand_response,direct_on_premises_open_standard_based_interconnection,date_available_on_market,date_certified,markets,energy_star_model_identifier,meets_most_efficient_criteria
0,2684132,GE Profile,PHNT12CCH1,,08469191798,12.8,19.7,28.0,80.0,12200,...,,,,,,2023-01-06T00:00:00.000,2023-10-26T00:00:00.000,United States,ES_1123206_PHNT12CCH1_10232023162410_537073,No
1,2765428,GE Profile,PWDV08W**#,,084691943297,13.6,18.5,19.0,43.2,8000,...,,,,,,2024-02-15T00:00:00.000,2023-11-23T00:00:00.000,United States,ES_1123206_PWDV08W**#_11202023141819_9556390,No
2,2765429,GE Profile,PWDV10W**#,,084691943303,13.6,18.5,19.0,45.1,10000,...,,,,,,2024-02-15T00:00:00.000,2023-11-23T00:00:00.000,United States,ES_1123206_PWDV10W**#_11202023142852_6974603,No
3,2765430,GE Profile,PWDV12W**#,,084691943310,14.7,19.0,21.7,49.9,12000,...,,,,,,2024-02-15T00:00:00.000,2023-11-23T00:00:00.000,United States,ES_1123206_PWDV12W**#_11202023143404_8143121,No
4,2765431,GE Profile,PWDV14W**#,,084691943327,14.7,19.0,21.7,55.3,14000,...,,,,,,2024-02-15T00:00:00.000,2023-11-23T00:00:00.000,United States,ES_1123206_PWDV14W**#_11202023143906_5548059,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,2703724,Vissani,VAWA12V4HWT,,687636008251,14.7,21.7,19.0,50.9,12000,...,,,,,,2023-10-31T00:00:00.000,2023-10-31T00:00:00.000,"United States, Canada",ES_0031912_VAWA12V4HWT_10312023171353_80188017,No
123,2703725,Vissani,VAWA14V4HWT,,687636008268,14.7,21.7,19.0,56.0,14000,...,,,,,,2023-10-31T00:00:00.000,2023-10-31T00:00:00.000,"United States, Canada",ES_0031912_VAWA14V4HWT_10312023171353_80188017,No
124,3425395,Windmill,08W2Wi,,860003069523,13.3,19.3,22.5,61.0,8000,...,,,,,,2024-03-12T00:00:00.000,2024-03-28T00:00:00.000,United States,ES_1148066_08W2Wi_03262024095555_980493,No
125,3425396,Windmill,10W2Wi,,860003069530,13.3,19.3,22.5,63.0,10000,...,,,,,,2024-03-12T00:00:00.000,2024-03-28T00:00:00.000,United States,ES_1148066_10W2Wi_03262024104916_9999811,No


In [48]:
df.columns

Index(['pd_id', 'brand_name', 'model_number', 'additional_model_information',
       'upc', 'height_inches', 'width_inches', 'depth_inches', 'weight_lbs',
       'cooling_capacity_btu_hour', 'voltage_volts', 'type',
       'installation_mounting_type', 'support_bracket', 'heating_mode',
       'casement_window', 'product_class', 'variable_speed_compressor',
       'low_noise', 'refrigerant_type', 'refrigerant_with_gwp',
       'combined_energy_efficiency_ratio_ceer',
       'percent_less_energy_use_than_us_federal_standard',
       'annual_energy_use_kwh_yr', 'connected_capable', 'connects_using',
       'communication_hardware_architecture', 'dr_protocol',
       'primary_communication_module_device_brand_name_and_model_number',
       'network_security_standards', 'network_standby_power_w',
       'broadband_connection_needed_for_demand_response',
       'direct_on_premises_open_standard_based_interconnection',
       'date_available_on_market', 'date_certified', 'markets',
       'e

In [49]:
# Fitur-fitur yang mungkin tidak penting untuk prediksi kWh/yr

irrelevant_features = [
    'pd_id', 'brand_name', 'model_number', 'additional_model_information',
    'upc', 'support_bracket', 'product_class', 'connected_capable', 'connects_using',
    'communication_hardware_architecture', 'dr_protocol',
    'primary_communication_module_device_brand_name_and_model_number', 'network_security_standards',
    'network_standby_power_w', 'broadband_connection_needed_for_demand_response',
    'direct_on_premises_open_standard_based_interconnection', 'date_available_on_market', 'date_certified',
    'markets', 'energy_star_model_identifier', 'meets_most_efficient_criteria'
]



# Menghapus fitur-fitur yang tidak relevan
df = df.drop(columns=irrelevant_features)

# Menghapus kolom yang seluruh isinya adalah NaN
df = df.dropna(axis=1, how='all')

df

Unnamed: 0,height_inches,width_inches,depth_inches,weight_lbs,cooling_capacity_btu_hour,voltage_volts,type,installation_mounting_type,heating_mode,variable_speed_compressor,low_noise,refrigerant_type,refrigerant_with_gwp,combined_energy_efficiency_ratio_ceer,percent_less_energy_use_than_us_federal_standard,annual_energy_use_kwh_yr
0,12.8,19.7,28.0,80.0,12200,115,Window,Straddles Windowsill,No,Yes,No,R-32,R-32 (GWP:675 | Lower GWP),14.7,35,622.4
1,13.6,18.5,19.0,43.2,8000,115,Window,Does Not Straddle Window or Windowsill,No,Yes,No,R-32,R-32 (GWP:675 | Lower GWP),15.0,38,400.0
2,13.6,18.5,19.0,45.1,10000,115,Window,Does Not Straddle Window or Windowsill,No,Yes,No,R-32,R-32 (GWP:675 | Lower GWP),15.0,38,500.0
3,14.7,19.0,21.7,49.9,12000,115,Window,Does Not Straddle Window or Windowsill,No,Yes,No,R-32,R-32 (GWP:675 | Lower GWP),15.0,38,600.0
4,14.7,19.0,21.7,55.3,14000,115,Window,Does Not Straddle Window or Windowsill,No,Yes,No,R-32,R-32 (GWP:675 | Lower GWP),15.0,40,700.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,14.7,21.7,19.0,50.9,12000,115,Window,Does Not Straddle Window or Windowsill,No,Yes,No,R-32,R-32 (GWP:675 | Lower GWP),15.0,38,600.0
123,14.7,21.7,19.0,56.0,14000,115,Window,Does Not Straddle Window or Windowsill,No,Yes,No,R-32,R-32 (GWP:675 | Lower GWP),15.0,40,700.0
124,13.3,19.3,22.5,61.0,8000,115,Window,Does Not Straddle Window or Windowsill,No,Yes,No,R-32,R-32 (GWP:675 | Lower GWP),15.0,38,400.0
125,13.3,19.3,22.5,63.0,10000,115,Window,Does Not Straddle Window or Windowsill,No,Yes,No,R-32,R-32 (GWP:675 | Lower GWP),15.0,38,500.0


In [50]:
df.columns

Index(['height_inches', 'width_inches', 'depth_inches', 'weight_lbs',
       'cooling_capacity_btu_hour', 'voltage_volts', 'type',
       'installation_mounting_type', 'heating_mode',
       'variable_speed_compressor', 'low_noise', 'refrigerant_type',
       'refrigerant_with_gwp', 'combined_energy_efficiency_ratio_ceer',
       'percent_less_energy_use_than_us_federal_standard',
       'annual_energy_use_kwh_yr'],
      dtype='object')

In [51]:
# Memisahkan kolom numerik dan kategorikal
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
categorical_cols = df.select_dtypes(exclude=['number']).columns.tolist()

# Scaling fitur numerik menggunakan StandardScaler
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Memisahkan dataset menjadi numerical_df dan categorical_df
numerical_df = df[numerical_cols]
categorical_df = df.drop(columns=numerical_cols)

# Menggabungkan kembali data yang sudah diproses
df_processed = pd.concat([numerical_df, categorical_df], axis=1)

In [52]:
df_processed

Unnamed: 0,height_inches,width_inches,depth_inches,weight_lbs,cooling_capacity_btu_hour,voltage_volts,combined_energy_efficiency_ratio_ceer,percent_less_energy_use_than_us_federal_standard,annual_energy_use_kwh_yr,type,installation_mounting_type,heating_mode,variable_speed_compressor,low_noise,refrigerant_type,refrigerant_with_gwp
0,-1.049677,-0.175190,2.009126,0.982694,0.191065,-0.337696,-0.349099,-1.153130,0.185442,Window,Straddles Windowsill,No,Yes,No,R-32,R-32 (GWP:675 | Lower GWP)
1,-0.431433,-0.651107,-1.120379,-1.053988,-0.901263,-0.337696,0.234264,-0.321042,-0.845310,Window,Does Not Straddle Window or Windowsill,No,Yes,No,R-32,R-32 (GWP:675 | Lower GWP)
2,-0.431433,-0.651107,-1.120379,-0.948833,-0.381107,-0.337696,0.234264,-0.321042,-0.381842,Window,Does Not Straddle Window or Windowsill,No,Yes,No,R-32,R-32 (GWP:675 | Lower GWP)
3,0.418654,-0.452808,-0.181528,-0.683179,0.139050,-0.337696,0.234264,-0.321042,0.081625,Window,Does Not Straddle Window or Windowsill,No,Yes,No,R-32,R-32 (GWP:675 | Lower GWP)
4,0.418654,-0.452808,-0.181528,-0.384318,0.659206,-0.337696,0.234264,0.233684,0.545092,Window,Does Not Straddle Window or Windowsill,No,Yes,No,R-32,R-32 (GWP:675 | Lower GWP)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,0.418654,0.618005,-1.120379,-0.627834,0.139050,-0.337696,0.234264,-0.321042,0.081625,Window,Does Not Straddle Window or Windowsill,No,Yes,No,R-32,R-32 (GWP:675 | Lower GWP)
123,0.418654,0.618005,-1.120379,-0.345577,0.659206,-0.337696,0.234264,0.233684,0.545092,Window,Does Not Straddle Window or Windowsill,No,Yes,No,R-32,R-32 (GWP:675 | Lower GWP)
124,-0.663274,-0.333829,0.096651,-0.068854,-0.901263,-0.337696,0.234264,-0.321042,-0.845310,Window,Does Not Straddle Window or Windowsill,No,Yes,No,R-32,R-32 (GWP:675 | Lower GWP)
125,-0.663274,-0.333829,0.096651,0.041835,-0.381107,-0.337696,0.234264,-0.321042,-0.381842,Window,Does Not Straddle Window or Windowsill,No,Yes,No,R-32,R-32 (GWP:675 | Lower GWP)


In [53]:
# Memisahkan fitur (X) dan label (y)
X = df_processed.drop(columns=['annual_energy_use_kwh_yr'])
y = df_processed['annual_energy_use_kwh_yr']

# Pembagian dataset menjadi data training dan testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Tampilkan ukuran data train dan test
print(f"Ukuran data train: {X_train.shape}, {y_train.shape}")
print(f"Ukuran data test: {X_test.shape}, {y_test.shape}")

Ukuran data train: (114, 15), (114,)
Ukuran data test: (13, 15), (13,)


In [54]:
# Inisialisasi model Decision Tree Regressor
dt_regressor = DecisionTreeRegressor(random_state=42)

# Melatih model Decision Tree Regressor
dt_regressor.fit(X_train, y_train)

# Prediksi dengan model yang dilatih
y_pred_dt = dt_regressor.predict(X_test)

# Evaluasi model Decision Tree Regressor
mse_dt = mean_squared_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)

# Menyimpan hasil evaluasi ke dalam DataFrame
dt_regression_results = pd.DataFrame({
    "Model": ["Decision Tree Regressor"],
    "Mean Squared Error": [mse_dt],
    "R^2 Score": [r2_dt],
})

# Menampilkan hasil evaluasi regresi
print("\n", dt_regression_results)

ValueError: could not convert string to float: 'Window'

In [None]:
# Inisialisasi model Random Forest Regressor
rf_regressor = RandomForestRegressor(random_state=42)

# Melatih model Random Forest Regressor
rf_regressor.fit(X_train, y_train)

# Prediksi dengan model yang dilatih
y_pred_rf = rf_regressor.predict(X_test)

# Evaluasi model Random Forest Regressor
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Menyimpan hasil evaluasi ke dalam DataFrame
rf_regression_results = pd.DataFrame({
    "Model": ["Random Forest Regressor"],
    "Mean Squared Error": [mse_rf],
    "R^2 Score": [r2_rf],
})

# Menampilkan hasil evaluasi regresi
print("\n", rf_regression_results)


                      Model  Mean Squared Error  R^2 Score
0  Random Forest Regressor            0.001076   0.998861


In [None]:
# Inisialisasi model Linear Regression
lr_regressor = LinearRegression()

# Melatih model Linear Regression
lr_regressor.fit(X_train, y_train)

# Prediksi dengan model yang dilatih
y_pred_lr = lr_regressor.predict(X_test)

# Evaluasi model Linear Regression
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Menyimpan hasil evaluasi ke dalam DataFrame
lr_regression_results = pd.DataFrame({
    "Model": ["Linear Regression"],
    "Mean Squared Error": [mse_lr],
    "R^2 Score": [r2_lr],
})

# Menampilkan hasil evaluasi regresi
print("\n", lr_regression_results)


                Model  Mean Squared Error  R^2 Score
0  Linear Regression            0.000055   0.999941


In [None]:
# Path untuk menyimpan model pickle
model_path = '../../../app/models-pickle/house-energy/air-conditioners.pkl'

# Pastikan direktori untuk menyimpan model sudah ada
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Simpan model ke file .pkl
with open(model_path, 'wb') as file:
    pickle.dump(lr_regressor, file)

print(f'Model Linear Regression disimpan ke {model_path}')

Model Linear Regression disimpan ke ../../../app/models-pickle/house-energy/air-conditioners.pkl
