In [13]:
# Import Library
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import os
import pickle

# Read dataset
url = 'https://data.energystar.gov/resource/5xn2-dv4h.csv'
df = pd.read_csv(url)

df

In [None]:
df.columns

Index(['pd_id', 'brand_name', 'model_number', 'additional_model_information',
       'upc', 'height_inches', 'width_inches', 'depth_inches', 'weight_lbs',
       'cooling_capacity_btu_hour', 'voltage_volts', 'type',
       'installation_mounting_type', 'support_bracket', 'heating_mode',
       'casement_window', 'product_class', 'variable_speed_compressor',
       'low_noise', 'refrigerant_type', 'refrigerant_with_gwp',
       'combined_energy_efficiency_ratio_ceer',
       'percent_less_energy_use_than_us_federal_standard',
       'annual_energy_use_kwh_yr', 'connected_capable', 'connects_using',
       'communication_hardware_architecture', 'dr_protocol',
       'primary_communication_module_device_brand_name_and_model_number',
       'network_security_standards', 'network_standby_power_w',
       'broadband_connection_needed_for_demand_response',
       'direct_on_premises_open_standard_based_interconnection',
       'date_available_on_market', 'date_certified', 'markets',
       'e

In [None]:
# Fitur-fitur yang mungkin tidak penting untuk prediksi kWh/yr

irrelevant_features = [
    'pd_id', 'brand_name', 'model_number', 'additional_model_information',
    'upc', 'support_bracket', 'product_class', 'connected_capable', 'connects_using',
    'communication_hardware_architecture', 'dr_protocol',
    'primary_communication_module_device_brand_name_and_model_number', 'network_security_standards',
    'network_standby_power_w', 'broadband_connection_needed_for_demand_response',
    'direct_on_premises_open_standard_based_interconnection', 'date_available_on_market', 'date_certified',
    'markets', 'energy_star_model_identifier', 'meets_most_efficient_criteria', 'percent_less_energy_use_than_us_federal_standard', 'refrigerant_type',
       'refrigerant_with_gwp', 'combined_energy_efficiency_ratio_ceer', 'variable_speed_compressor'
]



# Menghapus fitur-fitur yang tidak relevan
df = df.drop(columns=irrelevant_features)

# Menghapus kolom yang seluruh isinya adalah NaN
df = df.dropna(axis=1, how='all')

df

Unnamed: 0,height_inches,width_inches,depth_inches,weight_lbs,cooling_capacity_btu_hour,voltage_volts,type,installation_mounting_type,heating_mode,low_noise,annual_energy_use_kwh_yr
0,12.8,19.7,28.0,80.0,12200,115,Window,Straddles Windowsill,No,No,622.4
1,13.6,18.5,19.0,43.2,8000,115,Window,Does Not Straddle Window or Windowsill,No,No,400.0
2,13.6,18.5,19.0,45.1,10000,115,Window,Does Not Straddle Window or Windowsill,No,No,500.0
3,14.7,19.0,21.7,49.9,12000,115,Window,Does Not Straddle Window or Windowsill,No,No,600.0
4,14.7,19.0,21.7,55.3,14000,115,Window,Does Not Straddle Window or Windowsill,No,No,700.0
...,...,...,...,...,...,...,...,...,...,...,...
122,14.7,21.7,19.0,50.9,12000,115,Window,Does Not Straddle Window or Windowsill,No,No,600.0
123,14.7,21.7,19.0,56.0,14000,115,Window,Does Not Straddle Window or Windowsill,No,No,700.0
124,13.3,19.3,22.5,61.0,8000,115,Window,Does Not Straddle Window or Windowsill,No,No,400.0
125,13.3,19.3,22.5,63.0,10000,115,Window,Does Not Straddle Window or Windowsill,No,No,500.0


In [None]:
df.columns

Index(['height_inches', 'width_inches', 'depth_inches', 'weight_lbs',
       'cooling_capacity_btu_hour', 'voltage_volts', 'type',
       'installation_mounting_type', 'heating_mode', 'low_noise',
       'annual_energy_use_kwh_yr'],
      dtype='object')

In [None]:
# Memisahkan kolom numerik dan kategorikal
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
categorical_cols = df.select_dtypes(exclude=['number']).columns.tolist()

# Memisahkan dataset menjadi numerical_df dan categorical_df
numerical_df = df[numerical_cols]
categorical_df = df.drop(columns=numerical_cols)

# Menggabungkan kembali data yang sudah diproses
df_processed = pd.concat([numerical_df, categorical_df], axis=1)

In [None]:
df_processed

Unnamed: 0,height_inches,width_inches,depth_inches,weight_lbs,cooling_capacity_btu_hour,voltage_volts,annual_energy_use_kwh_yr,type,installation_mounting_type,heating_mode,low_noise
0,-1.049677,-0.175190,2.009126,0.982694,0.191065,-0.337696,0.185442,Window,Straddles Windowsill,No,No
1,-0.431433,-0.651107,-1.120379,-1.053988,-0.901263,-0.337696,-0.845310,Window,Does Not Straddle Window or Windowsill,No,No
2,-0.431433,-0.651107,-1.120379,-0.948833,-0.381107,-0.337696,-0.381842,Window,Does Not Straddle Window or Windowsill,No,No
3,0.418654,-0.452808,-0.181528,-0.683179,0.139050,-0.337696,0.081625,Window,Does Not Straddle Window or Windowsill,No,No
4,0.418654,-0.452808,-0.181528,-0.384318,0.659206,-0.337696,0.545092,Window,Does Not Straddle Window or Windowsill,No,No
...,...,...,...,...,...,...,...,...,...,...,...
122,0.418654,0.618005,-1.120379,-0.627834,0.139050,-0.337696,0.081625,Window,Does Not Straddle Window or Windowsill,No,No
123,0.418654,0.618005,-1.120379,-0.345577,0.659206,-0.337696,0.545092,Window,Does Not Straddle Window or Windowsill,No,No
124,-0.663274,-0.333829,0.096651,-0.068854,-0.901263,-0.337696,-0.845310,Window,Does Not Straddle Window or Windowsill,No,No
125,-0.663274,-0.333829,0.096651,0.041835,-0.381107,-0.337696,-0.381842,Window,Does Not Straddle Window or Windowsill,No,No


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Memisahkan fitur (X) dan label (y) dari df yang diproses
X = df.drop(columns=['annual_energy_use_kwh_yr'])
y = df['annual_energy_use_kwh_yr']

# Tentukan kolom numerik dan kategorikal
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Pembagian dataset menjadi data training dan testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Gabungkan transformer untuk numerical dan categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols)
    ])




In [None]:
df.columns

Index(['height_inches', 'width_inches', 'depth_inches', 'weight_lbs',
       'cooling_capacity_btu_hour', 'voltage_volts', 'type',
       'installation_mounting_type', 'heating_mode', 'low_noise',
       'annual_energy_use_kwh_yr'],
      dtype='object')

In [None]:
# Inisialisasi model Decision Tree Regressor
dt_regressor = DecisionTreeRegressor(random_state=42)

# Gabungkan preprocessor dengan model dalam satu pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', dt_regressor)])

# Latih model
pipeline.fit(X_train, y_train)

# Evaluasi model
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R2 Score: {r2}')


Mean Squared Error: 0.765101179905741
R2 Score: 0.19020544381289928


In [None]:
df.columns

Index(['height_inches', 'width_inches', 'depth_inches', 'weight_lbs',
       'cooling_capacity_btu_hour', 'voltage_volts', 'type',
       'installation_mounting_type', 'heating_mode', 'low_noise',
       'annual_energy_use_kwh_yr'],
      dtype='object')

In [None]:
df

Unnamed: 0,height_inches,width_inches,depth_inches,weight_lbs,cooling_capacity_btu_hour,voltage_volts,type,installation_mounting_type,heating_mode,low_noise,annual_energy_use_kwh_yr
0,-1.049677,-0.175190,2.009126,0.982694,0.191065,-0.337696,Window,Straddles Windowsill,No,No,0.185442
1,-0.431433,-0.651107,-1.120379,-1.053988,-0.901263,-0.337696,Window,Does Not Straddle Window or Windowsill,No,No,-0.845310
2,-0.431433,-0.651107,-1.120379,-0.948833,-0.381107,-0.337696,Window,Does Not Straddle Window or Windowsill,No,No,-0.381842
3,0.418654,-0.452808,-0.181528,-0.683179,0.139050,-0.337696,Window,Does Not Straddle Window or Windowsill,No,No,0.081625
4,0.418654,-0.452808,-0.181528,-0.384318,0.659206,-0.337696,Window,Does Not Straddle Window or Windowsill,No,No,0.545092
...,...,...,...,...,...,...,...,...,...,...,...
122,0.418654,0.618005,-1.120379,-0.627834,0.139050,-0.337696,Window,Does Not Straddle Window or Windowsill,No,No,0.081625
123,0.418654,0.618005,-1.120379,-0.345577,0.659206,-0.337696,Window,Does Not Straddle Window or Windowsill,No,No,0.545092
124,-0.663274,-0.333829,0.096651,-0.068854,-0.901263,-0.337696,Window,Does Not Straddle Window or Windowsill,No,No,-0.845310
125,-0.663274,-0.333829,0.096651,0.041835,-0.381107,-0.337696,Window,Does Not Straddle Window or Windowsill,No,No,-0.381842


In [None]:
# Path untuk menyimpan model pickle
model_path = '../../../app/models-pickle/house-energy/air-conditioners.pkl'

# Pastikan direktori untuk menyimpan model sudah ada
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Simpan pipeline (termasuk preprocessor dan regressor) ke file .pkl
with open(model_path, 'wb') as file:
    pickle.dump(pipeline, file)

print(f'Model Decision Tree Regressor disimpan ke {model_path}')

Model Decision Tree Regressor disimpan ke ../../../app/models-pickle/house-energy/air-conditioners.pkl
