In [None]:
# Import Library
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Read Dataset
url = "https://raw.githubusercontent.com/josgiv/home-appliance-dataset/master/Dishwashers.csv"
df = pd.read_csv(url)

df

Unnamed: 0,ENERGY STAR Unique ID,Brand Name,Model Number,Additional Model Information,UPC,Type,Width (inches),Depth (inches),Capacity - Maximum Number of Place Settings,Soil-Sensing Capability,...,Connected Capable,Connects Using,Communication Hardware Architecture,DR Protocol,Direct on-premises Open-standard Based Interconnection,Date Available On Market,Date Certified,Markets,CB Model Identifier,Meets ENERGY STAR Most Efficient 2024 Criteria
0,2732683,Dacor,DDW24G9000AP,,887276819853,Standard,23.5,22.8,14,Yes,...,No,,,,,03/22/2024,11/14/2023,"United States, Canada",ES_1016518_DDW24G9000AP_11142023105452_80188705,Yes
1,2557713,Fisher & Paykel,DD24DTX6I1,",,Double drawer tall tub, ss tub, integrated",822843821624,Standard,23.6,22.5,14,Yes,...,No,,,,,04/14/2022,07/12/2023,"United States, Canada",ES_31708_DD24DTX6I1_052620211017763_6070550,Yes
2,2557712,Fisher & Paykel,DD24DTX6PX1,",,Double drawer tall tub, ss tub, classic, ss ...",822843823970,Standard,23.6,22.5,14,Yes,...,No,,,,,04/14/2022,05/14/2024,"United States, Canada",ES_31708_DD24DTX6PX1_012720221807443_3125940,Yes
3,2403740,Asko,DBI364*,,3838782650635,Standard,24.0,21.8,16,No,...,No,,,,,02/01/2023,10/21/2022,"United States, Canada",ES_1123023_DBI364*_10272022115247_80145987,Yes
4,2403737,Asko,DBI564*,,3838782628504;3838782639548;3838782641725;3838...,Standard,24.0,21.8,16,No,...,No,,,,,02/01/2023,10/21/2022,"United States, Canada",ES_1123023_DBI564*_10272022115247_80145987,Yes
5,2403735,Asko,DBI565*,,3838782627484;3838782628443;3838782639074;3838...,Standard,24.0,21.8,16,No,...,No,,,,,02/01/2023,10/21/2022,"United States, Canada",ES_1123023_DBI565*_10272022115247_80145987,Yes
6,2403743,Asko,DBI776*,,3838782650659,Standard,24.0,21.8,17,No,...,No,,,,,02/01/2023,10/21/2022,"United States, Canada",ES_1123023_DBI776*_10272022115247_80145987,Yes
7,2403738,Asko,DFI564*,,3838782627453;3838782627491,Standard,24.0,21.8,16,No,...,No,,,,,02/01/2023,10/21/2022,"United States, Canada",ES_1123023_DFI564*_10272022115247_80145987,Yes
8,2403736,Asko,DFI565*,,3838782628436;3838782628542;3838782647833,Standard,24.0,21.8,16,No,...,No,,,,,02/01/2023,10/21/2022,"United States, Canada",ES_1123023_DFI565*_10272022115247_80145987,Yes
9,2403742,Asko,DFI776*,,3838782634703,Standard,24.0,21.8,17,No,...,No,,,,,02/01/2023,10/21/2022,"United States, Canada",ES_1123023_DFI776*_10272022115247_80145987,Yes


In [None]:
# Fitur-fitur yang mungkin tidak penting untuk prediksi kWh/yr
irrelevant_features = ['ENERGY STAR Unique ID', 'Brand Name', 'Model Number',
                       'Additional Model Information', 'UPC', 'CB Model Identifier',
                       'Date Available On Market', 'Markets',
                       'Meets ENERGY STAR Most Efficient 2024 Criteria',
                       'Connected Capable', 'Connects Using', 'Communication Hardware Architecture',
                       'DR Protocol', 'Direct on-premises Open-standard Based Interconnection']

# Menghapus fitur-fitur yang tidak relevan
df = df.drop(columns=irrelevant_features)

# Menghapus kolom yang seluruh isinya adalah NaN
df = df.dropna(axis=1, how='all')

df

Unnamed: 0,Type,Width (inches),Depth (inches),Capacity - Maximum Number of Place Settings,Soil-Sensing Capability,Tub Material,Drying Method,Additional Product Features,Annual Energy Use (kWh/yr),US Federal Standard (kWh/yr),% Better than US Federal Standard (kWh/yr),Water Use (gallons/cycle),US Federal Standard (gallons/cycle),% Better than US Federal Standard (gallons/cycle),Date Certified
0,Standard,23.5,22.8,14,Yes,Stainless Steel,Automatic Door Release,"Top Controls,Cycle Status Lights,Hard Food Fil...",225,307,27,3.2,5,36,11/14/2023
1,Standard,23.6,22.5,14,Yes,Stainless Steel,Fan-Assisted Dry,Top Controls,202,307,34,3.1,5,38,07/12/2023
2,Standard,23.6,22.5,14,Yes,Stainless Steel,Fan-Assisted Dry,Top Controls,202,307,34,3.1,5,38,05/14/2024
3,Standard,24.0,21.8,16,No,Stainless Steel,Automatic Door Release,"Front Controls,Third Rack",220,307,28,3.0,5,40,10/21/2022
4,Standard,24.0,21.8,16,No,Stainless Steel,Automatic Door Release,"Top Controls,Cycle Status Lights,Third Rack",220,307,28,3.0,5,40,10/21/2022
5,Standard,24.0,21.8,16,No,Stainless Steel,Automatic Door Release,"Top Controls,Cycle Status Lights,Third Rack",220,307,28,3.0,5,40,10/21/2022
6,Standard,24.0,21.8,17,No,Stainless Steel,Automatic Door Release,"Top Controls,Cycle Status Lights,Third Rack",220,307,28,3.0,5,40,10/21/2022
7,Standard,24.0,21.8,16,No,Stainless Steel,Automatic Door Release,"Top Controls,Cycle Status Lights,Third Rack",220,307,28,3.0,5,40,10/21/2022
8,Standard,24.0,21.8,16,No,Stainless Steel,Automatic Door Release,"Top Controls,Cycle Status Lights,Third Rack",220,307,28,3.0,5,40,10/21/2022
9,Standard,24.0,21.8,17,No,Stainless Steel,Automatic Door Release,"Top Controls,Cycle Status Lights,Third Rack",220,307,28,3.0,5,40,10/21/2022


In [None]:
# Memisahkan kolom numerik dan kategorikal
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
categorical_cols = df.select_dtypes(exclude=['number']).columns.tolist()

# Encoding variabel kategorikal menggunakan One-Hot Encoding
df = pd.get_dummies(df, columns=categorical_cols)

# Scaling fitur numerik menggunakan StandardScaler
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Memisahkan dataset menjadi numerical_df dan categorical_df
numerical_df = df[numerical_cols]
categorical_df = df.drop(columns=numerical_cols)

# Menggabungkan kembali data yang sudah diproses
df_processed = pd.concat([numerical_df, categorical_df], axis=1)

df_processed

Unnamed: 0,Width (inches),Depth (inches),Capacity - Maximum Number of Place Settings,Annual Energy Use (kWh/yr),US Federal Standard (kWh/yr),% Better than US Federal Standard (kWh/yr),Water Use (gallons/cycle),US Federal Standard (gallons/cycle),% Better than US Federal Standard (gallons/cycle),Type_Standard,...,Date Certified_03/07/2024,Date Certified_05/14/2024,Date Certified_06/15/2023,Date Certified_07/12/2023,Date Certified_07/18/2023,Date Certified_07/19/2023,Date Certified_10/10/2023,Date Certified_10/21/2022,Date Certified_10/26/2022,Date Certified_11/14/2023
0,-0.606425,-0.607462,-0.049849,1.118723,0.0,-1.011628,0.817189,0.0,-0.815565,True,...,False,False,False,False,False,False,False,False,False,True
1,-0.599846,-0.626577,-0.049849,-2.041179,0.0,1.820931,0.542997,0.0,-0.544517,True,...,False,False,False,True,False,False,False,False,False,False
2,-0.599846,-0.626577,-0.049849,-2.041179,0.0,1.820931,0.542997,0.0,-0.544517,True,...,False,True,False,False,False,False,False,False,False,False
3,-0.573531,-0.671178,0.747734,0.431788,0.0,-0.606977,0.268806,0.0,-0.273468,True,...,False,False,False,False,False,False,False,True,False,False
4,-0.573531,-0.671178,0.747734,0.431788,0.0,-0.606977,0.268806,0.0,-0.273468,True,...,False,False,False,False,False,False,False,True,False,False
5,-0.573531,-0.671178,0.747734,0.431788,0.0,-0.606977,0.268806,0.0,-0.273468,True,...,False,False,False,False,False,False,False,True,False,False
6,-0.573531,-0.671178,1.146525,0.431788,0.0,-0.606977,0.268806,0.0,-0.273468,True,...,False,False,False,False,False,False,False,True,False,False
7,-0.573531,-0.671178,0.747734,0.431788,0.0,-0.606977,0.268806,0.0,-0.273468,True,...,False,False,False,False,False,False,False,True,False,False
8,-0.573531,-0.671178,0.747734,0.431788,0.0,-0.606977,0.268806,0.0,-0.273468,True,...,False,False,False,False,False,False,False,True,False,False
9,-0.573531,-0.671178,1.146525,0.431788,0.0,-0.606977,0.268806,0.0,-0.273468,True,...,False,False,False,False,False,False,False,True,False,False


In [None]:
# Memisahkan fitur (X) dan label (y)
X = df_processed.drop(columns=['Annual Energy Use (kWh/yr)'])
y = df_processed['Annual Energy Use (kWh/yr)']

# Pembagian dataset menjadi data training dan testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tampilkan ukuran data train dan test
print(f"Ukuran data train: {X_train.shape}, {y_train.shape}")
print(f"Ukuran data test: {X_test.shape}, {y_test.shape}")

Ukuran data train: (44, 39), (44,)
Ukuran data test: (12, 39), (12,)


In [None]:
# Inisialisasi model Decision Tree Regressor
dt_regressor = DecisionTreeRegressor(random_state=42)

# Melatih model Decision Tree Regressor
dt_regressor.fit(X_train, y_train)

# Prediksi dengan model yang dilatih
y_pred_dt = dt_regressor.predict(X_test)

# Evaluasi model Decision Tree Regressor
mse_dt = mean_squared_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)

# Menyimpan hasil evaluasi ke dalam DataFrame
dt_regression_results = pd.DataFrame({
    "Model": ["Decision Tree Regressor"],
    "Mean Squared Error": [mse_dt],
    "R^2 Score": [r2_dt],
})

# Menampilkan hasil evaluasi regresi
print("\n", dt_regression_results)


                      Model  Mean Squared Error  R^2 Score
0  Decision Tree Regressor            0.001573   0.997579


In [None]:
# Inisialisasi model Random Forest Regressor
rf_regressor = RandomForestRegressor(random_state=42)

# Melatih model Random Forest Regressor
rf_regressor.fit(X_train, y_train)

# Prediksi dengan model yang dilatih
y_pred_rf = rf_regressor.predict(X_test)

# Evaluasi model Random Forest Regressor
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Menyimpan hasil evaluasi ke dalam DataFrame
rf_regression_results = pd.DataFrame({
    "Model": ["Random Forest Regressor"],
    "Mean Squared Error": [mse_rf],
    "R^2 Score": [r2_rf],
})

# Menampilkan hasil evaluasi regresi
print("\n", rf_regression_results)


                      Model  Mean Squared Error  R^2 Score
0  Random Forest Regressor            0.009068   0.986041


In [None]:
# Inisialisasi model Linear Regression
lr_regressor = LinearRegression()

# Melatih model Linear Regression
lr_regressor.fit(X_train, y_train)

# Prediksi dengan model yang dilatih
y_pred_lr = lr_regressor.predict(X_test)

# Evaluasi model Linear Regression
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Menyimpan hasil evaluasi ke dalam DataFrame
lr_regression_results = pd.DataFrame({
    "Model": ["Linear Regression"],
    "Mean Squared Error": [mse_lr],
    "R^2 Score": [r2_lr],
})

# Menampilkan hasil evaluasi regresi
print("\n", lr_regression_results)


                Model  Mean Squared Error  R^2 Score
0  Linear Regression            0.001976   0.996958
