In [None]:
# Import Library
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the refrigerator dataset from the CSV hosted on GitHub
url = "https://raw.githubusercontent.com/josgiv/home-appliance-dataset/master/Refrigerators.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Show the loaded frame as the cell output
df

Unnamed: 0,ENERGY STAR Unique ID,Brand Name,Model Number,Additional Model Information,Type,UPC,Defrost Type,Compact,Built-in,Thru the Door Dispenser,...,Adjusted Volume (ft3),Annual Energy Use (kWh/yr),US Federal Standard (kWh/yr),Percent Less Energy Use than US Federal Standard,Connected Functionality,Date Available On Market,Date Certified,Markets,CB Model Identifier,Meets ENERGY STAR Most Efficient 2024 Criteria
0,3531032,Avanti,AVRPD75****,,Compact Refrigerator,,Automatic,Yes,No,No,...,7.5,250,380,34,Yes,05/14/2024,05/22/2024,"United States, Canada",ES_92257_AVRPD75_05232024120655_4918199,Yes
1,2395811,Avanti,FF10B***,,Top Freezer,,Automatic,No,No,No,...,12.2,297,332,11,No,04/22/2022,04/22/2022,"United States, Canada",ES_0092257_FF10B***_05122022033138_5550001_,Yes
2,2389195,Avanti,FF14V0W,,Top Freezer,79841211401,Automatic,No,No,No,...,17.0,332,371,11,No,12/15/2021,12/13/2021,United States,ES_92257_FF14V0W_102020212247527_7644327,Yes
3,2389196,Avanti,FF14V1B,,Top Freezer,79841211418,Automatic,No,No,No,...,17.0,332,371,11,No,12/15/2021,12/13/2021,United States,ES_92257_FF14V1B_102020212247391_3555338,Yes
4,2389194,Avanti,FF14V3S,,Top Freezer,79841211432,Automatic,No,No,No,...,17.0,332,371,11,No,12/15/2021,12/13/2021,United States,ES_92257_FF14V3S_102020212247204_8702264,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594,2305079,Whirlpool,WRT541SZH*0*,,Top Freezer,883049451978,Automatic,No,No,No,...,26.0,399,444,10,No,12/01/2017,10/16/2017,"United States, Canada",ES_0022856_WRT541SZH*0*_10162017122646_70159008,Yes
595,2305080,Whirlpool,WRT541SZH*0*,,Top Freezer,883049451978,Automatic,No,No,No,...,26.0,483,528,10,No,12/01/2017,10/16/2017,"United States, Canada",ES_0022856_WRT541SZH*0*_10162017122647_70159008,Yes
596,2218984,Whirlpool,WRT549SZD*,,Top Freezer,883049339184;883049339191;883049339207,Automatic,No,No,No,...,23.1,378,420,10,No,09/15/2014,09/09/2014,"United States, Canada",ES_0022856_WRT549SZD*_09092014043753_2760577,Yes
597,2272129,Whirlpool,WRT549SZD*,,Top Freezer,883049339184;883049339191;883049339207,Automatic,No,No,No,...,23.1,462,504,10,No,05/13/2015,06/14/2016,"United States, Canada",ES_0022856_WRT549SZD*_07112016115240_70084302,Yes


In [None]:
# Features that are probably not important for predicting kWh/yr
# NOTE(review): the column 'Percent Less Energy Use than US Federal Standard'
# is kept, but it looks like it is computed from the target together with
# 'US Federal Standard (kWh/yr)' (first displayed row: 250 vs 380 -> 34).
# If so, it leaks the target into the features -- confirm whether it should
# be dropped here as well.

irrelevant_features = ['ENERGY STAR Unique ID', 'Brand Name', 'Model Number',
                       'Additional Model Information', 'UPC', 'CB Model Identifier',
                       'Date Available On Market', 'Date Certified']

# Remove the irrelevant features.
# errors='ignore' keeps this cell re-runnable: `df` is overwritten below, so on
# a second execution these columns are already gone and a plain drop() would
# raise KeyError.
df = df.drop(columns=irrelevant_features, errors='ignore')

# Remove columns whose values are all NaN
df = df.dropna(axis=1, how='all')

df

Unnamed: 0,Type,Defrost Type,Compact,Built-in,Thru the Door Dispenser,Ice Maker,Counter Depth,Height (in),Width (in),Capacity (Total Volume) (ft3),Adjusted Volume (ft3),Annual Energy Use (kWh/yr),US Federal Standard (kWh/yr),Percent Less Energy Use than US Federal Standard,Connected Functionality,Markets,Meets ENERGY STAR Most Efficient 2024 Criteria
0,Compact Refrigerator,Automatic,Yes,No,No,No,Yes,56.3,21.9,7.5,7.5,250,380,34,Yes,"United States, Canada",Yes
1,Top Freezer,Automatic,No,No,No,No,No,59.6,23.7,10.1,12.2,297,332,11,No,"United States, Canada",Yes
2,Top Freezer,Automatic,No,No,No,No,No,68.0,27.6,14.3,17.0,332,371,11,No,United States,Yes
3,Top Freezer,Automatic,No,No,No,No,No,68.0,27.6,14.3,17.0,332,371,11,No,United States,Yes
4,Top Freezer,Automatic,No,No,No,No,No,68.0,27.6,14.3,17.0,332,371,11,No,United States,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594,Top Freezer,Automatic,No,No,No,No,No,64.8,32.5,21.3,26.0,399,444,10,No,"United States, Canada",Yes
595,Top Freezer,Automatic,No,No,No,Yes,No,64.8,32.5,21.3,26.0,483,528,10,No,"United States, Canada",Yes
596,Top Freezer,Automatic,No,No,No,No,No,65.4,29.8,19.3,23.1,378,420,10,No,"United States, Canada",Yes
597,Top Freezer,Automatic,No,No,No,Yes,No,65.4,29.8,19.3,23.1,462,504,10,No,"United States, Canada",Yes


In [None]:
# Split the column names into numeric and categorical (non-numeric) groups
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
categorical_cols = df.select_dtypes(exclude=['number']).columns.tolist()

# One-hot encode the categorical variables into a NEW frame.
# `df` is deliberately not overwritten: if it were, re-running this cell would
# treat the boolean dummy columns as categorical and encode them a second time,
# producing columns such as 'Type_Bottom Freezer_False' / '..._True'.
df_encoded = pd.get_dummies(df, columns=categorical_cols)

# Scale the numeric features with StandardScaler
# NOTE(review): the scaler is fitted on every row, before the train/test split
# done in a later cell, so test-set statistics influence the scaling.
# NOTE(review): numerical_cols also contains the target
# 'Annual Energy Use (kWh/yr)', so the target is standardized too and any
# error metric computed on it is in standardized units, not kWh/yr.
scaler = StandardScaler()
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

# Split the encoded data into its numeric part and its one-hot part
numerical_df = df_encoded[numerical_cols]
categorical_df = df_encoded.drop(columns=numerical_cols)

# Recombine the processed parts, numeric columns first
df_processed = pd.concat([numerical_df, categorical_df], axis=1)

df_processed

Unnamed: 0,Height (in),Width (in),Capacity (Total Volume) (ft3),Adjusted Volume (ft3),Annual Energy Use (kWh/yr),US Federal Standard (kWh/yr),Percent Less Energy Use than US Federal Standard,Type_Bottom Freezer_False,Type_Bottom Freezer_True,Type_Compact Refrigerator_False,...,Markets_Canada_True,Markets_United States_False,Markets_United States_True,"Markets_United States, Canada_False","Markets_United States, Canada_True","Markets_United States, Taiwan_False","Markets_United States, Taiwan_True","Markets_United States, Taiwan, Canada_False","Markets_United States, Taiwan, Canada_True",Meets ENERGY STAR Most Efficient 2024 Criteria_Yes_True
0,-0.372057,-1.243179,-1.250094,-1.429137,-1.124749,-0.290867,1.673658,True,False,False,...,False,True,False,False,True,True,False,True,False,True
1,-0.107070,-0.817805,-0.777305,-0.731772,-0.550483,-0.905567,-0.469875,True,False,True,...,False,True,False,False,True,True,False,True,False,True
2,0.567443,0.103838,-0.013570,-0.019569,-0.122837,-0.406123,-0.469875,True,False,True,...,False,False,True,True,False,True,False,True,False,True
3,0.567443,0.103838,-0.013570,-0.019569,-0.122837,-0.406123,-0.469875,True,False,True,...,False,False,True,True,False,True,False,True,False,True
4,0.567443,0.103838,-0.013570,-0.019569,-0.122837,-0.406123,-0.469875,True,False,True,...,False,False,True,True,False,True,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594,0.310485,1.261801,1.259322,1.315812,0.695798,0.528734,-0.563072,True,False,True,...,False,True,False,False,True,True,False,True,False,True
595,0.310485,1.261801,1.259322,1.315812,1.722148,1.604459,-0.563072,True,False,True,...,False,True,False,False,True,True,False,True,False,True
596,0.358665,0.623740,0.895639,0.885523,0.439211,0.221384,-0.563072,True,False,True,...,False,True,False,False,True,True,False,True,False,True
597,0.358665,0.623740,0.895639,0.885523,1.465560,1.297109,-0.563072,True,False,True,...,False,True,False,False,True,True,False,True,False,True


In [None]:
# Separate the label (y) from the features (X)
target_col = 'Annual Energy Use (kWh/yr)'
y = df_processed[target_col]
X = df_processed.drop(columns=[target_col])

# Split into training and testing data (20% held out, fixed random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the sizes of the train and test sets
print(f"Ukuran data train: {X_train.shape}, {y_train.shape}")
print(f"Ukuran data test: {X_test.shape}, {y_test.shape}")

Ukuran data train: (479, 53), (479,)
Ukuran data test: (120, 53), (120,)


In [None]:
# Create the Decision Tree Regressor and train it on the training split
# (fit() returns the estimator itself, so the call can be chained)
dt_regressor = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predict on the test split with the trained model
y_pred_dt = dt_regressor.predict(X_test)

# Evaluate the predictions against the true test labels
r2_dt = r2_score(y_test, y_pred_dt)
mse_dt = mean_squared_error(y_test, y_pred_dt)

# Collect the evaluation results in a one-row DataFrame
dt_regression_results = pd.DataFrame([{
    "Model": "Decision Tree Regressor",
    "Mean Squared Error": mse_dt,
    "R^2 Score": r2_dt,
}])

# Print the regression evaluation results
print("\n", dt_regression_results)


                      Model  Mean Squared Error  R^2 Score
0  Decision Tree Regressor             0.01309   0.989683


In [None]:
# Create the Random Forest Regressor and train it on the training split
# (fit() returns the estimator itself, so the call can be chained)
rf_regressor = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predict on the test split with the trained model
y_pred_rf = rf_regressor.predict(X_test)

# Evaluate the predictions against the true test labels
r2_rf = r2_score(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)

# Collect the evaluation results in a one-row DataFrame
rf_regression_results = pd.DataFrame([{
    "Model": "Random Forest Regressor",
    "Mean Squared Error": mse_rf,
    "R^2 Score": r2_rf,
}])

# Print the regression evaluation results
print("\n", rf_regression_results)


                      Model  Mean Squared Error  R^2 Score
0  Random Forest Regressor            0.004083   0.996782


In [None]:
# Create the Linear Regression model and train it on the training split
# (fit() returns the estimator itself, so the call can be chained)
lr_regressor = LinearRegression().fit(X_train, y_train)

# Predict on the test split with the trained model
y_pred_lr = lr_regressor.predict(X_test)

# Evaluate the predictions against the true test labels
r2_lr = r2_score(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)

# Collect the evaluation results in a one-row DataFrame
lr_regression_results = pd.DataFrame([{
    "Model": "Linear Regression",
    "Mean Squared Error": mse_lr,
    "R^2 Score": r2_lr,
}])

# Print the regression evaluation results
print("\n", lr_regression_results)


                Model  Mean Squared Error  R^2 Score
0  Linear Regression               0.002   0.998424
