In [68]:
# Import Library
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import os
import pickle

# Read Dataset
url = "https://raw.githubusercontent.com/josgiv/home-appliance-dataset/master/Refrigerators.csv"
df = pd.read_csv(url)

df

Unnamed: 0,ENERGY STAR Unique ID,Brand Name,Model Number,Additional Model Information,Type,UPC,Defrost Type,Compact,Built-in,Thru the Door Dispenser,...,Adjusted Volume (ft3),Annual Energy Use (kWh/yr),US Federal Standard (kWh/yr),Percent Less Energy Use than US Federal Standard,Connected Functionality,Date Available On Market,Date Certified,Markets,CB Model Identifier,Meets ENERGY STAR Most Efficient 2024 Criteria
0,3531032,Avanti,AVRPD75****,,Compact Refrigerator,,Automatic,Yes,No,No,...,7.5,250,380,34,Yes,05/14/2024,05/22/2024,"United States, Canada",ES_92257_AVRPD75_05232024120655_4918199,Yes
1,2395811,Avanti,FF10B***,,Top Freezer,,Automatic,No,No,No,...,12.2,297,332,11,No,04/22/2022,04/22/2022,"United States, Canada",ES_0092257_FF10B***_05122022033138_5550001_,Yes
2,2389195,Avanti,FF14V0W,,Top Freezer,79841211401,Automatic,No,No,No,...,17.0,332,371,11,No,12/15/2021,12/13/2021,United States,ES_92257_FF14V0W_102020212247527_7644327,Yes
3,2389196,Avanti,FF14V1B,,Top Freezer,79841211418,Automatic,No,No,No,...,17.0,332,371,11,No,12/15/2021,12/13/2021,United States,ES_92257_FF14V1B_102020212247391_3555338,Yes
4,2389194,Avanti,FF14V3S,,Top Freezer,79841211432,Automatic,No,No,No,...,17.0,332,371,11,No,12/15/2021,12/13/2021,United States,ES_92257_FF14V3S_102020212247204_8702264,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594,2305079,Whirlpool,WRT541SZH*0*,,Top Freezer,883049451978,Automatic,No,No,No,...,26.0,399,444,10,No,12/01/2017,10/16/2017,"United States, Canada",ES_0022856_WRT541SZH*0*_10162017122646_70159008,Yes
595,2305080,Whirlpool,WRT541SZH*0*,,Top Freezer,883049451978,Automatic,No,No,No,...,26.0,483,528,10,No,12/01/2017,10/16/2017,"United States, Canada",ES_0022856_WRT541SZH*0*_10162017122647_70159008,Yes
596,2218984,Whirlpool,WRT549SZD*,,Top Freezer,883049339184;883049339191;883049339207,Automatic,No,No,No,...,23.1,378,420,10,No,09/15/2014,09/09/2014,"United States, Canada",ES_0022856_WRT549SZD*_09092014043753_2760577,Yes
597,2272129,Whirlpool,WRT549SZD*,,Top Freezer,883049339184;883049339191;883049339207,Automatic,No,No,No,...,23.1,462,504,10,No,05/13/2015,06/14/2016,"United States, Canada",ES_0022856_WRT549SZD*_07112016115240_70084302,Yes


In [69]:
df.columns

Index(['ENERGY STAR Unique ID', 'Brand Name', 'Model Number',
       'Additional Model Information', 'Type', 'UPC', 'Defrost Type',
       'Compact', 'Built-in', 'Thru the Door Dispenser', 'Ice Maker',
       'Counter Depth', 'Height (in)', 'Width (in)',
       'Capacity (Total Volume) (ft3)', 'Adjusted Volume (ft3)',
       'Annual Energy Use (kWh/yr)', 'US Federal Standard (kWh/yr)',
       'Percent Less Energy Use than US Federal Standard',
       'Connected Functionality', 'Date Available On Market', 'Date Certified',
       'Markets', 'CB Model Identifier',
       'Meets ENERGY STAR Most Efficient 2024 Criteria'],
      dtype='object')

In [70]:
# Fitur-fitur yang mungkin tidak penting untuk prediksi kWh/yr

irrelevant_features = ['ENERGY STAR Unique ID', 'Brand Name', 'Model Number',
                       'Additional Model Information', 'UPC', 'CB Model Identifier',
                       'Date Available On Market', 'Date Certified',
                       'Markets', 'Defrost Type', 
                       'Built-in',
                       'Counter Depth', 'Meets ENERGY STAR Most Efficient 2024 Criteria']



# Menghapus fitur-fitur yang tidak relevan
df = df.drop(columns=irrelevant_features)

# Menghapus kolom yang seluruh isinya adalah NaN
df = df.dropna(axis=1, how='all')

df

Unnamed: 0,Type,Compact,Thru the Door Dispenser,Ice Maker,Height (in),Width (in),Capacity (Total Volume) (ft3),Adjusted Volume (ft3),Annual Energy Use (kWh/yr),US Federal Standard (kWh/yr),Percent Less Energy Use than US Federal Standard,Connected Functionality
0,Compact Refrigerator,Yes,No,No,56.3,21.9,7.5,7.5,250,380,34,Yes
1,Top Freezer,No,No,No,59.6,23.7,10.1,12.2,297,332,11,No
2,Top Freezer,No,No,No,68.0,27.6,14.3,17.0,332,371,11,No
3,Top Freezer,No,No,No,68.0,27.6,14.3,17.0,332,371,11,No
4,Top Freezer,No,No,No,68.0,27.6,14.3,17.0,332,371,11,No
...,...,...,...,...,...,...,...,...,...,...,...,...
594,Top Freezer,No,No,No,64.8,32.5,21.3,26.0,399,444,10,No
595,Top Freezer,No,No,Yes,64.8,32.5,21.3,26.0,483,528,10,No
596,Top Freezer,No,No,No,65.4,29.8,19.3,23.1,378,420,10,No
597,Top Freezer,No,No,Yes,65.4,29.8,19.3,23.1,462,504,10,No


In [71]:
df.columns

Index(['Type', 'Compact', 'Thru the Door Dispenser', 'Ice Maker',
       'Height (in)', 'Width (in)', 'Capacity (Total Volume) (ft3)',
       'Adjusted Volume (ft3)', 'Annual Energy Use (kWh/yr)',
       'US Federal Standard (kWh/yr)',
       'Percent Less Energy Use than US Federal Standard',
       'Connected Functionality'],
      dtype='object')

In [72]:
# Loop melalui setiap kolom di DataFrame 'df'
print(f"DataFrame:")
for col in df.columns:
    unique_values = df[col].unique()
    print(f"Kolom '{col}': {unique_values}")
print("\n")

DataFrame:
Kolom 'Type': ['Compact Refrigerator' 'Top Freezer' 'Bottom Freezer' 'Side-by-Side']
Kolom 'Compact': ['Yes' 'No']
Kolom 'Thru the Door Dispenser': ['No' 'Yes']
Kolom 'Ice Maker': ['No' 'Yes']
Kolom 'Height (in)': [56.3 59.6 68.  66.2 32.5 59.  58.8 59.8 65.4 17.6 67.  66.9 67.6 67.7
 69.7 66.5 61.  66.4 60.6 65.  66.  68.1 36.  32.9 33.8 34.  21.7 22.2
 23.4 16.7 33.3 59.5 66.6 67.4 70.  59.9 66.1 14.  73.6 60.3 60.1 65.1
 64.6 65.9 69.  19.  61.8 59.7 69.1 66.8 64.8 67.3 37.4 64.9 66.3 66.7
 80.  73.  79.  32.3 44.6 51.  55.1 65.6 69.9 67.8 59.1 31.  19.4 70.1
 60.  75.6 68.8 32.  65.8 62. ]
Kolom 'Width (in)': [21.9 23.7 27.6 29.5 17.8 24.  23.8 25.  29.8 19.4 33.  28.  23.2 23.4
 23.6 29.7 32.7 30.  27.9 27.  18.5 23.9 15.9 19.3 21.7 18.9 28.9 35.8
 28.7 23.3 29.6 27.4 19.  36.3 24.1 29.9 32.9 22.  23.5 15.  20.6 21.8
 27.5 29.  26.2 18.6 35.9 28.8 31.3 32.5]
Kolom 'Capacity (Total Volume) (ft3)': [ 7.5 10.1 14.3 18.3  3.  10.3 10.  11.5  1.7 21.   8.3 13.6 10.7 18.
 11.

In [73]:
# Memisahkan kolom numerik dan kategorikal
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
categorical_cols = df.select_dtypes(exclude=['number']).columns.tolist()

# Scaling fitur numerik menggunakan StandardScaler
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Memisahkan dataset menjadi numerical_df dan categorical_df
numerical_df = df[numerical_cols]
categorical_df = df.drop(columns=numerical_cols)

# Menggabungkan kembali data yang sudah diproses
df_processed = pd.concat([numerical_df, categorical_df], axis=1)

df_processed

Unnamed: 0,Height (in),Width (in),Capacity (Total Volume) (ft3),Adjusted Volume (ft3),Annual Energy Use (kWh/yr),US Federal Standard (kWh/yr),Percent Less Energy Use than US Federal Standard,Type,Compact,Thru the Door Dispenser,Ice Maker,Connected Functionality
0,-0.372057,-1.243179,-1.250094,-1.429137,-1.124749,-0.290867,1.673658,Compact Refrigerator,Yes,No,No,Yes
1,-0.107070,-0.817805,-0.777305,-0.731772,-0.550483,-0.905567,-0.469875,Top Freezer,No,No,No,No
2,0.567443,0.103838,-0.013570,-0.019569,-0.122837,-0.406123,-0.469875,Top Freezer,No,No,No,No
3,0.567443,0.103838,-0.013570,-0.019569,-0.122837,-0.406123,-0.469875,Top Freezer,No,No,No,No
4,0.567443,0.103838,-0.013570,-0.019569,-0.122837,-0.406123,-0.469875,Top Freezer,No,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...
594,0.310485,1.261801,1.259322,1.315812,0.695798,0.528734,-0.563072,Top Freezer,No,No,No,No
595,0.310485,1.261801,1.259322,1.315812,1.722148,1.604459,-0.563072,Top Freezer,No,No,Yes,No
596,0.358665,0.623740,0.895639,0.885523,0.439211,0.221384,-0.563072,Top Freezer,No,No,No,No
597,0.358665,0.623740,0.895639,0.885523,1.465560,1.297109,-0.563072,Top Freezer,No,No,Yes,No


In [74]:
df.columns

Index(['Type', 'Compact', 'Thru the Door Dispenser', 'Ice Maker',
       'Height (in)', 'Width (in)', 'Capacity (Total Volume) (ft3)',
       'Adjusted Volume (ft3)', 'Annual Energy Use (kWh/yr)',
       'US Federal Standard (kWh/yr)',
       'Percent Less Energy Use than US Federal Standard',
       'Connected Functionality'],
      dtype='object')

In [75]:

# Pisahkan kolom numerik dan kategorikal
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
categorical_cols = df.select_dtypes(exclude=['number']).columns.tolist()

# Pisahkan fitur dan target
X = df[numerical_cols]  # Gunakan hanya fitur numerik
y = df['Annual Energy Use (kWh/yr)']  # Asumsi 'annual_energy_use_kwh_yr' adalah target

# Bagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling fitur numerik menggunakan StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [76]:
# Inisialisasi model Linear Regression
lr_regressor = LinearRegression()

# Melatih model Linear Regression
lr_regressor.fit(X_train_scaled, y_train)

# Prediksi dengan model yang dilatih
y_pred_lr = lr_regressor.predict(X_test_scaled)

# Evaluasi model Linear Regression
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Menyimpan hasil evaluasi ke dalam DataFrame
lr_regression_results = pd.DataFrame({
    "Model": ["Linear Regression"],
    "Mean Squared Error": [mse_lr],
    "R^2 Score": [r2_lr],
})

# Menampilkan hasil evaluasi regresi
print(lr_regression_results)

               Model  Mean Squared Error  R^2 Score
0  Linear Regression        5.368687e-31        1.0


#### Menyimpan Model dengan akurasi tertinggi sebagai Pickel ####

In [77]:
# Path untuk menyimpan model pickle
model_path = '../../../app/models-pickle/house-energy/refrigerators.pkl'

# Pastikan direktori untuk menyimpan model sudah ada
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Simpan model ke file .pkl
with open(model_path, 'wb') as file:
    pickle.dump(lr_regressor, file)

print(f'Model Linear Regression disimpan ke {model_path}')

Model Linear Regression disimpan ke ../../../app/models-pickle/house-energy/refrigerators.pkl
