In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle

import os

# Load the televisions CSV (hosted on GitHub) into a DataFrame.
url = "https://raw.githubusercontent.com/josgiv/home-appliance-dataset/master/Televisions.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0,ENERGY STAR Unique ID,ENERGY STAR Partner,Brand Name,Model Name,Model Number,Additional Model Information,UPC,Product Type,Application,Display Type,...,Reported On Mode Power (per the Federal Test Procedure) (watts),Reported Annual Energy Consumption (kWh),Measured Standby Mode Power (Watts),Power Consumption in Standby Mode when Not Connected to a Network (watts),Power Consumption in Standby Mode when Connected to a Network (watts),"Reported Standby-Active, Low Mode Power (watts)",Date Available On Market,Date Certified,Markets,CB Model Identifier
0,3049373,"LG Electronics, Inc.",LG,OLED55B4PU*,OLED55B4PU*,"OLED55B4AU*,OLED55B4AU*,OLED55B4AU* is the sam...",195174077290;195174077306,Television (TV),Consumer,OLED,...,95.50,177.0,0.4,0.3,0.4,0.4,02/20/2024,12/19/2023,United States,ES_1118034_OLED55B4PU*_121920231333329_5810272
1,2838654,"LG Electronics, Inc.",LG,OLED55C4PU*,OLED55C4PU*,"OLED55C4AU*,OLED55C4AU*,OLED55C4AU* is the sam...",195174077238;195174078150,Television (TV),Consumer,OLED,...,103.10,191.0,,0.3,0.4,0.4,02/20/2024,12/05/2023,United States,ES_1118034_OLED55C4PU*_120520230754834_6379453
2,3049374,"LG Electronics, Inc.",LG,OLED65B4PU*,OLED65B4PU*,"OLED65B4AU*,OLED65B4AU*,OLED65B4AU* is the sam...",195174077252;195174078174,Television (TV),Consumer,OLED,...,114.00,211.0,0.4,0.3,0.4,0.4,04/07/2024,12/20/2023,United States,ES_1118034_OLED65B4PU*_122020231113408_7049302
3,2838655,"LG Electronics, Inc.",LG,OLED65C4PU*,OLED65C4PU*,"OLED65C4AU*,OLED65C4AU*,OLED65C4AU* is the sam...",195174077207;195174078181,Television (TV),Consumer,OLED,...,131.90,244.0,,0.3,0.5,0.5,02/20/2024,12/05/2023,United States,ES_1118034_OLED65C4PU*_120520231442163_1126459
4,3143234,"LG Electronics, Inc.",LG,OLED77B4PU*,OLED77B4PU*,"OLED77B4AU*,OLED77B4AU*,OLED77B4AU* is the sam...",195174077214;195174077221,Television (TV),Consumer,OLED,...,142.80,263.0,0.4,0.3,0.4,0.4,04/07/2024,12/21/2023,United States,ES_1118034_OLED77B4PU*_122120230719523_8888606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104,3354310,"Samsung Electronics Co., Ltd.",Samsung,QN55S90DAF,QN55S90DAF,"QN55S90DDF,QN55S90DDF,Same as basic model exce...",,Television (TV),Consumer,OLED,...,98.58,184.1,,0.3,0.6,,12/29/2023,12/18/2023,United States,ES_1023593_QN55S90DAF_121820230432730_2283422
105,2406405,"Samsung Electronics Co., Ltd.",Samsung,QN55S95CAF,QN55S95CAF,"QN55S95CDF,QN55S95CDF,Same as basic model exce...",,Television (TV),Consumer,Other,...,100.00,192.9,,0.3,0.4,0.4,02/10/2023,12/29/2022,United States,ES_1023593_QN55S95CAF_123020220352905_3086937
106,3354428,"Samsung Electronics Co., Ltd.",Samsung,QN55S95DAF,QN55S95DAF,"QN55S95DDF,QN55S95DDF,Same as basic model exce...",,Television (TV),Consumer,OLED,...,112.43,245.8,,0.3,0.7,,01/02/2024,12/26/2023,United States,ES_1023593_QN55S95DAF_122620230523739_4325753
107,2406379,"Samsung Electronics Co., Ltd.",Samsung,QN65QN85CAF,QN65QN85CAF,"QN65QN85CDF,QN65QN85CDF,Same as basic model ex...",,Television (TV),Consumer,QD-LED (QLED),...,110.86,160.1,,0.3,0.4,0.3,12/31/2022,12/28/2022,United States,ES_1023593_QN65QN85CAF_122820220822846_3841384


In [14]:
# Print the column labels of the freshly loaded frame.
print(df.columns)

Index(['ENERGY STAR Unique ID', 'ENERGY STAR Partner', 'Brand Name',
       'Model Name', 'Model Number', 'Additional Model Information', 'UPC',
       'Product Type', 'Application', 'Display Type',
       'Backlight Technology Type', 'Diagonal Viewable Screen Size (in.)',
       'Screen Area (sq. in.)', 'Native Horizontal Resolution (pixels)',
       'Native Vertical Resolution (pixels)', 'Resolution Format',
       'High Contrast Ratio (HCR) Display', 'Physical Data Ports Available',
       'Ethernet Supported', 'Low Power Wireless Technologies Supported',
       'Features', 'Automatic Brightness Control',
       'Is Automatic Brightness Control Enabled by Default in the Default SDR Preset Picture Setting When Television is Shipped?',
       'Average On Mode Power Consumption for Certification (watts)',
       'Maximum Average On Mode Power for Certification (watts)',
       'Reported On Mode Power (per the Federal Test Procedure) (watts)',
       'Reported Annual Energy Consumption 

In [15]:

# Column labels that are removed from the frame before modelling.
irrelevant_features = [
    'ENERGY STAR Unique ID',
    'ENERGY STAR Partner',
    'UPC',
    'Native Horizontal Resolution (pixels)',
    'Native Vertical Resolution (pixels)',
    'Date Available On Market',
    'Date Certified',
    'Model Name',
    'Model Number',
    'Additional Model Information',
    'Product Type',
    'CB Model Identifier',
    'Markets',
    'Application',
    'Screen Area (sq. in.)',
    'Reported On Mode Power (per the Federal Test Procedure) (watts)',
    'Power Consumption in Standby Mode when Not Connected to a Network (watts)',
    'Power Consumption in Standby Mode when Connected to a Network (watts)',
    'Reported Standby-Active, Low Mode Power (watts)',
    'Measured Standby Mode Power (Watts)',
    'Average On Mode Power Consumption for Certification (watts)',
    'Maximum Average On Mode Power for Certification (watts)',
    'Features',
]

# Remove the listed columns; `df` is rebound to the reduced frame.
# A label missing from `df` raises KeyError, so this cell cannot be run twice
# without reloading the data first.
df = df.drop(irrelevant_features, axis="columns")

In [16]:
# Rename the very long brightness-default column to the short label 'Auto Brightness'.
df = df.rename(
    columns={
        'Is Automatic Brightness Control Enabled by Default in the Default SDR Preset Picture Setting When Television is Shipped?': 'Auto Brightness',
    }
)

# Render the frame with the shortened column label.
df

Unnamed: 0,Brand Name,Display Type,Backlight Technology Type,Diagonal Viewable Screen Size (in.),Resolution Format,High Contrast Ratio (HCR) Display,Physical Data Ports Available,Ethernet Supported,Low Power Wireless Technologies Supported,Automatic Brightness Control,Auto Brightness,Reported Annual Energy Consumption (kWh)
0,LG,OLED,OLED,54.6,4K (UHD),Yes,"Universal Serial Bus (USB),Other,RS-232,HDMI",Fast Ethernet (100 Mbit/s),Bluetooth,Yes,No,177.0
1,LG,OLED,OLED,54.6,4K (UHD),Yes,"Universal Serial Bus (USB),Other,RS-232,HDMI",Fast Ethernet (100 Mbit/s),Bluetooth,Yes,No,191.0
2,LG,OLED,OLED,64.5,4K (UHD),Yes,"Universal Serial Bus (USB),Other,RS-232,HDMI",Fast Ethernet (100 Mbit/s),Bluetooth,Yes,No,211.0
3,LG,OLED,OLED,64.5,4K (UHD),Yes,"Universal Serial Bus (USB),Other,RS-232,HDMI",Fast Ethernet (100 Mbit/s),Bluetooth,Yes,No,244.0
4,LG,OLED,OLED,76.7,4K (UHD),Yes,"Universal Serial Bus (USB),Other,RS-232,HDMI",Fast Ethernet (100 Mbit/s),Bluetooth,Yes,No,263.0
...,...,...,...,...,...,...,...,...,...,...,...,...
104,Samsung,OLED,OLED,54.6,4K (UHD),Yes,"Universal Serial Bus (USB),Other,RS-232,HDMI",Fast Ethernet (100 Mbit/s),Bluetooth,Yes,No,184.1
105,Samsung,Other,OLED,54.6,4K (UHD),Yes,"Universal Serial Bus (USB),Other,RS-232,HDMI",Fast Ethernet (100 Mbit/s),Bluetooth,Yes,No,192.9
106,Samsung,OLED,OLED,54.6,4K (UHD),Yes,"Universal Serial Bus (USB),Other,RS-232,HDMI",Fast Ethernet (100 Mbit/s),Bluetooth,Yes,No,245.8
107,Samsung,QD-LED (QLED),Direct-lit LED,64.5,4K (UHD),No,"Universal Serial Bus (USB),Other,RS-232,HDMI",Fast Ethernet (100 Mbit/s),Bluetooth,Yes,No,160.1


In [17]:
# Print the column labels that remain after the drop and rename steps.
print(df.columns)

Index(['Brand Name', 'Display Type', 'Backlight Technology Type',
       'Diagonal Viewable Screen Size (in.)', 'Resolution Format',
       'High Contrast Ratio (HCR) Display', 'Physical Data Ports Available',
       'Ethernet Supported', 'Low Power Wireless Technologies Supported',
       'Automatic Brightness Control', 'Auto Brightness',
       'Reported Annual Energy Consumption (kWh)'],
      dtype='object')


In [18]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

# %%
# Partition the column labels by dtype: numeric columns vs. everything else.
numerical_cols = list(df.select_dtypes(include='number').columns)
categorical_cols = list(df.select_dtypes(exclude='number').columns)

# Preview the first rows; the frame itself is not modified in this cell.
print(df.head())

  Brand Name Display Type Backlight Technology Type  \
0         LG         OLED                      OLED   
1         LG         OLED                      OLED   
2         LG         OLED                      OLED   
3         LG         OLED                      OLED   
4         LG         OLED                      OLED   

   Diagonal Viewable Screen Size (in.) Resolution Format  \
0                                 54.6          4K (UHD)   
1                                 54.6          4K (UHD)   
2                                 64.5          4K (UHD)   
3                                 64.5          4K (UHD)   
4                                 76.7          4K (UHD)   

  High Contrast Ratio (HCR) Display  \
0                               Yes   
1                               Yes   
2                               Yes   
3                               Yes   
4                               Yes   

                  Physical Data Ports Available          Ethernet Support

In [19]:
# Print the distinct values found in each column of `df`, one line per column.
print("DataFrame:")
for col, column_values in df.items():
    print(f"Kolom '{col}': {column_values.unique()}")
print("\n")



DataFrame:
Kolom 'Brand Name': ['LG' 'Clear Tunes' 'Emerson' 'NEC' 'Samsung' 'Insignia' 'RCA' 'Sansui'
 'SANSUI.' 'IMPECCA' 'Silo' 'PHILIPS' 'XITRIX' 'SCEPTRE']
Kolom 'Display Type': ['OLED' 'LCD' 'LCD,Other' 'Other' 'QD-LED (QLED)']
Kolom 'Backlight Technology Type': ['OLED' 'Direct-lit LED' 'Edge-lit LED' 'Other']
Kolom 'Diagonal Viewable Screen Size (in.)': [54.6  64.5  76.7  82.5  13.23 13.25 15.55 15.47 18.49 31.6  42.5  55.
 41.5  47.5  49.5  23.53 31.5  31.47 32.   49.61 40.   64.4  74.37 24.
 84.6  23.55 39.5  74.5  85.6  41.6  74.6  54.64 64.53 54.5  57.5  58.
 65.   76.6  77.5  84.5  31.51 50.  ]
Kolom 'Resolution Format': ['4K (UHD)' 'Other' 'High Definition (HD)' 'Standard Definition (SD)']
Kolom 'High Contrast Ratio (HCR) Display': ['Yes' 'No']
Kolom 'Physical Data Ports Available': ['Universal Serial Bus (USB),Other,RS-232,HDMI'
 'Universal Serial Bus (USB),Other,HDMI' 'Universal Serial Bus (USB),HDMI'
 'Universal Serial Bus (USB)' 'Other']
Kolom 'Ethernet Supported': ['F

In [20]:
# Show the column labels of `df` ahead of the train/test split.
df.columns

Index(['Brand Name', 'Display Type', 'Backlight Technology Type',
       'Diagonal Viewable Screen Size (in.)', 'Resolution Format',
       'High Contrast Ratio (HCR) Display', 'Physical Data Ports Available',
       'Ethernet Supported', 'Low Power Wireless Technologies Supported',
       'Automatic Brightness Control', 'Auto Brightness',
       'Reported Annual Energy Consumption (kWh)'],
      dtype='object')

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

# Assumes `df` has already been loaded and cleaned by the cells above.

# Separate the features (X) from the label (y).
X = df.drop(columns=['Reported Annual Energy Consumption (kWh)'])
y = df['Reported Annual Energy Consumption (kWh)']

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the displayed frame ends at index 107, so only about 11 rows land
# in the test split and the scores below will be noisy. Consider the already
# imported cross_val_score for a more stable comparison.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Categorical features are the object-dtype columns of X.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Fit the one-hot encoder on the training rows only, then reuse it on the test rows.
# handle_unknown='ignore' makes categories unseen during fit encode as all zeros.
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train[categorical_cols])
X_test_encoded = encoder.transform(X_test[categorical_cols])

# Names of the generated one-hot columns.
encoded_columns = encoder.get_feature_names_out(categorical_cols)

# Turn the sparse encoder output into DataFrames with readable column names.
X_train_encoded_df = pd.DataFrame(X_train_encoded.toarray(), columns=encoded_columns)
X_test_encoded_df = pd.DataFrame(X_test_encoded.toarray(), columns=encoded_columns)

# Join the non-categorical columns with the one-hot columns. The index is reset
# so both parts share a 0..n-1 index and concat aligns them row by row.
X_train_processed = pd.concat([X_train.drop(columns=categorical_cols).reset_index(drop=True), X_train_encoded_df], axis=1)
X_test_processed = pd.concat([X_test.drop(columns=categorical_cols).reset_index(drop=True), X_test_encoded_df], axis=1)

# One single-step pipeline per candidate model.
pipelines = [
    Pipeline([('regressor', RandomForestRegressor(random_state=42))]),
    Pipeline([('regressor', LinearRegression())]),
    Pipeline([('regressor', GradientBoostingRegressor(random_state=42))]),
    Pipeline([('regressor', DecisionTreeRegressor(random_state=42))])
]

# Display names, in the same order as `pipelines`.
model_names = ['Random Forest', 'Linear Regression', 'Gradient Boosting', 'Decision Tree']

# One record (plain dict) per model; the results frame is built once after the loop
# instead of creating and concatenating a single-row DataFrame per model.
results = []

for pipeline, model_name in zip(pipelines, model_names):
    # Train on the processed training features.
    pipeline.fit(X_train_processed, y_train)

    # Predict on the held-out rows.
    y_pred = pipeline.predict(X_test_processed)

    # Score the predictions against the held-out labels.
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append({
        "Model": model_name,
        "Mean Squared Error": mse,
        "R^2 Score": r2,
    })

# Build the summary table in a single step from the collected records.
all_results = pd.DataFrame(results)

# Show the evaluation results for all models.
print("\nHasil Evaluasi untuk Semua Model:")
print(all_results)



Hasil Evaluasi untuk Semua Model:
               Model  Mean Squared Error  R^2 Score
0      Random Forest         1181.379260   0.895255
1  Linear Regression         2119.130138   0.812110
2  Gradient Boosting          809.844078   0.928196
3      Decision Tree          567.099327   0.949719


In [22]:
# Render `df` again; the model-comparison cell above does not modify it.
df

Unnamed: 0,Brand Name,Display Type,Backlight Technology Type,Diagonal Viewable Screen Size (in.),Resolution Format,High Contrast Ratio (HCR) Display,Physical Data Ports Available,Ethernet Supported,Low Power Wireless Technologies Supported,Automatic Brightness Control,Auto Brightness,Reported Annual Energy Consumption (kWh)
0,LG,OLED,OLED,54.6,4K (UHD),Yes,"Universal Serial Bus (USB),Other,RS-232,HDMI",Fast Ethernet (100 Mbit/s),Bluetooth,Yes,No,177.0
1,LG,OLED,OLED,54.6,4K (UHD),Yes,"Universal Serial Bus (USB),Other,RS-232,HDMI",Fast Ethernet (100 Mbit/s),Bluetooth,Yes,No,191.0
2,LG,OLED,OLED,64.5,4K (UHD),Yes,"Universal Serial Bus (USB),Other,RS-232,HDMI",Fast Ethernet (100 Mbit/s),Bluetooth,Yes,No,211.0
3,LG,OLED,OLED,64.5,4K (UHD),Yes,"Universal Serial Bus (USB),Other,RS-232,HDMI",Fast Ethernet (100 Mbit/s),Bluetooth,Yes,No,244.0
4,LG,OLED,OLED,76.7,4K (UHD),Yes,"Universal Serial Bus (USB),Other,RS-232,HDMI",Fast Ethernet (100 Mbit/s),Bluetooth,Yes,No,263.0
...,...,...,...,...,...,...,...,...,...,...,...,...
104,Samsung,OLED,OLED,54.6,4K (UHD),Yes,"Universal Serial Bus (USB),Other,RS-232,HDMI",Fast Ethernet (100 Mbit/s),Bluetooth,Yes,No,184.1
105,Samsung,Other,OLED,54.6,4K (UHD),Yes,"Universal Serial Bus (USB),Other,RS-232,HDMI",Fast Ethernet (100 Mbit/s),Bluetooth,Yes,No,192.9
106,Samsung,OLED,OLED,54.6,4K (UHD),Yes,"Universal Serial Bus (USB),Other,RS-232,HDMI",Fast Ethernet (100 Mbit/s),Bluetooth,Yes,No,245.8
107,Samsung,QD-LED (QLED),Direct-lit LED,64.5,4K (UHD),No,"Universal Serial Bus (USB),Other,RS-232,HDMI",Fast Ethernet (100 Mbit/s),Bluetooth,Yes,No,160.1


In [23]:
# Show the column labels of `df` before building the pipeline that gets saved.
df.columns

Index(['Brand Name', 'Display Type', 'Backlight Technology Type',
       'Diagonal Viewable Screen Size (in.)', 'Resolution Format',
       'High Contrast Ratio (HCR) Display', 'Physical Data Ports Available',
       'Ethernet Supported', 'Low Power Wireless Technologies Supported',
       'Automatic Brightness Control', 'Auto Brightness',
       'Reported Annual Energy Consumption (kWh)'],
      dtype='object')

#### Menyimpan Model dengan akurasi tertinggi sebagai Pickle ####

In [24]:
# Preprocessing + Decision Tree Regressor pipeline that is persisted below.
#
# Only the categorical columns (`categorical_cols`, computed in the model
# comparison cell) are one-hot encoded; every remaining column, i.e. the numeric
# screen size, is passed through unchanged. One-hot encoding the numeric column
# would make any screen size not seen during training encode to all zeros
# (handle_unknown='ignore'), and the saved model would not use the same feature
# representation as the Decision Tree that was evaluated.
#
# Because columns are selected by name, the pipeline must be fitted on and called
# with a DataFrame that carries the training column labels.
pipeline_dt = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ],
        remainder='passthrough',
    )),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

In [25]:
# Train the Decision Tree pipeline on the raw (un-encoded) training features;
# the pipeline performs its own preprocessing.
pipeline_dt.fit(X_train, y_train)

# Destination of the pickled model, relative to the notebook's working directory.
model_path = '../../../app/models-pickle/house-energy/televisions.pkl'

# Create the target directory if it is missing so that open() below does not
# fail with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Serialise the fitted pipeline to the .pkl file.
with open(model_path, 'wb') as file:
    pickle.dump(pipeline_dt, file)

print(f'Model Decision Tree disimpan ke {model_path}')

Model Decision Tree disimpan ke ../../../app/models-pickle/house-energy/televisions.pkl
