##Import Library dan Dataset

In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

url = 'https://raw.githubusercontent.com/farrelrassya/teachingMLDL/main/02.%20Deep%20Learning/Dataset/Infrared.csv'
data = pd.read_csv(url)
data_encoded = pd.get_dummies(data, drop_first=True)
data.head()

Unnamed: 0,Gender,Age,Ethnicity,T_atm,Humidity,Distance,T_offset1,Max1R13_1,Max1L13_1,aveAllR13_1,...,T_FHRC1,T_FHLC1,T_FHBC1,T_FHTC1,T_FH_Max1,T_FHC_Max1,T_Max1,T_OR1,T_OR_Max1,aveOralM
0,Male,41-50,White,24.0,28.0,0.8,0.7025,35.03,35.3775,34.4,...,33.4775,33.3725,33.4925,33.0025,34.53,34.0075,35.6925,35.635,35.6525,36.59
1,Female,31-40,Black or African-American,24.0,26.0,0.8,0.78,34.55,34.52,33.93,...,34.055,33.6775,33.97,34.0025,34.6825,34.66,35.175,35.0925,35.1075,37.19
2,Female,21-30,White,24.0,26.0,0.8,0.8625,35.6525,35.5175,34.2775,...,34.8275,34.6475,34.82,34.67,35.345,35.2225,35.9125,35.86,35.885,37.34
3,Female,21-30,Black or African-American,24.0,27.0,0.8,0.93,35.2225,35.6125,34.385,...,34.4225,34.655,34.3025,34.9175,35.6025,35.315,35.72,34.965,34.9825,37.09
4,Male,18-20,White,24.0,27.0,0.8,0.895,35.545,35.665,34.91,...,35.16,34.3975,34.67,33.8275,35.4175,35.3725,35.895,35.5875,35.6175,37.04


##Pemisahan Fitur dan Target

In [37]:
X_raw = data_encoded.drop(columns=['aveOralM'])
y = data_encoded['aveOralM']
X_raw.head()

Unnamed: 0,T_atm,Humidity,Distance,T_offset1,Max1R13_1,Max1L13_1,aveAllR13_1,aveAllL13_1,T_RC1,T_RC_Dry1,...,Age_26-30,Age_31-40,Age_41-50,Age_51-60,Age_>60,Ethnicity_Asian,Ethnicity_Black or African-American,Ethnicity_Hispanic/Latino,Ethnicity_Multiracial,Ethnicity_White
0,24.0,28.0,0.8,0.7025,35.03,35.3775,34.4,34.9175,34.985,34.985,...,False,False,True,False,False,False,False,False,False,True
1,24.0,26.0,0.8,0.78,34.55,34.52,33.93,34.225,34.71,34.6325,...,False,True,False,False,False,False,True,False,False,False
2,24.0,26.0,0.8,0.8625,35.6525,35.5175,34.2775,34.8,35.685,35.6675,...,False,False,False,False,False,False,False,False,False,True
3,24.0,27.0,0.8,0.93,35.2225,35.6125,34.385,35.2475,35.2075,35.2,...,False,False,False,False,False,False,True,False,False,False
4,24.0,27.0,0.8,0.895,35.545,35.665,34.91,35.3675,35.6025,35.475,...,False,False,False,False,False,False,False,False,False,True


##Menangani Nilai Kosong (NaN)

In [38]:
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X_raw)

#Pengecekkan NaN
np.isnan(X).sum()

np.int64(0)

##Split Data Training dan Testing

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Jumlah data train: {len(X_train)}")
print(f"Jumlah data test : {len(X_test)}")

Jumlah data train: 816
Jumlah data test : 204


#Model K-Nearest  Neighbors dan Desicion Tree

In [40]:
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

y_pred_knn[:5]

array([36.79, 36.58, 36.88, 36.65, 36.78])

In [41]:
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

y_pred_dt[:5]

array([36.89, 36.99, 36.69, 36.64, 36.79])

##Fungsi Evaluasi Model

In [42]:
def evaluate_model(y_true, y_pred, model_name):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    print(f"{model_name}")
    print(f"MSE       : {mse:.4f}")
    print(f"RMSE      : {rmse:.4f}")
    print(f"R-Squared : {r2:.4f}")
    print()

##Evaluasi K-Nearest  Neighbors dan Decision Tree

In [43]:
evaluate_model(y_test, y_pred_knn, "K-Nearest Neighbors")
evaluate_model(y_test, y_pred_dt, "Decision Tree")

K-Nearest Neighbors
MSE       : 0.0922
RMSE      : 0.3036
R-Squared : 0.5622

Decision Tree
MSE       : 0.1264
RMSE      : 0.3556
R-Squared : 0.3996



##Penjelasan Matematika

####Mean Squared Error (MSE)

$$
\text{MSE} = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2
$$

- $y_i$: nilai aktual  
- $\hat{y}_i$: nilai prediksi  
- $n$: jumlah sampel  

Rata-rata kuadrat dari selisih antara prediksi dan nilai aktual. Semakin kecil, semakin baik.

---

####Root Mean Squared Error (RMSE)

$$
\text{RMSE} = \sqrt{\text{MSE}} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2}
$$

- RMSE adalah akar dari MSE. Memiliki satuan yang sama dengan target dan lebih mudah diinterpretasikan.

---

####R-Squared ($R^2$ Score)

$$
R^2 = 1 - \frac{\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}{\sum_{i=1}^{n}(y_i - \bar{y})^2}
$$

- $\bar{y}$: rata-rata dari nilai aktual  
- Nilai $R^2$ berkisar dari 0 (buruk) sampai 1 (sempurna). Semakin dekat ke 1, model semakin baik dalam menjelaskan variasi data.