<a href="https://colab.research.google.com/github/keripikkaneboo/Machine-Learning/blob/main/07.%20Week%207/TugasWeek7_Booster%26Bagging1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor, BaggingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

In [2]:
def evaluate_models(X, y, dataset_name):
    """
    Melatih dan mengevaluasi model regresi pada dataset yang diberikan.
    """
    if X.empty or y.empty:
        print(f"Skipping {dataset_name}: Data kosong setelah pra-pemrosesan.")
        return

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    models = {
        "Gradient Boosting": GradientBoostingRegressor(random_state=42),
        "AdaBoost": AdaBoostRegressor(random_state=42),
        "Random Forest": RandomForestRegressor(random_state=42),
        "Bagging Regressor": BaggingRegressor(random_state=42)
    }

    print(f"\n--- Hasil Evaluasi untuk Dataset: {dataset_name} ---")
    for name, model in models.items():
        try:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            r2 = r2_score(y_test, y_pred)

            print(f"\nModel: {name}")
            print(f"  MSE: {mse:.4f}")
            print(f"  RMSE: {rmse:.4f}")
            print(f"  R-squared: {r2:.4f}")
        except Exception as e:
            print(f"Error saat melatih/mengevaluasi {name} di {dataset_name}: {e}")

In [3]:
# --- 1. Dataset Automobile ---
print("Memproses Dataset Automobile...")
try:
    df_auto = pd.read_csv("https://raw.githubusercontent.com/farrelrassya/teachingMLDL/refs/heads/main/01.%20Machine%20Learning/01.%20Week%201/Dataset/Automobile.csv")
    df_auto.replace('?', np.nan, inplace=True)

    # Target: price
    # Pra-pemrosesan sederhana
    cols_to_numeric = ['price', 'horsepower', 'peak-rpm', 'bore', 'stroke', 'normalized-losses']
    for col in cols_to_numeric:
        if col in df_auto.columns:
            df_auto[col] = pd.to_numeric(df_auto[col], errors='coerce')

    df_auto.dropna(subset=['price'], inplace=True) # Hapus baris jika target (price) NaN
    y_auto = df_auto['price']
    X_auto = df_auto.drop('price', axis=1)

    # Fitur Numerik
    num_features_auto = X_auto.select_dtypes(include=np.number).columns
    imputer_num_auto = SimpleImputer(strategy='mean')
    X_auto[num_features_auto] = imputer_num_auto.fit_transform(X_auto[num_features_auto])

    # Fitur Kategorikal (gunakan get_dummies untuk kesederhanaan)
    cat_features_auto = X_auto.select_dtypes(include='object').columns
    X_auto = pd.get_dummies(X_auto, columns=cat_features_auto, drop_first=True, dummy_na=False) # dummy_na=False untuk tidak membuat kolom NaN eksplisit setelah imputasi mode (jika ada)

    # Scaling
    scaler_auto = StandardScaler()
    X_auto_scaled = scaler_auto.fit_transform(X_auto)

    evaluate_models(pd.DataFrame(X_auto_scaled, columns=X_auto.columns), y_auto, "Automobile")

except Exception as e:
    print(f"Error memproses dataset Automobile: {e}")

Memproses Dataset Automobile...

--- Hasil Evaluasi untuk Dataset: Automobile ---

Model: Gradient Boosting
  MSE: 5859379.7253
  RMSE: 2420.6156
  R-squared: 0.9521

Model: AdaBoost
  MSE: 9620201.0835
  RMSE: 3101.6449
  R-squared: 0.9214

Model: Random Forest
  MSE: 7762927.2459
  RMSE: 2786.2030
  R-squared: 0.9365

Model: Bagging Regressor
  MSE: 10938948.2924
  RMSE: 3307.4081
  R-squared: 0.9106


In [4]:
# --- 2. Dataset Boston Housing ---
print("\nMemproses Dataset Boston Housing...")
try:
    df_boston = pd.read_csv("https://raw.githubusercontent.com/keripikkaneboo/Machine-Learning/refs/heads/main/02.%20Week%202/BostonHousing.csv")

    # Cek jika kolom target ada, bisa 'medv' atau 'MEDV'
    target_col_boston = None
    if 'medv' in df_boston.columns:
        target_col_boston = 'medv'
    elif 'MEDV' in df_boston.columns:
        target_col_boston = 'MEDV'
    elif df_boston.shape[1] == 14: # Asumsi kolom terakhir adalah target jika nama standar tidak ada
        target_col_boston = df_boston.columns[-1]
        print(f"Menggunakan kolom terakhir '{target_col_boston}' sebagai target untuk Boston Housing.")

    if target_col_boston:
        y_boston = df_boston[target_col_boston]
        X_boston = df_boston.drop(target_col_boston, axis=1)

        # Semua fitur di Boston Housing umumnya numerik
        imputer_boston = SimpleImputer(strategy='mean')
        X_boston_imputed = imputer_boston.fit_transform(X_boston)

        scaler_boston = StandardScaler()
        X_boston_scaled = scaler_boston.fit_transform(X_boston_imputed)

        evaluate_models(pd.DataFrame(X_boston_scaled, columns=X_boston.columns), y_boston, "Boston Housing")
    else:
        print("Kolom target ('medv' atau 'MEDV') tidak ditemukan di dataset Boston Housing.")

except Exception as e:
    print(f"Error memproses dataset Boston Housing: {e}")


Memproses Dataset Boston Housing...

--- Hasil Evaluasi untuk Dataset: Boston Housing ---

Model: Gradient Boosting
  MSE: 6.2082
  RMSE: 2.4916
  R-squared: 0.9153

Model: AdaBoost
  MSE: 13.1122
  RMSE: 3.6211
  R-squared: 0.8212

Model: Random Forest
  MSE: 7.9271
  RMSE: 2.8155
  R-squared: 0.8919

Model: Bagging Regressor
  MSE: 9.9224
  RMSE: 3.1500
  R-squared: 0.8647


In [9]:
# --- Memproses Dataset Infrared ---
print("\nMemproses Dataset Infrared...")

try:
    url = "https://raw.githubusercontent.com/keripikkaneboo/Machine-Learning/refs/heads/main/03.%20Week%203/Infrared.csv"
    df_infrared = pd.read_csv(url)

    # Gunakan kolom terakhir sebagai target
    target_col = df_infrared.columns[-1]
    print(f"🎯 Menggunakan kolom '{target_col}' sebagai target.")

    y = pd.to_numeric(df_infrared[target_col], errors='coerce')
    X = df_infrared.drop(columns=[target_col])

    # Buang baris dengan target NaN
    valid_rows = y.notna()
    y = y[valid_rows]
    X = X.loc[valid_rows]

    # Konversi semua kolom fitur ke numerik
    for col in X.columns:
        if not pd.api.types.is_numeric_dtype(X[col]):
            X[col] = pd.to_numeric(X[col], errors='coerce')

    # Hapus kolom yang semua nilainya NaN
    all_nan_cols = X.columns[X.isna().all()].tolist()
    if all_nan_cols:
        print(f"Menghapus kolom yang semuanya NaN: {all_nan_cols}")
        X = X.drop(columns=all_nan_cols)

    if X.empty or X.shape[1] == 0:
        print("Tidak ada fitur valid yang tersisa. Dataset Infrared dilewati.")
    else:
        # Imputasi nilai yang hilang dengan rata-rata
        imputer = SimpleImputer(strategy='mean')
        X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

        # Normalisasi dengan StandardScaler
        scaler = StandardScaler()
        X_scaled = pd.DataFrame(scaler.fit_transform(X_imputed), columns=X.columns, index=X.index)

        # Evaluasi model
        evaluate_models(X_scaled, y, dataset_name=f"Infrared (Target: {target_col})")

except Exception as e:
    print(f"Error saat memproses dataset Infrared: {e}")


Memproses Dataset Infrared...
🎯 Menggunakan kolom 'aveOralM' sebagai target.
Menghapus kolom yang semuanya NaN: ['Gender', 'Age', 'Ethnicity']

--- Hasil Evaluasi untuk Dataset: Infrared (Target: aveOralM) ---

Model: Gradient Boosting
  MSE: 0.0520
  RMSE: 0.2280
  R-squared: 0.7531

Model: AdaBoost
  MSE: 0.0607
  RMSE: 0.2463
  R-squared: 0.7120

Model: Random Forest
  MSE: 0.0580
  RMSE: 0.2408
  R-squared: 0.7246

Model: Bagging Regressor
  MSE: 0.0643
  RMSE: 0.2536
  R-squared: 0.6945


Berikut adalah **penjelasan setiap persamaan matematika** yang digunakan dalam kode tersebut, khususnya pada bagian evaluasi model:

---

### 1. **Mean Squared Error (MSE)**

```python
mse = mean_squared_error(y_test, y_pred)
```

**Persamaan:**

$$
\text{MSE} = \frac{1}{n} \sum_{i=1}^{n}(y_i - \hat{y}_i)^2
$$

**Penjelasan:**

* $y_i$ = nilai aktual ke-i
* $\hat{y}_i$ = nilai prediksi ke-i
* $n$ = jumlah data
* Mengukur **rata-rata dari kuadrat selisih** antara nilai aktual dan prediksi.
* MSE memberikan penalti besar untuk kesalahan besar → cocok untuk kasus di mana outlier penting.

---

### 2. **Root Mean Squared Error (RMSE)**

```python
rmse = np.sqrt(mse)
```

**Persamaan:**

$$
\text{RMSE} = \sqrt{\frac{1}{n} \sum_{i=1}^{n}(y_i - \hat{y}_i)^2} = \sqrt{\text{MSE}}
$$

**Penjelasan:**

* Akar dari MSE → satuan RMSE sama seperti target asli $y$.
* Lebih mudah diinterpretasikan dalam konteks real-world.
* Masih sensitif terhadap outlier, tapi lebih intuitif dibanding MSE.

---

### 3. **R-squared (R²)**

```python
r2 = r2_score(y_test, y_pred)
```

**Persamaan:**

$$
R^2 = 1 - \frac{\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}{\sum_{i=1}^{n}(y_i - \bar{y})^2}
$$

**Penjelasan:**

* $\bar{y}$ = rata-rata dari nilai aktual
* Menunjukkan **seberapa besar variasi data** yang bisa dijelaskan oleh model
* Nilai:

  * $R^2 = 1$: model sempurna
  * $R^2 = 0$: model tidak lebih baik dari rata-rata
  * $R^2 < 0$: model lebih buruk dari model rata-rata

