## Implementasi Extreme Gradient Boosting (XGB)

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

# Apply the 'ggplot' style so that plots look nicer
plt.style.use('ggplot')

In [2]:
# Location of the insurance dataset; replace it with the path to your own copy.
# A bare file name is enough when the CSV sits in the same folder.
file_path = 'insurance.csv'
df = pd.read_csv(file_path)

# Preview the first five rows.
print("--- Tampilan 5 Baris Pertama Data ---", df.head(), sep="\n")

# Basic structure: column names, non-null counts and dtypes.
# info() writes its report itself, so it is not wrapped in print().
print("\n--- Info Struktur Data ---")
df.info()

# Descriptive statistics for the numeric columns.
print("\n--- Statistik Deskriptif ---", df.describe(), sep="\n")

--- Tampilan 5 Baris Pertama Data ---
   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520

--- Info Struktur Data ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB

--- Statistik Deskriptif ---


In [3]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, so every column yields one indicator fewer than it
# has categories.
df_processed = pd.get_dummies(
    df,
    columns=['sex', 'smoker', 'region'],
    drop_first=True,
)

print("--- Data Setelah One-Hot Encoding ---", df_processed.head(), sep="\n")

--- Data Setelah One-Hot Encoding ---
   age     bmi  children      charges  sex_male  smoker_yes  region_northwest  \
0   19  27.900         0  16884.92400     False        True             False   
1   18  33.770         1   1725.55230      True       False             False   
2   28  33.000         3   4449.46200      True       False             False   
3   33  22.705         0  21984.47061      True       False              True   
4   32  28.880         0   3866.85520      True       False              True   

   region_southeast  region_southwest  
0             False              True  
1              True             False  
2              True             False  
3             False             False  
4             False             False  


In [4]:
# Features (X): every column except the target, used as model input.
X = df_processed.drop(columns='charges')

# Target (y): the value we want to predict.
y = df_processed['charges']

In [5]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each feature split.
print(
    f"Ukuran data latih (X_train): {X_train.shape}",
    f"Ukuran data uji (X_test):   {X_test.shape}",
    sep="\n",
)

Ukuran data latih (X_train): (1070, 8)
Ukuran data uji (X_test):   (268, 8)


In [6]:
# Set up an XGBRegressor with a few commonly tuned hyperparameters.
model = xgb.XGBRegressor(
    n_estimators=100,              # number of trees to build
    max_depth=5,                   # maximum depth of each tree
    learning_rate=0.1,             # how quickly the model learns
    objective='reg:squarederror',  # objective function for regression
    random_state=42,
)

# Fit the model on the training split.
print("--- Melatih Model XGBoost Regressor... ---")
model.fit(X_train, y_train)
print("Model selesai dilatih!")

--- Melatih Model XGBoost Regressor... ---
Model selesai dilatih!
