In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

# 1. Load data
print("Memuat dataset parkir...")
df = pd.read_csv("/kaggle/input/setulusdata2/log_parkir_detail.csv")

# 2. Data cleaning
df.dropna(inplace=True)
df = df[df['jam'].between(0, 23)]
df = df[df['status_penuh'].isin([0, 1])]

# Pastikan hari dan jam numerik
df['jam'] = df['jam'].astype(int)

# Konversi hari ke angka
hari_mapping = {
    'Senin': 0, 'Selasa': 1, 'Rabu': 2,
    'Kamis': 3, 'Jumat': 4, 'Sabtu': 5, 'Minggu': 6
}
if df['hari'].dtype == 'object':
    df['hari'] = df['hari'].map(hari_mapping)

# Feature Engineering

# Kategori jam
df['kategori_jam'] = pd.cut(
    df['jam'], bins=[-1, 5, 11, 17, 21, 24],
    labels=['dini_hari', 'pagi', 'siang', 'sore', 'malam']
)

# Area penuh rate
area_rate = df.groupby('area_parkir')['status_penuh'].mean().to_dict()
df['area_penuh_rate'] = df['area_parkir'].map(area_rate)

# Interaksi hari dan jam
df['hari_jam'] = df['hari'] * 100 + df['jam']

# Sinusoidal encoding jam
df['jam_sin'] = np.sin(2 * np.pi * df['jam'] / 24)
df['jam_cos'] = np.cos(2 * np.pi * df['jam'] / 24)

# Akhir pekan
df['is_weekend'] = df['hari'].isin([5, 6]).astype(int)

# Statistik historis
df['freq_penuh_area_hari'] = df.groupby(['area_parkir', 'hari'])['status_penuh'].transform('mean')
df['freq_penuh_area_jam'] = df.groupby(['area_parkir', 'jam'])['status_penuh'].transform('mean')
df['volume_area_hari'] = df.groupby(['area_parkir', 'hari'])['status_penuh'].transform('count')

# Label dan fitur
features = [
    'hari', 'jam', 'kategori_jam', 'area_parkir',
    'area_penuh_rate', 'hari_jam', 'jam_sin', 'jam_cos',
    'is_weekend', 'freq_penuh_area_hari',
    'freq_penuh_area_jam', 'volume_area_hari'
]
target = 'status_penuh'

X = df[features]
y = df[target]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

# Model CatBoost
cat_features = ['kategori_jam', 'area_parkir']

model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    eval_metric='AUC',
    random_seed=42,
    cat_features=cat_features,
    verbose=100,
    early_stopping_rounds=50
)

# Train model
model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

# Evaluate
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

print(f"\nAkurasi: {accuracy * 100:.2f}%")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

#  Simpan model dalam format .cbm (CatBoost format)
model.save_model("model_prediksi_parkir_catboost.cbm")
print("\n Model berhasil disimpan sebagai 'model_prediksi_parkir_catboost.cbm'")


Memuat dataset parkir...
0:	test: 0.7615768	best: 0.7615768 (0)	total: 69.5ms	remaining: 34.7s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7756381916
bestIteration = 39

Shrink model to first 40 iterations.

Akurasi: 71.60%
F1 Score: 0.6559
ROC AUC: 0.7756

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.81      0.76       828
           1       0.72      0.60      0.66       672

    accuracy                           0.72      1500
   macro avg       0.72      0.71      0.71      1500
weighted avg       0.72      0.72      0.71      1500


 Model berhasil disimpan sebagai 'model_prediksi_parkir_catboost.cbm'
