In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import joblib
import os

In [2]:
url = "https://drive.google.com/uc?export=download&id=11lMuTGycjsA7i4soyqBjYTC-64pUdyxV"
df = pd.read_csv(url, delimiter=',')
print("Data berhasil dimuat.")

Data berhasil dimuat.


In [3]:
data = df.copy()
label_encoders = {}
for col in data.columns:
    if data[col].dtype == 'object':
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le
print("Proses Label Encoding selesai.")

Proses Label Encoding selesai.


In [4]:
X = data.drop(columns=['Recurred'])
y = data['Recurred']
print("Pemisahan Fitur (X) dan Target (y) selesai.")

Pemisahan Fitur (X) dan Target (y) selesai.


In [5]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Konversi kembali ke DataFrame untuk menjaga nama kolom
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
print("Proses Scaling hanya pada fitur (X) selesai.")

Proses Scaling hanya pada fitur (X) selesai.


In [6]:
feature_names = X.columns.tolist()
categorical_cols_original = df.drop(columns=['Recurred']).select_dtypes(include='object').columns.tolist()
feature_info = {
    'feature_names': feature_names,
    'categorical_cols': categorical_cols_original
}
print("Informasi fitur siap.")

Informasi fitur siap.


In [7]:
# Dataset Murni (Original/Imbalanced)
X_train_murni, X_test_murni, y_train_murni, y_test_murni = train_test_split(X_scaled_df, y, test_size=0.2, random_state=42, stratify=y)

# Dataset Oversampling (SMOTE)
smote = SMOTE(random_state=42)
X_train_over, y_train_over = smote.fit_resample(X_train_murni, y_train_murni)

# Dataset Undersampling (Random)
rus = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = rus.fit_resample(X_train_murni, y_train_murni)

print("Semua skenario dataset (Murni, Oversampling, Undersampling) telah disiapkan.")

Semua skenario dataset (Murni, Oversampling, Undersampling) telah disiapkan.


In [8]:
# Dictionary untuk menyimpan data training setiap skenario
datasets_to_train = {
    "murni": (X_train_murni, y_train_murni),
    "oversampling": (X_train_over, y_train_over),
    "undersampling": (X_train_under, y_train_under)
}

# Model yang akan dilatih
models_to_train = {
    "model_rf": RandomForestClassifier(n_estimators=100, random_state=42),
    "model_lr": LogisticRegression(max_iter=1000, random_state=42),
    "model_knn": KNeighborsClassifier(n_neighbors=5)
}

# Loop utama untuk training dan saving
for dataset_name, (X_train, y_train) in datasets_to_train.items():
    save_dir = os.path.join('model-v2', dataset_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print(f"\n--- Melatih model untuk dataset: {dataset_name} ---")

    for model_name, model_instance in models_to_train.items():
        # Latih model
        model_instance.fit(X_train, y_train)

        # Simpan model
        file_path = os.path.join(save_dir, f'{model_name}.joblib')
        joblib.dump(model_instance, file_path)
        print(f"Model '{model_name}' disimpan di '{file_path}'")


--- Melatih model untuk dataset: murni ---
Model 'model_rf' disimpan di 'model-v2/murni/model_rf.joblib'
Model 'model_lr' disimpan di 'model-v2/murni/model_lr.joblib'
Model 'model_knn' disimpan di 'model-v2/murni/model_knn.joblib'

--- Melatih model untuk dataset: oversampling ---
Model 'model_rf' disimpan di 'model-v2/oversampling/model_rf.joblib'
Model 'model_lr' disimpan di 'model-v2/oversampling/model_lr.joblib'
Model 'model_knn' disimpan di 'model-v2/oversampling/model_knn.joblib'

--- Melatih model untuk dataset: undersampling ---
Model 'model_rf' disimpan di 'model-v2/undersampling/model_rf.joblib'
Model 'model_lr' disimpan di 'model-v2/undersampling/model_lr.joblib'
Model 'model_knn' disimpan di 'model-v2/undersampling/model_knn.joblib'


In [9]:
preprocessor_dir = 'model-v2'
if not os.path.exists(preprocessor_dir):
    os.makedirs(preprocessor_dir)

joblib.dump(scaler, os.path.join(preprocessor_dir, 'scaler.joblib'))
joblib.dump(label_encoders, os.path.join(preprocessor_dir, 'label_encoders.joblib'))
joblib.dump(feature_info, os.path.join(preprocessor_dir, 'feature_info.joblib'))

print("\n--- Preprocessors (scaler, label_encoders, feature_info) berhasil disimpan. ---")
print("\nSemua proses selesai!")


--- Preprocessors (scaler, label_encoders, feature_info) berhasil disimpan. ---

Semua proses selesai!


In [11]:
import json
from sklearn.metrics import classification_report, accuracy_score

print("Mulai menghitung akurasi untuk semua model...")

all_accuracies = {}

# Loop melalui setiap jenis dataset training
for dataset_name in datasets_to_train.keys():
    all_accuracies[dataset_name] = {}
    print(f"\nMengevaluasi model dari dataset '{dataset_name}'...")

    for model_key, model_display_name in [("model_rf", "Random Forest"), ("model_lr", "Logistic Regression"), ("model_knn", "K-NN (K-Nearest Neighbors)")]:
        model_path = os.path.join('models', dataset_name, f'{model_key}.joblib')
        model = joblib.load(model_path)

        y_pred = model.predict(X_test_murni)

        # Hitung akurasi
        accuracy = accuracy_score(y_test_murni, y_pred)
        
        all_accuracies[dataset_name][model_display_name] = accuracy
        
        print(f"  - Akurasi {model_display_name}: {accuracy:.4f}")

# Simpan dictionary akurasi ke dalam file JSON
accuracy_file_path = os.path.join('model-v2', 'accuracies.json')
with open(accuracy_file_path, 'w') as f:
    json.dump(all_accuracies, f, indent=4)

print(f"\nFile akurasi berhasil disimpan di: '{accuracy_file_path}'")

Mulai menghitung akurasi untuk semua model...

Mengevaluasi model dari dataset 'murni'...
  - Akurasi Random Forest: 0.7792
  - Akurasi Logistic Regression: 0.8701
  - Akurasi K-NN (K-Nearest Neighbors): 0.7143

Mengevaluasi model dari dataset 'oversampling'...
  - Akurasi Random Forest: 1.0000
  - Akurasi Logistic Regression: 0.9221
  - Akurasi K-NN (K-Nearest Neighbors): 0.9221

Mengevaluasi model dari dataset 'undersampling'...
  - Akurasi Random Forest: 0.9740
  - Akurasi Logistic Regression: 0.8831
  - Akurasi K-NN (K-Nearest Neighbors): 0.9351

File akurasi berhasil disimpan di: 'model-v2/accuracies.json'


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
