COMPARATIVE MODEL

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import matthews_corrcoef, cohen_kappa_score, classification_report

In [3]:
# Load the dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../dataset/cdc_diabetes_health_indicators.csv')

# Preprocessing: 'target' is the label column, everything else is a feature.
X = data.drop(columns='target')
# 'ID' is a row identifier, not a predictor; drop it when present.
X = X.drop(columns='ID', errors='ignore')
y = data['target']

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the mean/variance used to transform the training rows
# (data leakage), which biases every metric reported downstream.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Standardize: learn the statistics from the training rows only, then apply
# the same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for any later cell that still refers to `X_scaled`; it is produced with
# the train-fitted scaler, so it carries no test-set statistics.
X_scaled = scaler.transform(X)


In [4]:
# Candidate classifiers to compare, keyed by the display name used in the
# printed report, the exported table and the figures.
# NOTE(review): the captured output of the evaluation cell stops at
# 'SVM (Linear)'. SVC with probability=True can be very slow on large
# training sets - confirm that its runtime is acceptable for this dataset.
models = dict([
    ('Logistic Regression', LogisticRegression(random_state=42, max_iter=1000)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('SVM (Linear)', SVC(kernel='linear', probability=True, random_state=42)),
    ('SVM (RBF)', SVC(kernel='rbf', probability=True, random_state=42)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('Naive Bayes', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('MLP Neural Network', MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)),
])

# Result accumulator: one independent list per output column, filled by the
# evaluation loop with one entry per model.
results = {
    column: []
    for column in (
        'Model',
        'Accuracy',
        'Precision',
        'Recall',
        'F1-Score',
        'ROC AUC',
        'MCC',
        'Kappa',
    )
}

In [None]:
# Fit every candidate model on the training split, score it on the test split
# and record the metrics in `results` (one entry per model in every list).
print("=== Evaluasi Berbagai Model untuk Perbandingan ===")

# Start from empty lists so that re-running this cell (e.g. after an
# interrupted run) does not append duplicate or partial rows to `results`.
for collected in results.values():
    collected.clear()

for name, model in models.items():
    print(f"\nEvaluasi model: {name}")

    # Train the model
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # ROC AUC needs a continuous score. Prefer the positive-class probability,
    # then the decision-function margin; hard labels are only a last resort
    # because they collapse the ROC curve to a single operating point.
    if hasattr(model, "predict_proba"):
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_pred_proba = model.decision_function(X_test)
    else:
        y_pred_proba = y_pred

    # Compute the metrics
    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 keeps the score at 0 (without a warning) when a model
    # never predicts the positive class.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    mcc = matthews_corrcoef(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)

    # Store and display; the dict keys double as the `results` column names
    # and as the labels of the printed report.
    scores = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC AUC': roc_auc,
        'MCC': mcc,
        'Kappa': kappa,
    }
    results['Model'].append(name)
    for metric_name, metric_value in scores.items():
        results[metric_name].append(metric_value)
        print(f"{metric_name}: {metric_value:.4f}")

=== Evaluasi Berbagai Model untuk Perbandingan ===

Evaluasi model: Logistic Regression
Accuracy: 0.8629
Precision: 0.5267
Recall: 0.1574
F1-Score: 0.2424
ROC AUC: 0.8215
MCC: 0.2332
Kappa: 0.1905

Evaluasi model: Random Forest
Accuracy: 0.8596
Precision: 0.4892
Recall: 0.1763
F1-Score: 0.2592
ROC AUC: 0.7964
MCC: 0.2323
Kappa: 0.2001

Evaluasi model: SVM (Linear)


In [None]:
# Turn the collected metric lists into a table, one row per model.
results_df = pd.DataFrame(results)
print("\n=== Hasil Perbandingan Model ===")
print(results_df)

# Persist the raw comparison as CSV.
results_df.to_csv('perbandingan_model_untuk_jurnal.csv', index=False)

# Build the journal table (pipe-delimited, metrics rounded to 4 decimals) as a
# list of lines, then write it to disk in one go.
metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC', 'MCC', 'Kappa']
table_lines = [
    "Table X: Comparison of different machine learning models for diabetes prediction\n\n",
    "| Model | Accuracy | Precision | Recall | F1-Score | ROC AUC | MCC | Kappa |\n",
    "|-------|----------|-----------|--------|----------|---------|-----|-------|\n",
]
for record in results_df.to_dict('records'):
    cells = [f"{record['Model']}"]
    cells.extend(f"{record[column]:.4f}" for column in metric_columns)
    table_lines.append("| " + " | ".join(cells) + " |\n")
table_lines.append(
    "\n*Note: All models were trained on the same preprocessed dataset with standardized features."
)

with open('table_perbandingan_untuk_jurnal.txt', 'w') as table_file:
    table_file.writelines(table_lines)

In [None]:
# Journal figure: 2x2 grid of horizontal bar charts, one panel per metric,
# with the models sorted in ascending order of that metric.
plt.figure(figsize=(14, 10))

# (subplot position, results_df column, panel title)
bar_panels = [
    (221, 'Accuracy', 'Accuracy Comparison'),
    (222, 'F1-Score', 'F1-Score Comparison'),
    (223, 'ROC AUC', 'ROC AUC Comparison'),
    (224, 'MCC', 'Matthews Correlation Coefficient (MCC) Comparison'),
]

for position, metric_name, panel_title in bar_panels:
    plt.subplot(position)
    sns.barplot(x=metric_name, y='Model', data=results_df.sort_values(metric_name))
    plt.title(panel_title)
    plt.grid(axis='x', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.savefig('model_comparison_charts.png', dpi=300, bbox_inches='tight')
# Close the figure after saving; it is written to disk rather than shown inline.
plt.close()

# Further analysis - radar chart for the journal: one closed polygon per
# model over five metrics.
plt.figure(figsize=(10, 8))

# Data for the radar chart
models_for_radar = results_df['Model'].tolist()
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC']

# One evenly spaced angle per metric; repeat the first angle so each polygon
# ends where it started.
angles = np.linspace(0, 2*np.pi, len(metrics), endpoint=False).tolist()
angles += angles[:1]  # close the loop

# Plot every model.
# The loop variable is deliberately NOT called `model`: in a notebook it would
# overwrite the global `model` (the last fitted estimator) with a string.
# Rows are read by position (iloc) so that the values stay aligned with
# `models_for_radar` even if `results_df` no longer has a 0..n-1 index.
ax = plt.subplot(111, polar=True)
radar_values = results_df[metrics]
for row_pos, model_name in enumerate(models_for_radar):
    values = radar_values.iloc[row_pos].tolist()
    values += values[:1]  # close the loop
    ax.plot(angles, values, linewidth=2, label=model_name)
    ax.fill(angles, values, alpha=0.1)

# Radar chart configuration
plt.xticks(angles[:-1], metrics)
plt.yticks([0.2, 0.4, 0.6, 0.8, 1.0], ['0.2', '0.4', '0.6', '0.8', '1.0'], color='gray')
plt.ylim(0, 1)
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
plt.title('Model Performance Comparison - Radar Chart', size=15)
plt.tight_layout()
plt.savefig('model_comparison_radar.png', dpi=300, bbox_inches='tight')
# Close the figure after saving; it is written to disk rather than shown inline.
plt.close()

# Final status message followed by a numbered list of the files that the
# preceding cells write.
print("\nPerbandingan model selesai! Hasil telah disimpan untuk jurnal.")
print("File yang dihasilkan:")
generated_files = [
    "perbandingan_model_untuk_jurnal.csv",
    "table_perbandingan_untuk_jurnal.txt",
    "model_comparison_charts.png",
    "model_comparison_radar.png",
]
for number, file_name in enumerate(generated_files, start=1):
    print(f"{number}. {file_name}")