##Import Library dan Load Dataset

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

url = 'https://raw.githubusercontent.com/farrelrassya/teachingMLDL/main/02.%20Deep%20Learning/Dataset/income.csv'
df = pd.read_csv(url)

print("Kolom tersedia:", df.columns.tolist())

Kolom tersedia: ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']


##Pengecekkan dan Pembersihan Data

In [25]:
df.replace(' ?', np.nan, inplace=True)
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

##Label Encoding untuk Fitur Kategorikal

In [26]:
le = LabelEncoder()
for col in df.select_dtypes(include=['object']).columns:
    df[col] = le.fit_transform(df[col])

##Pisahkan Fitur dan Target

In [27]:
X = df.drop('income', axis=1)
y = df['income']

##Split Data (Training dan Testing)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

##Normalisasi Fitur

In [29]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

##Membuat dan Melatih Model

In [32]:
models = {
    'K-NN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"\n{name}")
    print(f"Akurasi : {accuracy_score(y_test, y_pred):.2f}")
    print(f"Presisi : {precision_score(y_test, y_pred, average='macro'):.2f}")
    print(f"Recall  : {recall_score(y_test, y_pred, average='macro'):.2f}")
    print(f"F1 Score: {f1_score(y_test, y_pred, average='macro'):.2f}")


K-NN
Akurasi : 0.51
Presisi : 0.39
Recall  : 0.36
F1 Score: 0.36

Decision Tree
Akurasi : 0.46
Presisi : 0.38
Recall  : 0.38
F1 Score: 0.38


##Penjelasan Matematika

### 1. Akurasi (Accuracy)
$$
Accuracy = \\frac{TP + TN}{TP + TN + FP + FN}
$$
- **TP** = True Positive, **TN** = True Negative  
- **FP** = False Positive, **FN** = False Negative  
- Mengukur seberapa banyak prediksi benar dibanding total data.

### 2. Presisi (Precision)
$$
Precision = \\frac{TP}{TP + FP}
$$
- Dari semua yang diprediksi positif, berapa banyak yang benar-benar positif.

### 3. Recall (Sensitivity)
$$
Recall = \\frac{TP}{TP + FN}
$$
- Dari semua data yang benar-benar positif, berapa banyak yang berhasil diprediksi.

### 4. F1-Score
$$
F1 = 2 \\cdot \\frac{Precision \\cdot Recall}{Precision + Recall}
$$
- Rata-rata harmonik antara presisi dan recall. Cocok untuk data yang tidak seimbang.

### 5. AUC (Area Under Curve)
- Luas di bawah kurva ROC.
- Nilai AUC mendekati 1 berarti model sangat baik.

### 6. ROC Curve
- Grafik antara True Positive Rate dan False Positive Rate.
- Model yang bagus memiliki kurva yang naik cepat ke atas kiri grafik.
