In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the preprocessed dataset, skipping the CSV header row
data = np.loadtxt(
    "../data/processed/processed_data.csv",
    delimiter=",",
    skiprows=1,
)

# Split into features (every column but the last) and target (last column)
X_raw, y = data[:, :-1], data[:, -1]

In [3]:

# Train/test split
def train_test_split(X, y, test_size=0.2, random_state=42):
    """Shuffle the samples and split them into train and test subsets.

    Parameters
    ----------
    X : np.ndarray
        Feature array, indexed by sample along axis 0.
    y : np.ndarray
        Targets, indexed by the same sample positions as ``X``.
    test_size : float, default 0.2
        Fraction of samples held out. The train subset receives
        ``int(n_samples * (1 - test_size))`` samples; the rest go to test.
    random_state : int or None, default 42
        Seed for the shuffle. ``None`` draws a fresh, non-reproducible seed.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # A private RandomState produces the same permutation as seeding the global
    # generator, but does not reset NumPy's global random state as a side effect.
    rng = np.random.RandomState(random_state)
    n_samples = X.shape[0]
    indices = np.arange(n_samples)
    rng.shuffle(indices)
    split = int(n_samples * (1 - test_size))
    train_idx, test_idx = indices[:split], indices[split:]
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

# Hold out 20% of the samples for testing, shuffled with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42
)


In [4]:

# Logistic Regression 
def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    Parameters
    ----------
    z : array_like
        Input value(s); converted to a float array.

    Returns
    -------
    np.ndarray or np.float64
        Element-wise sigmoid with the same shape as ``z``.
    """
    z = np.asarray(z, dtype=float)
    # log(1 + exp(-z)) is evaluated by logaddexp without ever forming exp(-z),
    # so large negative inputs no longer overflow (and warn) on the way to 0.
    return np.exp(-np.logaddexp(0.0, -z))
# Add the bias term: prepend a column of ones to each feature matrix
X_train_bias = np.column_stack([np.ones(X_train.shape[0]), X_train])
X_test_bias = np.column_stack([np.ones(X_test.shape[0]), X_test])

In [5]:
# Batch gradient descent for logistic regression, starting from all-zero weights
lr = 0.01
epochs = 5000
theta = np.zeros(X_train_bias.shape[1])
y_train_reshaped = y_train.reshape(-1, 1)
n_train = y_train_reshaped.size

for _ in range(epochs):
    # Work with theta as a column so predictions line up with the column targets
    z = X_train_bias @ theta[:, np.newaxis]
    h = sigmoid(z)
    gradient = (X_train_bias.T @ (h - y_train_reshaped)) / n_train
    theta -= lr * gradient.ravel()

# Predictions: probabilities first, then hard labels at the 0.5 threshold
theta_col = theta.reshape(-1, 1)
y_prob_train = sigmoid(X_train_bias @ theta_col)
y_prob_test = sigmoid(X_test_bias @ theta_col)
y_pred_train = (y_prob_train >= 0.5).astype(int)
y_pred_test = (y_prob_test >= 0.5).astype(int)

In [6]:
# KNN
def knn_predict(X_train, y_train, X_test, k=5):
    """Predict labels with a brute-force k-nearest-neighbours vote.

    For each test row, the ``k`` training rows with the smallest Euclidean
    distance are selected and the prediction is the rounded mean of their
    labels. For 0/1 labels this is a majority vote; ``np.round`` rounds halves
    to even, so an exact tie (possible when ``k`` is even) resolves to 0.

    Parameters
    ----------
    X_train : np.ndarray
        Training features, one sample per row.
    y_train : np.ndarray
        Training labels, indexed like the rows of ``X_train``.
    X_test : np.ndarray
        Samples to classify, one per row.
    k : int, default 5
        Number of neighbours that vote. If ``k`` exceeds the number of
        training rows, all of them vote.

    Returns
    -------
    np.ndarray
        One predicted label per row of ``X_test``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    # k == 0 would average an empty neighbour set (NaN) and a negative k would
    # slice off the wrong end of the sorted distances, so reject both up front.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    y_pred = []
    for x in X_test:
        distances = np.sqrt(np.sum((X_train - x) ** 2, axis=1))
        nearest = np.argsort(distances)[:k]
        y_pred.append(np.round(np.mean(y_train[nearest])))
    return np.array(y_pred)

K_NEIGHBORS = 5  # number of neighbours that vote on each test prediction
y_pred_knn = knn_predict(X_train, y_train, X_test, k=K_NEIGHBORS)

# Model evaluation
def evaluate(y_true, y_pred):
    """Compute accuracy, precision, recall and F1 for binary 0/1 labels.

    Both inputs are flattened first, so column vectors and 1-D arrays can be
    mixed. Label 1 is treated as the positive class.

    Parameters
    ----------
    y_true : array_like
        Ground-truth labels.
    y_pred : array_like
        Predicted labels, with the same number of elements as ``y_true``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``. A metric whose denominator is
        zero (e.g. precision when nothing is predicted positive) is 0.0.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    accuracy = np.mean(y_true == y_pred)

    tp = np.count_nonzero((y_true == 1) & (y_pred == 1))
    tn = np.count_nonzero((y_true == 0) & (y_pred == 0))
    fp = np.count_nonzero((y_true == 0) & (y_pred == 1))
    fn = np.count_nonzero((y_true == 1) & (y_pred == 0))

    # Guard zero denominators explicitly instead of adding a small epsilon:
    # the epsilon biased every score (a perfect precision came out below 1).
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    return accuracy, precision, recall, f1


In [7]:

# Score both models first, then print the summary
acc_train, prec_train, rec_train, f1_train = evaluate(y_train, y_pred_train)
acc_test, prec_test, rec_test, f1_test = evaluate(y_test, y_pred_test)
acc_knn, prec_knn, rec_knn, f1_knn = evaluate(y_test, y_pred_knn[:, np.newaxis])

# Logistic Regression
print("Logistic Regression")
for split_label, split_acc, split_f1 in (
    ("Train Acc:", acc_train, f1_train),
    ("Test Acc:", acc_test, f1_test),
):
    print(split_label, split_acc, "F1:", split_f1)

# KNN
print("\nKNN")
print("Test Acc:", acc_knn, "F1:", f1_knn)


Logistic Regression
Train Acc: 0.7543390317108182 F1: 0.0947343109855291
Test Acc: 0.7554801670146137 F1: 0.0964320152752218

KNN
Test Acc: 0.7492171189979123 F1: 0.44928366712520657


- Logistic Regression: Accuracy khoảng 75%, F1 rất thấp → dự đoán gần như toàn nhãn 0.
- KNN: Accuracy ~75%, F1 ~0.45 → dự đoán nhãn 1 tốt hơn, cân bằng hơn giữa precision và recall.