In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Location of the raw CSV (served from raw.githubusercontent.com).
DATA_URL = 'https://raw.githubusercontent.com/imbich/Project-Machine-Learning/refs/heads/main/data/dataset.csv'

# Integer code assigned to each Target label.
TARGET_CODES = {
    'Dropout': 0,
    'Enrolled': 1,
    'Graduate': 2,
}

# Load the table and fix the misspelled 'Nacionality' column name. Chaining
# the rename (instead of inplace=True) keeps the load a single expression.
df = pd.read_csv(DATA_URL).rename(columns={'Nacionality': 'Nationality'})

# Series.map turns every label that is missing from TARGET_CODES into NaN
# without raising, so check explicitly and fail here with a clear message
# rather than later inside an estimator.
encoded_target = df['Target'].map(TARGET_CODES)
unmapped_rows = encoded_target.isna()
if unmapped_rows.any():
    unexpected = sorted(df.loc[unmapped_rows, 'Target'].astype(str).unique())
    raise ValueError(f"Unexpected Target labels (cannot encode): {unexpected}")
df['Target'] = encoded_target

# Feature table (every column except Target) and the encoded label column.
data = df.drop(columns=['Target'])
label = df['Target']

In [3]:
# Split the data: 30% of the rows are held out for validation, with a fixed
# seed so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    data,
    label,
    test_size=0.3,
    random_state=0,
)

# Fit the scaler on the training rows only, then apply that same fitted
# scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [4]:
# KNN classification model (15 neighbours), trained on the scaled features.
knn = KNeighborsClassifier(n_neighbors=15).fit(X_train_scaled, y_train)

print("KNN Classifier")

# Predictions for the training split and the validation split.
y_train_pred = knn.predict(X_train_scaled)
y_val_pred = knn.predict(X_val_scaled)

# Report accuracy (rounded to 2 decimals) and the per-class report,
# first on the training set and then on the validation set.
evaluations = (
    ("Train Accuracy:", "Train Classification Report:\n", y_train, y_train_pred),
    ("Validation Accuracy:", "Validation Classification Report:\n", y_val, y_val_pred),
)
for accuracy_label, report_label, y_true, y_pred in evaluations:
    print(accuracy_label, round(accuracy_score(y_true, y_pred), 2))
    print(report_label, classification_report(y_true, y_pred))


KNN Classifier
Train Accuracy: 0.74
Train Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.69      0.77       993
           1       0.58      0.24      0.34       561
           2       0.71      0.96      0.82      1542

    accuracy                           0.74      3096
   macro avg       0.73      0.63      0.64      3096
weighted avg       0.74      0.74      0.72      3096

Validation Accuracy: 0.72
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.63      0.72       428
           1       0.48      0.21      0.30       233
           2       0.71      0.95      0.81       667

    accuracy                           0.72      1328
   macro avg       0.67      0.60      0.61      1328
weighted avg       0.70      0.72      0.69      1328



In [5]:
# Softmax regression: logistic regression over the three Target classes.
# NOTE(review): this model is fitted and evaluated on the unscaled
# X_train / X_val, whereas the KNN cell uses X_train_scaled / X_val_scaled.
# Confirm that skipping the scaler here is intentional.
log_reg = LogisticRegression(max_iter=5000, random_state=0).fit(X_train, y_train)

print("Logistic Regression Classifier")

# Predictions for the training split and the validation split.
y_train_pred = log_reg.predict(X_train)
y_val_pred = log_reg.predict(X_val)

# Report accuracy (rounded to 2 decimals) and the per-class report,
# first on the training set and then on the validation set.
evaluations = (
    ("Train Accuracy:", "Train Classification Report:\n", y_train, y_train_pred),
    ("Validation Accuracy:", "Validation Classification Report:\n", y_val, y_val_pred),
)
for accuracy_label, report_label, y_true, y_pred in evaluations:
    print(accuracy_label, round(accuracy_score(y_true, y_pred), 2))
    print(report_label, classification_report(y_true, y_pred))

Logistic Regression Classifier
Train Accuracy: 0.78
Train Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.79      0.80       993
           1       0.57      0.33      0.41       561
           2       0.79      0.93      0.86      1542

    accuracy                           0.78      3096
   macro avg       0.73      0.68      0.69      3096
weighted avg       0.76      0.78      0.76      3096

Validation Accuracy: 0.78
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.77      0.79       428
           1       0.59      0.36      0.44       233
           2       0.80      0.94      0.87       667

    accuracy                           0.78      1328
   macro avg       0.73      0.69      0.70      1328
weighted avg       0.77      0.78      0.77      1328

