In [114]:
import pandas as pd
import numpy as np

# Load the raw dataset; the file is read with ';' as the field separator.
df = pd.read_csv("predict_students_dropout_and_academic_success.csv", sep = ";")
df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


### *Preprocessing* ###

In [98]:
df.dtypes

Marital status                                      int64
Application mode                                    int64
Application order                                   int64
Course                                              int64
Daytime/evening attendance\t                        int64
Previous qualification                              int64
Previous qualification (grade)                    float64
Nacionality                                         int64
Mother's qualification                              int64
Father's qualification                              int64
Mother's occupation                                 int64
Father's occupation                                 int64
Admission grade                                   float64
Displaced                                           int64
Educational special needs                           int64
Debtor                                              int64
Tuition fees up to date                             int64
Gender        

In [99]:
df.isna().sum(axis=0)

Marital status                                    0
Application mode                                  0
Application order                                 0
Course                                            0
Daytime/evening attendance\t                      0
Previous qualification                            0
Previous qualification (grade)                    0
Nacionality                                       0
Mother's qualification                            0
Father's qualification                            0
Mother's occupation                               0
Father's occupation                               0
Admission grade                                   0
Displaced                                         0
Educational special needs                         0
Debtor                                            0
Tuition fees up to date                           0
Gender                                            0
Scholarship holder                                0
Age at enrol

In [115]:
def encode_target(value: str) -> list:
    """One-hot encode a target label.

    Args:
        value: One of "Dropout", "Enrolled" or "Graduate".

    Returns:
        A 3-element list: [1, 0, 0] for "Dropout", [0, 1, 0] for "Enrolled"
        and [0, 0, 1] for "Graduate".

    Raises:
        ValueError: If ``value`` is not one of the three known labels. This
            also catches re-applying the encoder to an already encoded column,
            which would otherwise silently turn every row into "Graduate".
    """
    if value == "Dropout":
        return [1, 0, 0]
    if value == "Enrolled":
        return [0, 1, 0]
    if value == "Graduate":
        return [0, 0, 1]
    raise ValueError(f"Unknown target label: {value!r}")

In [119]:
# One-hot encode the label column in place (each cell becomes a 3-element list).
# Not idempotent: after this runs the column no longer holds the string labels,
# so reload df before executing this cell again.
df["Target"] = df["Target"].apply(encode_target)

In [120]:
df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,"[1, 0, 0]"
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,"[0, 0, 1]"
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,"[1, 0, 0]"
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,"[0, 0, 1]"
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,"[0, 0, 1]"


### *Normalization* ###

In [121]:
# Replace the raw course codes with consecutive integer ids (0, 1, 2, ...),
# assigned in ascending order of the original code.
course_unique = sorted(df["Course"].unique())
course_mapping = {course: i for i,course in enumerate(course_unique)}

# NOTE(review): the ids are still a single numeric column; if course is a
# nominal category (presumably it is), a one-hot encoding may suit a linear model better.
df["Course"] = df["Course"].map(course_mapping)
df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,1,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,"[1, 0, 0]"
1,1,15,1,10,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,"[0, 0, 1]"
2,1,1,5,4,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,"[1, 0, 0]"
3,1,17,2,14,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,"[0, 0, 1]"
4,2,39,1,2,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,"[0, 0, 1]"


In [122]:
# Columns treated as continuous; only these are standardised below.
continuous_columns = ["Previous qualification (grade)", "Admission grade",
                      "Curricular units 1st sem (grade)", "Curricular units 2nd sem (grade)",
                      "Unemployment rate","Inflation rate", "GDP"]

# z-score each column: subtract its mean and divide by its standard deviation.
# NOTE(review): mean/std are computed over the whole frame, before any
# train/test split, so held-out rows influence the scaling — confirm this is acceptable.
for column in continuous_columns:
    df[column] = (df[column] - df[column].mean())/(df[column].std())

df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,1,1,1,-0.80475,1,19,12,...,0,0,0,0,-1.963267,0,-0.287606,0.124372,0.765674,"[1, 0, 0]"
1,1,15,1,10,1,1,2.076585,1,1,3,...,0,6,6,6,0.659487,0,0.876123,-1.105097,0.34716,"[0, 0, 1]"
2,1,1,5,4,1,1,-0.80475,1,37,37,...,0,6,0,0,-1.963267,0,-0.287606,0.124372,0.765674,"[1, 0, 0]"
3,1,17,2,14,1,1,-0.80475,1,38,37,...,0,6,10,5,0.416403,0,-0.813161,-1.466705,-1.375356,"[0, 0, 1]"
4,2,39,1,2,0,1,-2.472892,1,37,38,...,0,6,6,6,0.531548,0,0.876123,-1.105097,0.34716,"[0, 0, 1]"


### *Feature Engineering* ###

In [123]:
# Number of feature columns (every column except the last one, the target).
num_features = len(df.columns) - 1

# Pairwise correlation matrix of the feature columns. DataFrame.corr() builds
# the (num_features x num_features) frame itself, so no pre-allocation is needed.
cov = df.iloc[:, :-1].corr()

In [124]:
# Flatten the correlation matrix into a Series indexed by feature pairs.
correlation_pairs = cov.unstack()

# Keep strongly positive correlations only. The "< 1" bound is what removes the
# diagonal (a feature paired with itself); it would also remove any exactly
# perfectly correlated pair. Strong negative correlations are not selected.
strong_correlations = correlation_pairs[(correlation_pairs >= 0.8) & (correlation_pairs < 1)]

strong_correlations = strong_correlations.reset_index()
strong_correlations.columns = ["Feature 1" ,"Feature 2", "Correlation"]
# Every pair shows up twice, as (A, B) and (B, A); keep only the row whose
# names are in ascending string order.
strong_correlations = strong_correlations[strong_correlations["Feature 1"] < strong_correlations["Feature 2"]]

print(strong_correlations)

                             Feature 1                            Feature 2  \
1                  Father's occupation                  Mother's occupation   
2  Curricular units 1st sem (credited)  Curricular units 2nd sem (credited)   
3  Curricular units 1st sem (enrolled)  Curricular units 2nd sem (enrolled)   
4  Curricular units 1st sem (approved)  Curricular units 2nd sem (approved)   
5     Curricular units 1st sem (grade)     Curricular units 2nd sem (grade)   

   Correlation  
1     0.910472  
2     0.944811  
3     0.942627  
4     0.904002  
5     0.837170  


In [125]:
# From each strongly correlated pair printed above, drop the "Feature 1" member
# and keep the other one. The result is a new frame; df itself is left untouched.
columns_to_drop = ["Father's occupation", "Curricular units 1st sem (credited)",
                   "Curricular units 1st sem (enrolled)", "Curricular units 1st sem (approved)",
                   "Curricular units 1st sem (grade)"]

df_reduced = df.drop(columns = columns_to_drop)
df_reduced.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,1,1,1,-0.80475,1,19,12,...,0,0,0,0,-1.963267,0,-0.287606,0.124372,0.765674,"[1, 0, 0]"
1,1,15,1,10,1,1,2.076585,1,1,3,...,0,6,6,6,0.659487,0,0.876123,-1.105097,0.34716,"[0, 0, 1]"
2,1,1,5,4,1,1,-0.80475,1,37,37,...,0,6,0,0,-1.963267,0,-0.287606,0.124372,0.765674,"[1, 0, 0]"
3,1,17,2,14,1,1,-0.80475,1,38,37,...,0,6,10,5,0.416403,0,-0.813161,-1.466705,-1.375356,"[0, 0, 1]"
4,2,39,1,2,0,1,-2.472892,1,37,38,...,0,6,6,6,0.531548,0,0.876123,-1.105097,0.34716,"[0, 0, 1]"


In [126]:
df_reduced.columns

Index(['Marital status', 'Application mode', 'Application order', 'Course',
       'Daytime/evening attendance\t', 'Previous qualification',
       'Previous qualification (grade)', 'Nacionality',
       'Mother's qualification', 'Father's qualification',
       'Mother's occupation', 'Admission grade', 'Displaced',
       'Educational special needs', 'Debtor', 'Tuition fees up to date',
       'Gender', 'Scholarship holder', 'Age at enrollment', 'International',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd sem (grade)',
       'Curricular units 2nd sem (without evaluations)', 'Unemployment rate',
       'Inflation rate', 'GDP', 'Target'],
      dtype='object')

### *Softmax Regression* ###

In [128]:
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

class SoftmaxRegression:
    """Multinomial (softmax) regression trained with full-batch gradient descent.

    A bias column of ones is prepended to the inputs, so after ``fit`` the
    weight matrix ``theta`` has shape ``(n_features + 1, n_classes)``.
    Targets are expected one-hot encoded, shape ``(n_samples, n_classes)``.
    """

    def __init__(self, epoch: int, lr: float) -> None:
        """Store the hyper-parameters.

        Args:
            epoch: Number of full-batch gradient steps performed by ``fit``.
            lr: Learning rate (step size) applied to each gradient step.
        """
        self.epoch = epoch
        self.lr = lr
        # Per-epoch history, appended to by fit (not cleared between fit calls).
        self.losses = []
        self.metrics = []

    def softmax(self, z: np.ndarray) -> np.ndarray:
        """Return the row-wise softmax of a 2-D score matrix."""
        # Shift each row by its maximum before exponentiating so np.exp cannot
        # overflow; the shift cancels out in the normalisation.
        z_max = np.max(z, axis=1, keepdims=True)
        exp_z = np.exp(z - z_max).astype(float)
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def loss_fn(self, y: np.ndarray, y_hat: np.ndarray) -> float:
        """Mean cross-entropy between one-hot ``y`` and probabilities ``y_hat``."""
        eps = 1e-9  # keeps log() finite when a probability is exactly 0
        return -(y * np.log(y_hat + eps)).sum(axis=1).mean()

    def accuracy(self, y: np.ndarray, y_hat: np.ndarray) -> float:
        """Fraction of rows where the argmax of ``y`` and ``y_hat`` agree."""
        return (y.argmax(axis=1) == y_hat.argmax(axis=1)).mean()

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Train the weights with ``self.epoch`` gradient-descent steps.

        Args:
            X: Feature matrix of shape ``(n_samples, n_features)`` without a
                bias column.
            y: One-hot targets of shape ``(n_samples, n_classes)``.

        Raises:
            ValueError: If the predicted probabilities and ``y`` differ in shape.
        """
        n, d = X.shape
        n_classes = y.shape[1]

        # Prepend the bias column once; theta is re-initialised to zeros on every call.
        X = np.hstack([np.ones((n, 1)), X])
        self.theta = np.zeros((d + 1, n_classes))

        with tqdm(range(self.epoch)) as pb:
            for e in pb:
                pb.set_description(f"Epoch {e+1}")
                # X already carries the bias column, so predict uses it as is.
                s = self.predict(X)

                if s.shape != y.shape:
                    raise ValueError(f"Shape mismatch: s.shape = {s.shape}, y.shape = {y.shape}")
                gradient = (1/n) * np.matmul(X.T, (s - y))
                self.theta -= self.lr * gradient

                # Loss and accuracy are measured on the probabilities computed
                # before this epoch's weight update.
                loss = self.loss_fn(y, s)
                acc = self.accuracy(y, s)

                self.losses.append(loss)
                self.metrics.append(acc)

                pb.set_description(f"Epoch {e+1} - Loss: {loss:.4f}, Acc: {acc:.4f}")

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class probabilities, shape ``(n_samples, n_classes)``.

        ``X`` may be passed with or without the leading bias column: when it
        has exactly one column fewer than ``theta`` has rows, the column of
        ones is prepended automatically.
        """
        if X.shape[1] == self.theta.shape[0] - 1:
            X = np.hstack([np.ones((X.shape[0], 1)), X])
        p = self.softmax(np.dot(X, self.theta))
        return np.asarray(p)

    def predict_classes(self, X: np.ndarray) -> np.ndarray:
        """Predict class labels: the index of the most probable class per row."""
        # predict adds the bias column when it is missing, so no separate
        # handling is needed here.
        return np.argmax(self.predict(X), axis=1)


In [129]:
# Features are every column except the last; labels are the last column
# ('Target' according to df_reduced.columns above).
X = df_reduced.iloc[:, :-1].to_numpy()
y = df_reduced.iloc[:, -1].to_numpy()

In [130]:
# Turn the array of per-row label lists into a regular 2-D array (one row per sample).
y = np.array(y.tolist())

In [131]:
print(y.shape)

(4424, 3)


In [132]:
def split(X, y, train_size = 0.8):
    """Cut X and y into a leading training part and a trailing test part.

    Rows keep their existing order (nothing is shuffled): the first
    ``int(train_size * n_rows)`` rows become the training set and the
    remaining rows the test set.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays.
    """
    features = np.array(X)
    targets = np.array(y)

    # Index of the first test row.
    cut = int(train_size * features.shape[0])

    return features[:cut], features[cut:], targets[:cut], targets[cut:]

In [133]:
X_train,X_test,y_train,y_test = split(X,y,train_size = 0.8)

In [134]:
# Positional arguments are (epoch, lr): 1000 gradient steps with learning rate 0.2.
sm_reg = SoftmaxRegression(1000,0.2)

In [135]:
sm_reg.fit(X_train,y_train)

Epoch 1000 - Loss: 5.2534, Acc: 0.7304: 100%|██████████| 1000/1000 [00:04<00:00, 203.86it/s]


In [136]:
# predict returns class probabilities; accuracy compares their argmax with the one-hot labels.
y_train_hat = sm_reg.predict(X_train)
acc_train = sm_reg.accuracy(y_train, y_train_hat)
print(f'Training Accuracy: {acc_train: .4f}')

Training Accuracy:  0.7191


In [137]:
# Same evaluation on the held-out rows (the trailing 20% of the data, as cut by split).
y_test_hat = sm_reg.predict(X_test)
acc_test = sm_reg.accuracy(y_test, y_test_hat)
print(f'Test Accuracy: {acc_test: .4f}')

Test Accuracy:  0.7209


In [138]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [153]:
df_copy = df_reduced.copy() 

In [154]:
# Features are every column except the last; labels are the last column.
X1 = df_copy.iloc[:, :-1].to_numpy()
y1 = df_copy.iloc[:, -1].to_numpy()

In [159]:
# Stack the per-row label lists into one 2-D array.
y1 = np.vstack(y1)
print(y1.shape)  # Must have at least 2 dimensions, e.g. (N, C)
print(y1[:5])  # Print the first 5 rows as a sanity check

(4424, 3)
[[1 0 0]
 [0 0 1]
 [1 0 0]
 [0 0 1]
 [0 0 1]]


In [160]:
# Collapse the one-hot rows to integer class ids; with encode_target's layout
# this gives 0 = Dropout, 1 = Enrolled, 2 = Graduate.
y1 = np.argmax(y1, axis=1) 

In [161]:
# 80/20 split with a fixed seed. train_test_split shuffles rows by default, whereas
# the manual split() keeps file order, so the two models are scored on different test rows.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, train_size = 0.8, random_state = 42)

In [162]:
# NOTE(review): fitting this estimator prints "TOTAL NO. of ITERATIONS REACHED LIMIT"
# (see the fit cell's output), i.e. lbfgs stops at max_iter = 200 without converging.
# Most feature columns are not scaled; consider scaling them or raising max_iter.
sm_reg1 = LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs', max_iter = 200)

In [163]:
sm_reg1.fit(X_train1, y_train1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [164]:
# Accuracy of the scikit-learn model on its held-out split.
y_test_hat1 = sm_reg1.predict(X_test1)
print(f'Test Accuracy: {accuracy_score(y_test1, y_test_hat1):.4f}')

Test Accuracy: 0.7480


In [165]:
# Accuracy of the scikit-learn model on its training split, for comparison
# with the test score (the label previously said "Test Accuracy" by mistake).
y_train_hat1 = sm_reg1.predict(X_train1)
print(f'Training Accuracy: {accuracy_score(y_train1, y_train_hat1):.4f}')

Test Accuracy: 0.7711
