In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.compose import ColumnTransformer

In [2]:
# Read the student records from the CSV file (relative path, resolved against
# the current working directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="dataset.csv")
df.head(n=5)

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nationality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,8,5,2,1,1,1,13,10,6,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,6,1,11,1,1,1,1,3,4,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,5,1,1,1,22,27,10,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,8,2,15,1,1,1,23,27,6,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,12,1,3,0,1,1,22,28,10,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [3]:
# Columns that are standardised (StandardScaler) before training.
# The order is significant: it fixes the column order of the feature matrix.
numerical_features = [
    "Application order",
    "Age at enrollment",
    # first semester
    "Curricular units 1st sem (credited)",
    "Curricular units 1st sem (enrolled)",
    "Curricular units 1st sem (evaluations)",
    "Curricular units 1st sem (approved)",
    "Curricular units 1st sem (grade)",
    "Curricular units 1st sem (without evaluations)",
    # second semester
    "Curricular units 2nd sem (credited)",
    "Curricular units 2nd sem (enrolled)",
    "Curricular units 2nd sem (evaluations)",
    "Curricular units 2nd sem (approved)",
    "Curricular units 2nd sem (without evaluations)",
    # macro-economic indicators
    "Unemployment rate",
    "Inflation rate",
    "GDP",
    "Curricular units 2nd sem (grade)",
]

# Columns holding category codes; they are one-hot encoded (OneHotEncoder).
categorical_features = [
    "Marital status",
    "Application mode",
    "Course",
    "Daytime/evening attendance",
    "Previous qualification",
    "Nationality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Displaced",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "International",
]

# Name of the label column.
target = "Target"


I. Math introduction

Because our target has 3 classes, plain binary logistic regression is not enough, so we implement multinomial (softmax) logistic regression. Our weight matrix $W$ has three rows, one per class; each row holds the weights used to score how strongly a sample belongs to that class compared with the others. (In the code below the matrix is stored transposed, with one column per class.)


First we compute a linear combination of the input variables with their weights:
$$ \text{scores} = X W^{T} + b^{T} $$

The scores matrix contains values from $-\infty$ to $+\infty$, so we have to map them into the range from 0 to 1, because in logistic regression we are predicting probabilities. The function that does this is called the softmax function:

$$ \hat{p}_k = \frac{\exp(\text{scores}_k)}{\sum_{j=1}^{K}\exp(\text{scores}_j)} $$


Because we use gradient descent to improve our model, we will be minimizing a cost function. For our classification problem we use the cross-entropy cost function:

$$ J(W,b) = -\frac{1}{m} \sum_{i=1}^{m} \sum_{k=1}^{K} y_k^{(i)} \log\left(\hat{p}_k^{(i)}\right) $$

To update $W$ and $b$ we first compute the gradients of the cost function with respect to them:


$$ \nabla_{w_k} J(W,b) = \frac{1}{m}\sum_{i=1}^{m} x^{(i)} \left(\hat{p}_k^{(i)} - y_k^{(i)}\right) $$

$$ \nabla_{b_k} J(W,b) = \frac{1}{m}\sum_{i=1}^{m} \left(\hat{p}_k^{(i)} - y_k^{(i)}\right) $$

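So after one iteration we can update $W$ and $b$, where $dW$ and $db$ denote the gradients of $J$ with respect to $W$ and $b$, and $lr$ is the learning rate: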

$$W_x = W - lr * dW$$

$$b_x = b - lr * db$$


We repeat this process as long as current_iter < max_iteration.

In every iteration we also split our training set into batches. Batches are subsets of the current training set, each holding $n_{\text{batch}}$ samples (the `batch_size` parameter), and after every update the subset on which our model makes its calculations changes:
$$
X_{\text{new}} = X[\text{prev} : \text{prev} + n_{\text{batch}}]
$$


$$ \text{prev} = \text{prev} + n_{\text{batch}} $$

In [4]:

class LogisticRegression:
    """Multinomial (softmax) logistic regression trained with mini-batch gradient descent.

    Parameters
    ----------
    lr : float
        Learning rate used for every weight/bias update.
    n_iters : int
        Number of full passes over the training data performed by ``fit``.
    batch_size : int
        Number of samples per mini-batch.
    random_state : int or None
        Seed for the per-pass shuffling. ``None`` (default) keeps using NumPy's
        global random state, so ``np.random.seed`` still controls the shuffling.

    Attributes
    ----------
    weights : ndarray of shape (n_features, n_classes), set by ``fit``
    bias : ndarray of shape (1, n_classes), set by ``fit``
    errors : list of float
        Cross-entropy loss on the full training set after each pass of the
        most recent ``fit`` call.
    """

    def __init__(self, lr=0.001, n_iters=10000, batch_size=256, random_state=None):
        self.lr = lr
        self.batch_idx = 0
        self.n_iters = n_iters
        self.batch_size = batch_size
        self.random_state = random_state
        self.weights = None
        self.bias = None
        self.errors = []
        self.n_classes = None

    def batch_prepare(self, data_set_x, data_set_y):
        """Return the next sequential mini-batch and advance the internal cursor.

        Not used by ``fit`` (which shuffles and slices on its own); kept for
        callers that want to walk a data set batch by batch.

        The cursor wraps to 0 as soon as the returned batch reaches the end of
        the data, so a data set whose length is an exact multiple of
        ``batch_size`` never yields an empty batch.
        """
        start = self.batch_idx
        stop = start + self.batch_size
        # Slicing clamps at the end of the data, so the last batch may be shorter.
        batch_x = data_set_x[start:stop]
        batch_y = data_set_y[start:stop]
        # ">=" (not ">"): when stop lands exactly on the end there is nothing
        # left, and the next call must start over instead of returning nothing.
        self.batch_idx = 0 if stop >= data_set_x.shape[0] else stop
        return batch_x, batch_y

    def error_function(self, y, predicted, m_samples):
        """Mean cross-entropy between one-hot ``y`` and predicted probabilities."""
        # The small constant keeps log() finite when a probability is exactly 0.
        return -np.sum(y * np.log(predicted + 1e-15)) / m_samples

    def softmax(self, z):
        """Row-wise softmax of a 2-D score matrix.

        The row maximum is subtracted before exponentiating. This does not
        change the result mathematically but prevents ``exp`` from overflowing
        to ``inf`` (and the division from producing NaN) for large scores.
        """
        shifted = z - np.max(z, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    def predict(self, X):
        """Return class probabilities, shape (n_samples, n_classes)."""
        z = np.dot(X, self.weights) + self.bias
        return self.softmax(z)

    def predict_class(self, X_test):
        """Return the index of the most probable class as a column vector (n_samples, 1)."""
        probabilities = self.predict(X_test)
        return np.argmax(probabilities, axis=1).reshape(-1, 1)

    def fit(self, X, y):
        """Train the model.

        Parameters
        ----------
        X : array-like of shape (m_samples, n_features)
        y : array-like of shape (m_samples, n_classes)
            One-hot encoded targets.

        Every call starts from zero weights and an empty loss history, so
        refitting does not mix the loss curves of two runs.
        """
        # Accept anything array-like; positional fancy indexing below needs ndarrays.
        X = np.asarray(X)
        y = np.asarray(y)

        m_samples, n_features = X.shape
        self.n_classes = y.shape[1]
        self.weights = np.zeros((n_features, self.n_classes))
        self.bias = np.zeros((1, self.n_classes))
        self.errors = []

        # With no seed, fall back to the global NumPy generator (original behaviour).
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)

        for i in range(self.n_iters):
            # New sample order on every pass so batches differ between passes.
            indices = rng.permutation(m_samples)
            X_shuffled = X[indices]
            y_shuffled = y[indices]

            for start_idx in range(0, m_samples, self.batch_size):
                end_idx = start_idx + self.batch_size
                X_batch = X_shuffled[start_idx:end_idx]
                y_batch = y_shuffled[start_idx:end_idx]

                # Gradient of the cross-entropy loss for softmax outputs.
                residual = self.predict(X_batch) - y_batch
                batch_len = X_batch.shape[0]
                dw = np.dot(X_batch.T, residual) / batch_len
                db = np.sum(residual, axis=0, keepdims=True) / batch_len

                self.weights -= self.lr * dw
                self.bias -= self.lr * db

            # Track the loss on the whole training set once per pass.
            full_predict = self.predict(X)
            loss = self.error_function(y, full_predict, m_samples)
            self.errors.append(loss)

            if i % 100 == 0:
                print(f"Iteration {i}, Loss: {loss:.4f}")

    def score(self, predicted, y, to_show=False):
        """Print and return the accuracy of ``predicted`` against one-hot ``y``.

        Parameters
        ----------
        predicted : array-like of class indices, e.g. the output of ``predict_class``
        y : array-like of shape (n_samples, n_classes), one-hot encoded targets
        to_show : bool
            When true, also plot the training loss recorded by ``fit``.

        Returns
        -------
        float
            Fraction of samples whose predicted class equals the true class.
        """
        y_true = np.argmax(y, axis=1)
        correct = (np.asarray(predicted).ravel() == y_true)
        accuracy = np.mean(correct)
        print("Correct predictions:", correct)
        print("Accuracy:", accuracy)
        if to_show:
            plt.plot(np.arange(len(self.errors)), self.errors)
            plt.xlabel("Training pass")
            plt.ylabel("Cross-entropy loss")
            plt.show()
        return accuracy


In [5]:
# The hand-written model expects the target one-hot encoded (one column per
# class); scikit-learn's model further down is given the raw labels instead.
X = df.drop(columns=[target])
y = df[target]

# One-hot column order is the sorted (alphabetical) class order:
# Dropout  - 0
# Enrolled - 1
# Graduate - 2
class_names = sorted(y.dropna().unique())

# 40% train, then the remaining 60% is halved into validation and test.
X_train,X_temp,y_train,y_temp = train_test_split(X,y,train_size=0.4,random_state=42)
X_val,X_test,y_val,y_test = train_test_split(X_temp,y_temp,random_state=42,train_size=0.5)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore',sparse_output=False), categorical_features)
    ]
)

# Fit the preprocessing on the training split only, then reuse it unchanged.
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)
X_test_processed = preprocessor.transform(X_test)


def encode_target(labels):
    """One-hot encode labels with exactly one column per entry of class_names.

    Reindexing to the fixed class list keeps the column layout identical for
    every split, even if a class happens to be absent from one of them.
    """
    return (
        pd.get_dummies(labels)
        .reindex(columns=class_names, fill_value=0)
        .astype(int)
        .to_numpy()
    )


y_train_processed = encode_target(y_train)
y_test_processed = encode_target(y_test)
y_val_processed = encode_target(y_val)


#main part
# Seed NumPy's global generator so the mini-batch shuffling is repeatable.
np.random.seed(42)
clf = LogisticRegression()
clf.fit(X_train_processed,y_train_processed)


Epoch 0, Loss: 1.0897
Epoch 100, Loss: 0.8031
Epoch 200, Loss: 0.7405
Epoch 300, Loss: 0.7058
Epoch 400, Loss: 0.6824
Epoch 500, Loss: 0.6652
Epoch 600, Loss: 0.6519
Epoch 700, Loss: 0.6410
Epoch 800, Loss: 0.6320
Epoch 900, Loss: 0.6243
Epoch 1000, Loss: 0.6176
Epoch 1100, Loss: 0.6117
Epoch 1200, Loss: 0.6065
Epoch 1300, Loss: 0.6018
Epoch 1400, Loss: 0.5976
Epoch 1500, Loss: 0.5937
Epoch 1600, Loss: 0.5902
Epoch 1700, Loss: 0.5869
Epoch 1800, Loss: 0.5839
Epoch 1900, Loss: 0.5811
Epoch 2000, Loss: 0.5784
Epoch 2100, Loss: 0.5760
Epoch 2200, Loss: 0.5737
Epoch 2300, Loss: 0.5715
Epoch 2400, Loss: 0.5694
Epoch 2500, Loss: 0.5674
Epoch 2600, Loss: 0.5655
Epoch 2700, Loss: 0.5637
Epoch 2800, Loss: 0.5620
Epoch 2900, Loss: 0.5604
Epoch 3000, Loss: 0.5588
Epoch 3100, Loss: 0.5573
Epoch 3200, Loss: 0.5559
Epoch 3300, Loss: 0.5545
Epoch 3400, Loss: 0.5532
Epoch 3500, Loss: 0.5519
Epoch 3600, Loss: 0.5506
Epoch 3700, Loss: 0.5494
Epoch 3800, Loss: 0.5483
Epoch 3900, Loss: 0.5471
Epoch 4000, 

In [6]:
# Report accuracy of the hand-written model on each split.
# Order of the printed results: test, validation, train.
evaluation_splits = (
    (X_test_processed, y_test_processed),    # TEST set
    (X_val_processed, y_val_processed),      # VALIDATION set
    (X_train_processed, y_train_processed),  # TRAIN set
)
for split_features, split_targets in evaluation_splits:
    predicted = clf.predict_class(split_features)
    clf.score(predicted, split_targets, False)

Correct predictions: [ True  True  True ... False  True  True]
Accuracy: 0.7650602409638554
Correct predictions: [False False  True ...  True  True False]
Accuracy: 0.7550866616428034
Correct predictions: [ True  True  True ...  True  True  True]
Accuracy: 0.8049745618993782


In [7]:
# Reference model: scikit-learn's logistic regression on the same features.
# The multi_class argument is not passed: it is deprecated in recent
# scikit-learn releases, and the default already fits a multinomial (softmax)
# model when the target has more than two classes.
lm = linear_model.LogisticRegression(max_iter=100000)
lm.fit(X_train_processed, y_train)

# Accuracy on test, validation and train, in the same order as the cell above.
s1 = lm.score(X_test_processed, y_test)
s2 = lm.score(X_val_processed, y_val)
s3 = lm.score(X_train_processed, y_train)

s1, s2, s3



(0.7695783132530121, 0.7596081386586285, 0.8224985867721877)

III. Pytorch implementation