I. Setup

Having implemented multiclass logistic regression from scratch and tested it, we can now take the next steps to increase its efficiency and effectiveness and to upgrade this base model.

Here is the abstract Logistic regression class with an empty fit function. We need to implement the fit function ourselves, because in the next steps we will add upgrades there, and we need separate classes so that we can compare them later.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures


In [None]:
# Load the raw dataset; every column except "Target" is used as a feature below.
df = pd.read_csv("dataset.csv")
df.head()

# Columns that are standardised with StandardScaler in preprocessor_full_set.
numerical_features = [
    "Application order","Age at enrollment", "Curricular units 1st sem (credited)", "Curricular units 1st sem (enrolled)",
    "Curricular units 1st sem (evaluations)","Curricular units 1st sem (approved)","Curricular units 1st sem (grade)",
    "Curricular units 1st sem (without evaluations)","Curricular units 2nd sem (credited)","Curricular units 2nd sem (enrolled)",
    "Curricular units 2nd sem (evaluations)","Curricular units 2nd sem (approved)",
    "Curricular units 2nd sem (without evaluations)","Unemployment rate","Inflation rate","GDP","Curricular units 2nd sem (grade)"
]
# Columns that are one-hot encoded in preprocessor_full_set.
categorical_features = [
    "Marital status","Application mode","Course","Daytime/evening attendance","Previous qualification","Nationality",
    "Mother's qualification","Father's qualification","Mother's occupation","Father's occupation","Displaced",
    "Educational special needs","Debtor","Tuition fees up to date","Gender","Scholarship holder","International"
]

# Name of the label column.
target = "Target"

# Preprocessor over the full feature set: scale numeric columns, one-hot encode
# categorical ones. handle_unknown='ignore' lets transform() accept categories
# that were not present when the encoder was fitted.
preprocessor_full_set = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore',sparse_output=False), categorical_features)
    ]
)
def transform_in_pipeline(preproc,X_ttrain,X_ttest,y_ttrain,y_ttest):
    """Preprocess a train/test split and one-hot encode both target sets.

    The preprocessor is fitted on the training features only and then applied
    to the test features. Each target set is one-hot encoded independently
    with ``pd.get_dummies`` and returned as an integer NumPy array.

    Returns:
        Tuple ``(X_train_processed, X_test_processed, y_train_onehot, y_test_onehot)``.
    """
    def _one_hot(labels):
        # One indicator column per distinct label value, as 0/1 integers.
        encoded = pd.get_dummies(labels, columns=["Target"])
        return encoded.astype(int).to_numpy()

    features_train = preproc.fit_transform(X_ttrain)
    features_test = preproc.transform(X_ttest)
    targets_train = _one_hot(y_ttrain)
    targets_test = _one_hot(y_ttest)
    return features_train, features_test, targets_train, targets_test


In [None]:
# The models expect y as one-hot rows ([0 0 ... 1]); y stays as raw labels here
# and is one-hot encoded later by transform_in_pipeline.
X = df.drop(["Target"], axis=1)
y = df["Target"]


# 70% of the rows go to the training split; random_state fixes the shuffle.
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7,random_state=42)


In [None]:
from abc import ABC, abstractmethod
class LogisticRegression(ABC):
    """Abstract base for multiclass (softmax) logistic regression.

    Provides prediction, scoring and loss helpers; subclasses supply the
    training procedure by implementing :meth:`fit`, which is expected to set
    ``weights``, ``bias`` and ``n_classes`` and to fill ``errors``.
    """

    def __init__(self, lr=0.001, n_iters=5000, batch_size=64):
        self.lr = lr  # gradient-descent step size
        self.batch_idx = 0  # not read by any method of this class
        self.n_iters = n_iters  # number of passes over the training data
        self.batch_size = batch_size  # rows per mini-batch
        self.weights = None  # set by fit()
        self.bias = None  # set by fit()
        self.errors = []  # loss history appended to by fit()
        self.n_classes = None  # set by fit()
        self.probabilities = None  # last output of predict_class()

    def error_function(self, y, predicted, m_samples):
        """Return the mean cross-entropy between one-hot ``y`` and ``predicted``.

        A small constant is added inside the logarithm so that a predicted
        probability of exactly zero does not produce ``-inf``.
        """
        return -np.sum(y * np.log(predicted + 1e-15)) / m_samples

    def softmax(self, z):
        """Return the row-wise softmax of the 2-D score matrix ``z``.

        The row maximum is subtracted before exponentiating. This leaves the
        result mathematically unchanged but keeps ``np.exp`` from overflowing
        to ``inf`` (and the ratio from becoming ``nan``) for large scores.
        """
        shifted = z - np.max(z, axis=1, keepdims=True)
        exp_z = np.exp(shifted)
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def predict(self, X):
        """Return class probabilities for every row of ``X``."""
        z = np.dot(X, self.weights) + self.bias
        return self.softmax(z)

    def predict_class(self, X_test):
        """Return the most probable class index per row as a column vector.

        The full probability matrix is also stored in ``self.probabilities``.
        """
        probabilities = self.predict(X_test)
        self.probabilities = probabilities
        return np.argmax(probabilities, axis=1).reshape(-1, 1)

    def score(self, predicted, y):
        """Return the accuracy of predicted class indices against one-hot ``y``."""
        y_true = np.argmax(y, axis=1)
        correct = (predicted.flatten() == y_true)
        accuracy = np.mean(correct)
        return accuracy

    def raw_error_data(self):
        """Return ``(indices, losses)`` for the loss history recorded by fit()."""
        return np.arange(len(self.errors)), self.errors

    @abstractmethod
    def fit(self, X, y):
        """Train the model on features ``X`` and one-hot targets ``y``."""
        ...

First we start with base multiclass logistic regression without any specific upgrades

In [None]:
class BaseLogisticRegression(LogisticRegression):
    """Softmax regression trained with plain mini-batch gradient descent."""

    def fit(self,X,y):
        """Train on features ``X`` and one-hot targets ``y``.

        Weights and bias start at zero. Every iteration reshuffles the rows,
        walks over them in mini-batches, and afterwards records the loss on
        the whole training set in ``self.errors`` (printed every 100
        iterations).
        """
        sample_count, feature_count = X.shape
        self.n_classes = y.shape[1]
        self.weights = np.zeros((feature_count, self.n_classes))
        self.bias = np.zeros((1, self.n_classes))
        self.errors.clear()

        for epoch in range(self.n_iters):
            # Fresh row order for each pass over the data.
            order = np.random.permutation(sample_count)
            X_epoch, y_epoch = X[order], y[order]

            for batch_start in range(0, sample_count, self.batch_size):
                batch_stop = batch_start + self.batch_size
                X_part = X_epoch[batch_start:batch_stop]
                y_part = y_epoch[batch_start:batch_stop]

                # Gradient of the cross-entropy w.r.t. the scores is (p - y).
                residual = self.predict(X_part) - y_part
                scale = 1 / X_part.shape[0]
                grad_w = scale * np.dot(X_part.T, residual)
                grad_b = scale * np.sum(residual, axis=0, keepdims=True)

                self.weights -= self.lr * grad_w
                self.bias -= self.lr * grad_b

            epoch_loss = self.error_function(y, self.predict(X), sample_count)
            self.errors.append(epoch_loss)

            if epoch % 100 == 0:
                print(f"Iteration {epoch}, Loss: {epoch_loss:.4f}")




Now we can train and predict results on these subsets, which were made with random_state=$42$

In [None]:
# Fit the full-feature preprocessor on the random_state=42 training split and
# one-hot encode both target sets.
X_train_proc,X_test_proc,y_train_proc,y_test_proc = transform_in_pipeline(preprocessor_full_set,X_train, X_test, y_train, y_test)

# Baseline model; these values equal the constructor defaults.
logistic_regression1 = BaseLogisticRegression(n_iters=5000, lr=0.001, batch_size=64)

In [None]:
# Train on the processed training set, then predict class indices for the test set.
logistic_regression1.fit(X_train_proc, y_train_proc)
predicted = logistic_regression1.predict_class(X_test_proc)

# Accuracy on the held-out test split.
logistic_regression1.score(predicted, y_test_proc)

But now we can change the random_state variable to show that the subsets are generated randomly, and that depending on this randomness we can get a better or a worse score.

In [None]:
# Same data and split ratio as before, but a different shuffle seed (11 instead of 42).
X_train2,X_test2,y_train2,y_test2 = train_test_split(X,y,train_size=0.7,random_state=11)
# Re-fits preprocessor_full_set on the new training split.
X_train_proc2,X_test_proc2,y_train_proc2,y_test_proc2 = transform_in_pipeline(preprocessor_full_set,X_train2, X_test2, y_train2, y_test2)

# Second baseline model with the same hyperparameters, for a like-for-like comparison.
logistic_regression2 = BaseLogisticRegression(n_iters=5000, lr=0.001, batch_size=64)


In [None]:
# Train and evaluate on the random_state=11 split.
logistic_regression2.fit(X_train_proc2, y_train_proc2)
predicted2 = logistic_regression2.predict_class(X_test_proc2)
# Accuracy on the second test split.
logistic_regression2.score(predicted2, y_test_proc2)


\begin{align*}
\text{accuracy}(\text{random\_state} = 42) &= 0.7665 \\
\text{accuracy}(\text{random\_state} = 11) &= 0.7710
\end{align*}

And we can see that the result really does depend on the random split, so we have to find a way to make the evaluation more independent of random_state.


II. Cross validation (StratifiedKFold)

In [None]:
from sklearn.model_selection import StratifiedKFold

def make_cross_validation_predict(model,preproc, X_set, y_set, k=3, random_state=None):
    """Evaluate ``model`` with stratified k-fold cross-validation.

    For every fold the preprocessor is fitted on the training part only, the
    model is re-fitted from scratch and its accuracy on the held-out part is
    recorded.

    Args:
        model: object exposing ``fit``, ``predict_class`` and ``score``.
        preproc: transformer exposing ``fit_transform`` and ``transform``.
        X_set: feature DataFrame (rows are selected with ``.iloc``).
        y_set: label Series aligned with ``X_set``.
        k: number of folds.
        random_state: seed for the fold shuffle. The default ``None`` keeps
            the folds different on every call; pass an int for reproducible
            folds (e.g. when comparing hyperparameters).

    Returns:
        Tuple ``(scores, mean_score)`` with one accuracy per fold.
    """
    scores = []
    skf = StratifiedKFold(k, shuffle=True, random_state=random_state)

    encoder = LabelEncoder()
    y_encoded = encoder.fit_transform(y_set)
    # Every class present in the whole label set, in sorted order.
    classes = encoder.classes_

    def _one_hot(labels):
        # Pin the columns to the full class list so that train and test
        # encodings always have the same width and column order, even if a
        # class happens to be missing from one part of a fold.
        categorical = pd.Categorical(labels, categories=classes)
        return pd.get_dummies(categorical).astype(int).to_numpy()

    for train_index, test_index in skf.split(X_set, y_encoded):
        X_train_raw = X_set.iloc[train_index]
        X_test_raw = X_set.iloc[test_index]

        y_train_raw = y_set.iloc[train_index]
        y_test_raw = y_set.iloc[test_index]

        # Fit the preprocessor on the training fold only to avoid leakage.
        X_train = preproc.fit_transform(X_train_raw)
        X_test = preproc.transform(X_test_raw)

        y_train = _one_hot(y_train_raw)
        y_test = _one_hot(y_test_raw)

        model.fit(X_train, y_train)
        predicted = model.predict_class(X_test)
        scores.append(model.score(predicted, y_test))

    return scores, np.mean(scores)


In [None]:
# Cross-validate the baseline model on the training split (default k=3 folds).
logistic_regression3 = BaseLogisticRegression(n_iters=5000, lr=0.001, batch_size=64)

make_cross_validation_predict(logistic_regression3, preprocessor_full_set, X_train, y_train)

And now we can see what we saw before: on every run we get a different prediction. For this reason cross-validation is important, so in the next upgrade steps we will be using this method.


II. Over/under-fitting and error plot

First we have to check whether our base logistic regression model suffers from overfitting or underfitting.

In [None]:
# Cross-validated accuracy of the baseline model, used as the "CV" reference score.
base_CV_logistic_regression = BaseLogisticRegression(n_iters=5000, lr=0.001, batch_size=64)
make_cross_validation_predict(base_CV_logistic_regression, preprocessor_full_set, X_train, y_train)

In [None]:
# Separate baseline instance that is trained once on the whole training split below.
base_logistic_regression = BaseLogisticRegression(n_iters=5000, lr=0.001, batch_size=64)

In [None]:
# Train base_logistic_regression on the processed random_state=42 training split.
base_logistic_regression.fit(X_train_proc,y_train_proc)

In [None]:
# Accuracy on the data the model was trained on.
predicted = base_logistic_regression.predict_class(X_train_proc)
print(base_logistic_regression.score(predicted, y_train_proc))
# Loss history recorded by fit() on the training set (one value per iteration).
x1,y1 = base_logistic_regression.raw_error_data()


In [None]:
# Accuracy on the held-out test split.
predicted = base_logistic_regression.predict_class(X_test_proc)
print(base_logistic_regression.score(predicted, y_test_proc))
# NOTE(review): raw_error_data() returns the loss history that fit() recorded on
# the training set, so x2, y2 are the same values as x1, y1 and not a test-set curve.
x2,y2 = base_logistic_regression.raw_error_data()

In [None]:
# Plot the recorded loss per iteration.
# NOTE(review): both series come from raw_error_data() of the same fitted model,
# i.e. the training-loss history, so the two lines coincide by construction.
plt.plot(x1, y1, label="Train set", color='blue', linestyle='-')
plt.plot(x2, y2, label="Test set", color='orange', linestyle='--')

plt.legend()
plt.show()


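Based on this plot, the two curves lie close to each other, so the model does not appear to have a problem with under- or over-fitting. (Note that both curves are drawn from the loss history recorded during training, so the accuracy comparison below is the more reliable evidence.)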

And also we have scores:
\begin{align*}
\text{accuracy}(\text{CV}) &= 0.7719 \\
\text{accuracy}(\text{on train}) &= 0.8016 \\
\text{accuracy}(\text{on test}) &= 0.7665
\end{align*}
We can see that $Test~\approx~CV~\approx~Train$, which means that we don't have a problem with underfitting or overfitting.

Although we don't have a problem with under- or over-fitting, we can still try to improve the model with PolynomialFeatures.


In [None]:
# Numeric branch: expand to degree-2 polynomial terms (without the constant
# column), then standardise the expanded features.
numerical_pipeline_poly = Pipeline([
    ('poly',PolynomialFeatures(degree=2,include_bias=False)),
    ('scaler',StandardScaler())
])

# Same column split as preprocessor_full_set, but with the polynomial numeric branch.
polynomial_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline_poly, numerical_features),
        ('cat',OneHotEncoder(handle_unknown='ignore',sparse_output=False),categorical_features)
    ]
)


In [None]:
# Cross-validated accuracy of the baseline model with polynomial numeric features.
polynomial_CV_logistic_regression = BaseLogisticRegression(n_iters=5000, lr=0.001, batch_size=64)
make_cross_validation_predict(polynomial_CV_logistic_regression, polynomial_preprocessor, X_train, y_train)

In [None]:
# Process the random_state=42 split with the polynomial preprocessor.
X_train_proc_poly,X_test_proc_poly,y_train_proc_poly,y_test_proc_poly = transform_in_pipeline(polynomial_preprocessor,X_train, X_test, y_train, y_test)

In [None]:
# Train the baseline model once on the polynomial features.
polynomial_base_logistic_regression = BaseLogisticRegression(n_iters=5000, lr=0.001, batch_size=64)
polynomial_base_logistic_regression.fit(X_train_proc_poly,y_train_proc_poly)

In [None]:
# Accuracy on the training split (polynomial features).
predicted = polynomial_base_logistic_regression.predict_class(X_train_proc_poly)
print(polynomial_base_logistic_regression.score(predicted, y_train_proc_poly))

In [None]:
# Accuracy on the held-out test split (polynomial features).
predicted = polynomial_base_logistic_regression.predict_class(X_test_proc_poly)
print(polynomial_base_logistic_regression.score(predicted, y_test_proc_poly))

And we can see that we get slightly better scores using this modified preprocessor.

Now we can ask a question: do we need all of the columns to train our model? Maybe we can delete some of them without making our score worse.

In [None]:
# Reduced numeric feature list: numerical_features without
# "Unemployment rate", "Inflation rate" and "GDP".
numerical_features_modified = [
    "Application order","Age at enrollment", "Curricular units 1st sem (credited)", "Curricular units 1st sem (enrolled)",
    "Curricular units 1st sem (evaluations)","Curricular units 1st sem (approved)","Curricular units 1st sem (grade)",
    "Curricular units 1st sem (without evaluations)","Curricular units 2nd sem (credited)","Curricular units 2nd sem (enrolled)",
    "Curricular units 2nd sem (evaluations)","Curricular units 2nd sem (approved)",
    "Curricular units 2nd sem (without evaluations)","Curricular units 2nd sem (grade)"
]
# Reduced categorical feature list: categorical_features without
# "Nationality" and "International".
categorical_features_modified = [
    "Marital status","Application mode","Course","Daytime/evening attendance","Previous qualification",
    "Mother's qualification","Father's qualification","Mother's occupation","Father's occupation","Displaced",
    "Educational special needs","Debtor","Tuition fees up to date","Gender","Scholarship holder"
]


# Second copy of the dataset, read from the same file as df.
df_modified = pd.read_csv("dataset.csv")
# The preprocessing pipeline for the reduced feature set is built in a later cell.

In [None]:
# Rebinds the notebook-level X and y: from here on X holds only the reduced
# feature columns.
y = df_modified["Target"]
X = df_modified[numerical_features_modified+categorical_features_modified]

In [None]:
# Numeric branch for the reduced feature set: degree-2 polynomial expansion
# (no constant column) followed by standardisation.
numerical_pipeline_modified_features = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler())
])

# Preprocessor restricted to the reduced numeric and categorical column lists.
modified_features_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline_modified_features, numerical_features_modified),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features_modified)
    ]
)
# Rebinds X_train/X_test/y_train/y_test to the reduced feature set (same
# split ratio and seed as the earlier split).
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7,random_state=42)

In [None]:
# Cross-validate the baseline model on the reduced feature set.
modified_features_CV_logistic_regression = BaseLogisticRegression(n_iters=5000, lr=0.001, batch_size=64)
print(make_cross_validation_predict(modified_features_CV_logistic_regression, modified_features_preprocessor, X_train, y_train))
# Weights left in the model by the last fit performed during cross-validation.
print(modified_features_CV_logistic_regression.weights)

After a few tries we obtained a new set of features, which is smaller than the original feature set, and the score after CV is now slightly better.

Moving to the next steps, we are going with:

- smaller dataset
- `modified_features_preprocessor` preprocessor with `PolynomialFeatures`
- `X_train`, `X_test`, `y_train`, `y_test`


III. L1, L2 regularization
importance of the input

Regularization methods are applied to a regression model because we want more control over its weights, since the weights determine how important each feature is in the trained model.
So we can use regularization techniques to act on these weights: we want to penalize the model for weights that are too high.

- `L1 (lasso)`
$\lambda \sum_{i,j} |W_{ij}|$

This method can drive some weights exactly to zero, which eliminates unnecessary features
- `L2 (ridge)`
$\lambda \sum_{i,j} W_{ij}^2$

This method penalizes large weights more strongly because of the square

In [None]:
#X_train_proc,X_test_proc,y_train_proc,y_test_proc = transform_in_pipeline(polynomial_preprocessor,X_train, X_test, y_train, y_test)

In [None]:
class BaseLogisticRegressionRegL1(LogisticRegression):
    """Softmax regression with an L1 (lasso) penalty on the weights."""

    def __init__(self,n_iters=5000, lr=0.001, batch_size=64,l=0.001):
        super().__init__(lr=lr, n_iters=n_iters, batch_size=batch_size)
        # Regularization strength (lambda).
        self.l = l

    def fit(self,X,y):
        """Train with mini-batch gradient descent plus an L1 weight penalty.

        The penalty contributes ``l * sign(W)`` to the weight gradient; the
        bias is not regularized. The recorded loss is the plain cross-entropy
        on the full training set, without the penalty term.
        """
        n_rows, n_cols = X.shape
        self.n_classes = y.shape[1]
        self.weights = np.zeros((n_cols, self.n_classes))
        self.bias = np.zeros((1, self.n_classes))
        self.errors.clear()

        for epoch in range(self.n_iters):
            # Reshuffle the rows on every pass.
            order = np.random.permutation(n_rows)
            X_perm, y_perm = X[order], y[order]

            for lo in range(0, n_rows, self.batch_size):
                hi = lo + self.batch_size
                xb, yb = X_perm[lo:hi], y_perm[lo:hi]

                residual = self.predict(xb) - yb
                inv_n = 1 / xb.shape[0]

                grad_w = inv_n * np.dot(xb.T, residual)
                # Subgradient of l * sum(|W|); np.sign gives 0 where W == 0.
                grad_w += self.l * np.sign(self.weights)
                grad_b = inv_n * np.sum(residual, axis=0, keepdims=True)

                self.weights -= self.lr * grad_w
                self.bias -= self.lr * grad_b

            epoch_loss = self.error_function(y, self.predict(X), n_rows)
            self.errors.append(epoch_loss)

            if epoch % 100 == 0:
                print(f"Iteration {epoch}, Loss: {epoch_loss:.4f}")



In [None]:
class BaseLogisticRegressionRegL2(LogisticRegression):
    """Softmax regression with an L2 (ridge) penalty on the weights."""

    def __init__(self,n_iters=5000, lr=0.001, batch_size=64,l=0.001):
        super().__init__(lr=lr, n_iters=n_iters, batch_size=batch_size)
        # Regularization strength (lambda).
        self.l = l

    def fit(self,X,y):
        """Train with mini-batch gradient descent plus an L2 weight penalty.

        The penalty contributes ``2 * l * W`` to the weight gradient; the bias
        is not regularized. The recorded loss is the plain cross-entropy on
        the full training set, without the penalty term.
        """
        n_rows, n_cols = X.shape
        self.n_classes = y.shape[1]
        self.weights = np.zeros((n_cols, self.n_classes))
        self.bias = np.zeros((1, self.n_classes))
        self.errors.clear()

        for epoch in range(self.n_iters):
            # Reshuffle the rows on every pass.
            order = np.random.permutation(n_rows)
            X_perm, y_perm = X[order], y[order]

            for lo in range(0, n_rows, self.batch_size):
                hi = lo + self.batch_size
                xb, yb = X_perm[lo:hi], y_perm[lo:hi]

                residual = self.predict(xb) - yb
                inv_n = 1 / xb.shape[0]

                grad_w = inv_n * np.dot(xb.T, residual)
                # Gradient of l * sum(W ** 2).
                grad_w += self.l * 2 * self.weights
                grad_b = inv_n * np.sum(residual, axis=0, keepdims=True)

                self.weights -= self.lr * grad_w
                self.bias -= self.lr * grad_b

            epoch_loss = self.error_function(y, self.predict(X), n_rows)
            self.errors.append(epoch_loss)

            if epoch % 100 == 0:
                print(f"Iteration {epoch}, Loss: {epoch_loss:.4f}")



In [None]:
# Cross-validate the L1-regularized model on the reduced feature set.
logistic_regression_L1_reg = BaseLogisticRegressionRegL1(n_iters=5000, lr=0.001, batch_size=64,l=0.001)
print(make_cross_validation_predict(logistic_regression_L1_reg, modified_features_preprocessor, X_train, y_train))
# Weights left in the model by the last fit performed during cross-validation.
print(logistic_regression_L1_reg.weights)

In [None]:
# Cross-validate the L2-regularized model on the reduced feature set.
logistic_regression_L2_reg = BaseLogisticRegressionRegL2(n_iters=5000, lr=0.001, batch_size=64,l=0.001)
print(make_cross_validation_predict(logistic_regression_L2_reg, modified_features_preprocessor, X_train, y_train))
# Weights left in the model by the last fit performed during cross-validation.
print(logistic_regression_L2_reg.weights)


In [None]:
class BaseLogisticRegressionRegCombined(LogisticRegression):
    """Softmax regression with a mixed L1/L2 penalty on the weights."""

    def __init__(self,n_iters=5000, lr=0.001, batch_size=64,l=0.001,alpha=0.7):
        super().__init__(lr=lr, n_iters=n_iters, batch_size=batch_size)
        # Overall regularization strength (lambda).
        self.l = l
        # Share of the penalty given to L1; the remaining (1 - alpha) goes to L2.
        self.alpha = alpha

    def fit(self,X,y):
        """Train with mini-batch gradient descent plus a combined L1/L2 penalty.

        The weight gradient receives ``l * alpha * sign(W)`` from the L1 part
        and ``l * (1 - alpha) * 2 * W`` from the L2 part; the bias is not
        regularized. The recorded loss is the plain cross-entropy on the full
        training set, without the penalty terms.
        """
        n_rows, n_cols = X.shape
        self.n_classes = y.shape[1]
        self.weights = np.zeros((n_cols, self.n_classes))
        self.bias = np.zeros((1, self.n_classes))
        self.errors.clear()

        for epoch in range(self.n_iters):
            # Reshuffle the rows on every pass.
            order = np.random.permutation(n_rows)
            X_perm, y_perm = X[order], y[order]

            for lo in range(0, n_rows, self.batch_size):
                hi = lo + self.batch_size
                xb, yb = X_perm[lo:hi], y_perm[lo:hi]

                residual = self.predict(xb) - yb
                inv_n = 1 / xb.shape[0]

                # Both penalty gradients use the weights from before this step.
                ridge_term = self.l * (1 - self.alpha) * 2 * self.weights
                lasso_term = self.l * self.alpha * np.sign(self.weights)

                grad_w = inv_n * np.dot(xb.T, residual)
                grad_w += lasso_term+ridge_term
                grad_b = inv_n * np.sum(residual, axis=0, keepdims=True)

                self.weights -= self.lr * grad_w
                self.bias -= self.lr * grad_b

            epoch_loss = self.error_function(y, self.predict(X), n_rows)
            self.errors.append(epoch_loss)

            if epoch % 100 == 0:
                print(f"Iteration {epoch}, Loss: {epoch_loss:.4f}")



In [None]:
# Cross-validate the combined L1/L2 model with an even split between the two
# penalties (alpha=0.50). Note the larger batch size (128) than in the other runs.
logistic_regression_L1L2 = BaseLogisticRegressionRegCombined(n_iters=5000, lr=0.001, batch_size=128,l=0.001,alpha=0.50)
print(make_cross_validation_predict(logistic_regression_L1L2, modified_features_preprocessor, X_train, y_train))

In [None]:
# Weights before L1: the unregularized model cross-validated on the reduced
# feature set. The axis label and title announce absolute values, so the
# magnitudes are plotted rather than the signed weights.
plt.hist(np.abs(modified_features_CV_logistic_regression.weights).flatten(), bins=100)
plt.xlabel("Absolute weight value")
plt.ylabel("Frequency")
plt.title("Histogram of absolute weights")
plt.show()

In [None]:
# Weights after L1: the L1-regularized model. The axis label and title
# announce absolute values, so the magnitudes are plotted rather than the
# signed weights.
plt.hist(np.abs(logistic_regression_L1_reg.weights).flatten(), bins=100)
plt.xlabel("Absolute weight value")
plt.ylabel("Frequency")
plt.title("Histogram of absolute weights")
plt.show()


We can also see that the mean accuracy before and after applying L1 and L2 regularization is very similar, which confirms that our model is not overfitted.

IV. Dataset balancing

Dataset balancing is crucial in model training. When we train on part of the full dataset, one class may occur much more often than the others, and a model trained on such an unbalanced set can simply always guess the class that was most frequent in the training set. There are two ways to correct this: oversampling, which adds (synthetic) samples of the minority classes, and undersampling, which works the other way around and removes samples of the majority class.

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTENC

# Combined list of the reduced numeric and categorical column names.
d_set = numerical_features_modified+categorical_features_modified

def sampling_method_pipeline(X_ttrain, y_ttrain, method):
    """Resample a training set to balance its classes.

    The inputs are copied, the labels are integer-encoded with a
    ``LabelEncoder``, and the chosen resampler is applied.

    Args:
        X_ttrain: training features.
        y_ttrain: training labels (a DataFrame is flattened to 1-D first).
        method: ``"SMOTE"`` (uses ``SMOTENC`` with the reduced categorical
            column list) or ``"RandomUnderSampler"``.

    Returns:
        Tuple ``(X_resampled, y_resampled)``; the labels are the
        integer-encoded ones.

    Raises:
        ValueError: if ``method`` is not one of the two supported names.
    """
    features = X_ttrain.copy()
    labels = y_ttrain.copy()

    if isinstance(labels, pd.DataFrame):
        labels = labels.values.ravel()
    encoded_labels = LabelEncoder().fit_transform(labels)

    if method == "SMOTE":
        resampler = SMOTENC(
            categorical_features=categorical_features_modified,
            random_state=42,
        )
    elif method == "RandomUnderSampler":
        resampler = RandomUnderSampler(random_state=42)
    else:
        raise ValueError(f"Unknown method: {method}")

    X_balanced, y_balanced = resampler.fit_resample(features, encoded_labels)
    return X_balanced, y_balanced


In [None]:
def make_cross_validation_predict_sampling(model, preproc, X_set, y_set, sampling_method=None, k=3):
    """Stratified k-fold cross-validation with optional training-fold resampling.

    When ``sampling_method`` is given, only the training part of each fold is
    resampled via ``sampling_method_pipeline``; the held-out part is left
    untouched. A classification report is printed for every fold.

    Returns:
        Tuple ``(scores, mean_score)`` with one accuracy per fold.
    """
    splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    strat_labels = LabelEncoder().fit_transform(y_set)
    scores = []

    for train_index, test_index in splitter.split(X_set, strat_labels):
        X_fold_train, X_fold_test = X_set.iloc[train_index], X_set.iloc[test_index]
        y_fold_train, y_fold_test = y_set.iloc[train_index], y_set.iloc[test_index]

        if sampling_method is None:
            y_fit = y_fold_train.values.ravel()
        else:
            X_fold_train, y_fit = sampling_method_pipeline(
                pd.DataFrame(X_fold_train), y_fold_train, sampling_method)

        # The preprocessor is fitted on the (possibly resampled) training fold only.
        X_fit = preproc.fit_transform(X_fold_train)
        X_eval = preproc.transform(X_fold_test)

        y_eval_onehot = pd.get_dummies(y_fold_test).astype(int).to_numpy()
        y_fit_onehot = pd.get_dummies(y_fit).astype(int).to_numpy()

        model.fit(X_fit, y_fit_onehot)

        predicted = model.predict_class(X_eval)
        scores.append(model.score(predicted, y_eval_onehot))

        # Per-class precision/recall for this fold, on class indices.
        true_idx = np.argmax(y_eval_onehot, axis=1)
        print(classification_report(true_idx, predicted, digits=4))

    return scores, np.mean(scores)



In [None]:
# Cross-validate the L2 model with random undersampling applied to each training fold.
logistic_regression_L2 = BaseLogisticRegressionRegL2(n_iters=5000, lr=0.001, batch_size=64,l=0.001)
print(make_cross_validation_predict_sampling(logistic_regression_L2, modified_features_preprocessor, X_train, y_train,sampling_method="RandomUnderSampler"))

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Run I: no resampling. Train the L2 model (default l) on the reduced feature set.
X_train_proc, X_test_proc, y_train_proc, y_test_proc = transform_in_pipeline(modified_features_preprocessor, X_train, X_test,y_train,y_test)
log_reg = BaseLogisticRegressionRegL2(n_iters=5000, lr=0.001, batch_size=64)
log_reg.fit(X_train_proc, y_train_proc)
predicted = log_reg.predict_class(X_test_proc)



In [None]:
# Convert one-hot test targets to class indices; the ndim guard keeps the
# already-converted array unchanged if this cell is run again.
y_test_proc = np.argmax(y_test_proc,axis=1) if y_test_proc.ndim>1 else y_test_proc
print(classification_report(y_test_proc, predicted, digits=4))


In [None]:
# Run II: oversample the training split with SMOTENC, then train the same model.
# Only the training data is resampled; the test split is used as-is.
X_res_train,y_res_train = sampling_method_pipeline(X_train,y_train,method="SMOTE")
X_train_proc, X_test_proc, y_train_proc, y_test_proc = transform_in_pipeline(modified_features_preprocessor, X_res_train, X_test,y_res_train,y_test)
log_reg = BaseLogisticRegressionRegL2(n_iters=5000, lr=0.001, batch_size=64)
log_reg.fit(X_train_proc, y_train_proc)
predicted = log_reg.predict_class(X_test_proc)


In [None]:
# Convert one-hot test targets to class indices; the ndim guard keeps the
# already-converted array unchanged if this cell is run again.
y_test_proc = np.argmax(y_test_proc,axis=1) if y_test_proc.ndim>1 else y_test_proc
print(classification_report(y_test_proc, predicted, digits=4))


In [None]:
# Run III: randomly undersample the training split.
X_res_train,y_res_train = sampling_method_pipeline(X_train,y_train,method="RandomUnderSampler")

In [None]:
# Train the same L2 model on the undersampled training data; the test split is unchanged.
X_train_proc, X_test_proc, y_train_proc, y_test_proc = transform_in_pipeline(modified_features_preprocessor, X_res_train, X_test,y_res_train,y_test)
log_reg = BaseLogisticRegressionRegL2(n_iters=5000, lr=0.001, batch_size=64)
log_reg.fit(X_train_proc, y_train_proc)
predicted = log_reg.predict_class(X_test_proc)



In [None]:
# Convert one-hot test targets to class indices; the ndim guard keeps the
# already-converted array unchanged if this cell is run again.
y_test_proc = np.argmax(y_test_proc,axis=1) if y_test_proc.ndim>1 else y_test_proc
print(classification_report(y_test_proc, predicted, digits=4))


So here we made three runs of the regression model to show the differences between the dataset without sampling and with sampling.
- I scores
We can see that this model ignores class 1 and favors class 2, so we can assume that the dataset is unbalanced, because the model is more accurate on the dominant class than on the minority class
- II scores (after balancing)
We can see the effect of the balancing technique here: recall is more evenly distributed across all three classes
- III scores (after balancing)
These scores look the best of the three because the recall for all classes is high and similar, so the model does not favour any one class

V. Optimizing hyperparameters

Our models have hyperparameters that stay constant during training, so we have to be sure that the values we set are good enough. To check this we can optimize them with a grid search in the style of `GridSearchCV`.
We are working with variants of logistic regression, so we will optimize these parameters:
- `BaseLogisticRegression` - `n_iters`  |  `lr`  |  `batch_size`
- `BaseLogisticRegressionRegCombined` - `n_iters`  |  `lr`  |  `batch_size` | `l` |`alpha`

In [None]:
from itertools import product
# Exhaustive grid search for the base logistic regression (no regularization).
param_grid = {
    "n_iters":[3000,4000,5000,6000,7000,8000,9000,10000],
    "lr":[0.001,0.0001,0.00001],
    "batch_size":[32,64,128]
}
# Best mean CV accuracy seen so far, and the parameters that produced it.
best = -1
best_params = None
keys = list(param_grid.keys())
# Cartesian product of all candidate values, one tuple per configuration.
combinations = list(product(*param_grid.values()))
for combo in combinations:
    params = dict(zip(keys,combo))
    model = BaseLogisticRegression(**params)
    # [1] selects the mean score from the (scores, mean) tuple; k=2 folds
    # are used instead of the default 3.
    mean = make_cross_validation_predict(model,
                                           modified_features_preprocessor,
                                           X_train, y_train,
                                           k=2)[1]
    if mean > best:
        best = mean
        best_params = params
print(best_params,best)

In [None]:
from itertools import product
# Exhaustive grid search for the logistic regression with combined L1/L2 regularization.
param_grid = {
    "n_iters":[4000,5000,6000],
    "lr":[0.001,0.0001,0.00001],
    "batch_size":[64,128],
    "l" :[0.001,0.0001,0.00001],
    "alpha" :[0.5,0.6,0.7,0.8,0.9]

}
# Best mean CV accuracy seen so far, and the parameters that produced it.
best = -1
best_params = None
keys = list(param_grid.keys())
# Cartesian product of all candidate values, one tuple per configuration.
combinations = list(product(*param_grid.values()))
for i,combo in enumerate(combinations):
    # Progress indicator: current index and total number of configurations.
    print(i, len(combinations))
    params = dict(zip(keys,combo))
    model = BaseLogisticRegressionRegCombined(**params)
    # [1] selects the mean score from the (scores, mean) tuple.
    mean = make_cross_validation_predict(model,
                                           modified_features_preprocessor,
                                           X_train, y_train,
                                           k=2)[1]
    if mean > best:
        best = mean
        best_params = params
print(best_params,best)

In [None]:
from itertools import product
# Exhaustive grid search for the logistic regression with L2 regularization only.
param_grid = {
    "n_iters":[4000,5000,6000],
    "lr":[0.001,0.0001,0.00001],
    "batch_size":[64,128],
    "l" :[0.001,0.0001,0.00001],
}
# Best mean CV accuracy seen so far, and the parameters that produced it.
best = -1
best_params = None
keys = list(param_grid.keys())
# Cartesian product of all candidate values, one tuple per configuration.
combinations = list(product(*param_grid.values()))
for i,combo in enumerate(combinations):
    # Progress indicator: current index and total number of configurations.
    print(i, len(combinations))
    params = dict(zip(keys,combo))
    model = BaseLogisticRegressionRegL2(**params)
    # [1] selects the mean score from the (scores, mean) tuple.
    mean = make_cross_validation_predict(model,
                                           modified_features_preprocessor,
                                           X_train, y_train,
                                           k=2)[1]
    if mean > best:
        best = mean
        best_params = params
print(best_params,best)

VI. Ensemble methods

In [None]:
import seaborn as sns

# Dataset 1: the current (reduced-feature) split through the polynomial preprocessor.
X_train_proc1, X_test_proc1, y_train_proc1, y_test_proc1 = transform_in_pipeline(
    modified_features_preprocessor, X_train, X_test, y_train, y_test)

# Dataset 2: preprocessor_full_set selects columns (e.g. "GDP", "Nationality")
# that were removed when X / X_train were rebound to the reduced feature set,
# so it has to be fed from the full frame. df is read from the same file as
# df_modified and the split uses the same train_size and random_state, so the
# rows should land in the same train/test parts as for dataset 1.
X_full = df.drop(["Target"], axis=1)
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X_full, df["Target"], train_size=0.7, random_state=42)
X_train_proc2, X_test_proc2, y_train_proc2, y_test_proc2 = transform_in_pipeline(
    preprocessor_full_set, X_train_full, X_test_full, y_train_full, y_test_full)

base = BaseLogisticRegression(n_iters=5000, lr=0.001, batch_size=64)
base_L1 = BaseLogisticRegressionRegL1(n_iters=5000, lr=0.001, batch_size=64, l=0.001)
base_L2 =BaseLogisticRegressionRegL2(n_iters=5000, lr=0.001, batch_size=64, l=0.001)
base_L1L2 = BaseLogisticRegressionRegCombined(n_iters=5000, lr=0.001, batch_size=64, l=0.00101, alpha=0.70)
# (model, display name, dataset number) - the number picks dataset 1 or 2 below.
models_list = [
    (base, "base",1),
    (base_L1, "base+L1",1),
    (base_L2, "base+L2",2),
    (base_L1L2, "base+L1L2",2)
]

# Dataset number -> (X_train, y_train, X_test, y_test).
datasets = {
    1: (X_train_proc1, y_train_proc1, X_test_proc1, y_test_proc1),
    2: (X_train_proc2, y_train_proc2, X_test_proc2, y_test_proc2),
}

trained_models = []
for model, name,num in models_list:
    X_fit, y_fit, _, _ = datasets[num]
    model.fit(X_fit, y_fit)
    trained_models.append((model, name,num))

predictions = {}
errors = {}

for model, name,num in trained_models:
    _, _, X_eval, y_eval = datasets[num]
    y_pred = model.predict_class(X_eval)
    predictions[name] = y_pred
    y_true = np.argmax(y_eval, axis=1) if y_eval.ndim > 1 else y_eval
    # 1 where the model misclassified the test row, 0 where it was right.
    errors[name] = (y_pred.flatten() != y_true).astype(int)

# One column of per-row error indicators per model.
errors_df = pd.DataFrame(errors)

# Pairwise correlation of the error indicators between models.
error_correlation = errors_df.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(error_correlation, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
plt.title('Macierz korelacji błędów między modelami')
plt.tight_layout()
plt.show()

We can see in this matrix that the base model and the base_L1L2 model have the lowest error correlation, so they perform well on different records (although the correlation is still high).

Ensemble methods:
- HardVotingClassifier
Here we have n classifiers, each of which returns its prediction for every record; HardVotingClassifier chooses the most frequent label, and this is the final prediction.

- StackingClassifier

In [None]:
# The two ensemble members: an unregularized model and a combined L1/L2 model.
models = [
    BaseLogisticRegression(n_iters=7000, lr=0.001, batch_size=64),
    BaseLogisticRegressionRegCombined(n_iters=4000, lr=0.001, batch_size=128, l=0.0010, alpha=0.50)
    ]
def HardVotingClassifier(models,X_train,y_train,X_test,y_test):
    """Fit every model and combine their class predictions by majority vote.

    Works for any number of models and any number of classes. When several
    classes receive the same number of votes, the smallest class index wins.

    Args:
        models: non-empty sequence of objects exposing ``fit`` and
            ``predict_class``.
        X_train: training features passed to every model's ``fit``.
        y_train: training targets passed to every model's ``fit``.
        X_test: features to predict.
        y_test: unused; kept so the call signature stays unchanged.

    Returns:
        List with one voted class index (``int``) per row of ``X_test``.

    Raises:
        ValueError: if ``models`` is empty.
    """
    if not models:
        raise ValueError("HardVotingClassifier needs at least one model")

    for model in models:
        model.fit(X_train, y_train)

    # One column of predicted class indices per model: shape (n_rows, n_models).
    votes = np.column_stack(
        [np.asarray(model.predict_class(X_test)).ravel() for model in models]
    )

    # bincount counts the votes per class index; argmax returns the first
    # maximum, i.e. the smallest class index among tied classes.
    return [int(np.bincount(row).argmax()) for row in votes]

In [None]:
# Hard-vote on the reduced-feature split and convert the vote list to an array.
pred = HardVotingClassifier(models,X_train_proc1,y_train_proc1,X_test_proc1,y_test_proc1)
p = np.array(pred)
# Accuracy of predicted class indices against one-hot targets (same formula
# as LogisticRegression.score, but usable without a model instance).
def score(predicted, y):
    y_true = np.argmax(y, axis=1)
    correct = (predicted.flatten() == y_true)
    accuracy = np.mean(correct)
    return accuracy
# Test-set accuracy of the voting ensemble.
score(p,y_test_proc1)

In [None]:
def make_cross_validation_predict_voting(models, preproc, X_set, y_set, k=3):
    """Cross-validate the hard-voting ensemble with stratified K-fold splits.

    Parameters
    ----------
    models : sequence
        Estimators handed to ``HardVotingClassifier`` in every fold.
    preproc
        Transformer with ``fit_transform`` / ``transform``; it is refitted on
        the training part of every fold.
    X_set
        Raw features; must support positional ``.iloc`` indexing.
    y_set
        Raw 1-D targets aligned with ``X_set``.
    k : int, default 3
        Number of folds.

    Returns
    -------
    tuple
        ``(scores, mean)``: the list of per-fold accuracies and their mean.
    """
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    y_encoded = LabelEncoder().fit_transform(y_set)
    # One-hot encode once for the whole set so every fold has the same columns
    # in the same order, even if a class is missing from one of the splits.
    y_onehot = np.eye(int(y_encoded.max()) + 1, dtype=int)[y_encoded]

    scores = []
    for train_index, test_index in skf.split(X_set, y_encoded):
        X_train = preproc.fit_transform(X_set.iloc[train_index])
        X_test = preproc.transform(X_set.iloc[test_index])

        y_train = y_onehot[train_index]
        y_test = y_onehot[test_index]

        # Fitting and voting are delegated to HardVotingClassifier so the
        # voting rule lives in one place.
        # NOTE(review): the same model instances are refitted in every fold;
        # this assumes fit() re-initialises the weights - confirm.
        pred = np.array(HardVotingClassifier(models, X_train, y_train, X_test, y_test))

        scores.append(score(pred, y_test))

    return scores, np.mean(scores)

def score(predicted, y):
    """Compute classification accuracy.

    Parameters
    ----------
    predicted : array-like
        Predicted class indices of any shape (flattened before comparison).
    y : array-like
        True targets, either one-hot with shape (n_samples, n_classes) or a
        1-D vector of class indices.

    Returns
    -------
    numpy.floating
        Share of samples whose predicted class equals the true class.
    """
    y = np.asarray(y)
    # Reduce one-hot rows to class indices; leave 1-D label vectors untouched.
    y_true = np.argmax(y, axis=1) if y.ndim > 1 else y
    correct = (np.asarray(predicted).ravel() == y_true)
    return np.mean(correct)
# Cross-validate the voting ensemble with the function's default number of
# folds and print the returned (per-fold scores, mean score) tuple.
print(make_cross_validation_predict_voting(models, modified_features_preprocessor, X_train, y_train))

In [None]:
# Base estimators whose predicted probabilities are stacked in the next cells.
models = [
    BaseLogisticRegression(
        n_iters=7000,
        lr=0.001,
        batch_size=64,
    ),
    BaseLogisticRegressionRegCombined(
        n_iters=4000,
        lr=0.001,
        batch_size=128,
        l=0.0010,
        alpha=0.50,
    ),
]
def StackingClassifier(models,X_train,y_train,X_test,y_test):
    """Fit the base models and return their stacked test-set probabilities.

    Parameters
    ----------
    models : sequence
        One or more estimators exposing ``fit(X, y)``, ``predict_class(X)``
        and a ``probabilities`` attribute that is read after
        ``predict_class`` has been called. Any number of models is supported.
    X_train, y_train
        Passed unchanged to every ``fit`` call.
    X_test
        Passed unchanged to every ``predict_class`` call.
    y_test
        Unused; kept so existing callers do not break.

    Returns
    -------
    numpy.ndarray
        Array with one row per test sample, formed by placing each model's
        ``probabilities`` side by side in the order of ``models``.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    if not models:
        raise ValueError("StackingClassifier needs at least one model")

    for model in models:
        model.fit(X_train, y_train)

    per_model_probs = []
    for model in models:
        # The return value is ignored on purpose: the call is made only so
        # that `model.probabilities` can be read afterwards.
        model.predict_class(X_test)
        per_model_probs.append(np.asarray(model.probabilities))

    # column_stack joins 2-D blocks horizontally and treats a 1-D vector as a
    # single column, so each sample keeps exactly one row.
    return np.column_stack(per_model_probs)

In [None]:
# Fit the base models on the training split and build the stacked probability
# features for the test split.
pred = StackingClassifier(models,X_train_proc1,y_train_proc1,X_test_proc1,y_test_proc1)
pred

In [None]:
# Train the stacking meta-model on the stacked base-model probabilities.
# `pred` only covers the test split, so those rows are divided into a part
# used to fit the meta-model and a held-out part used to score it. Fitting and
# scoring on the very same rows would report training accuracy, not a
# generalisation estimate.
y = y_test_proc1.copy()
# Class indices are used only to stratify the split.
meta_labels = np.argmax(y, axis=1) if y.ndim > 1 else y
meta_X_fit, meta_X_eval, meta_y_fit, meta_y_eval = train_test_split(
    pred, y, test_size=0.5, random_state=42, stratify=meta_labels
)
model = BaseLogisticRegressionRegCombined(n_iters=4000, lr=0.001, batch_size=128, l=0.001, alpha=0.5)
model.fit(meta_X_fit, meta_y_fit)
predicted = model.predict_class(meta_X_eval)
model.score(predicted, meta_y_eval)


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import pandas as pd

def score(predicted, y):
    """Accuracy of predicted class indices against the true targets.

    Parameters
    ----------
    predicted : array-like
        Predicted class indices; flattened before the comparison.
    y : array-like
        True targets as a one-hot matrix (n_samples, n_classes) or as a 1-D
        vector of class indices.

    Returns
    -------
    numpy.floating
        Mean of the element-wise equality between predictions and targets.
    """
    y = np.asarray(y)
    # Only multi-dimensional (one-hot) targets need the argmax reduction.
    y_true = np.argmax(y, axis=1) if y.ndim > 1 else y
    correct = (np.asarray(predicted).ravel() == y_true)
    return np.mean(correct)

def _row_majority_label(class_preds):
    """Collapse each row of a ``predict_class`` result to its most frequent label."""
    class_preds = np.asarray(class_preds)
    if class_preds.ndim == 1:
        # Treat a flat label vector as one prediction per row.
        class_preds = class_preds[:, np.newaxis]
    return np.array([np.bincount(row).argmax() for row in class_preds])


def make_cross_validation_predict_stacking(model_factories, meta_model_factory, preproc, X_set, y_set, k=5):
    """Cross-validate a stacking ensemble with stratified K-fold splits.

    In every fold, fresh base models are created and fitted on the training
    part. Their predicted class labels (one column per base model) form the
    features of a fresh meta-model, which is scored on the test part.

    Parameters
    ----------
    model_factories : iterable of callables
        Each callable returns a new, unfitted base model exposing
        ``fit(X, y)`` and ``predict_class(X)``.
    meta_model_factory : callable
        Returns a new, unfitted meta-model with the same interface.
    preproc
        Transformer with ``fit_transform`` / ``transform``; refitted on the
        training part of every fold.
    X_set
        Raw features; must support positional ``.iloc`` indexing.
    y_set
        Raw 1-D targets aligned with ``X_set``.
    k : int, default 5
        Number of folds.

    Returns
    -------
    tuple
        ``(scores, mean)``: the list of per-fold accuracies and their mean.
    """
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    y_encoded = LabelEncoder().fit_transform(y_set)
    # One-hot encode once for the whole set so every fold has the same columns
    # in the same order, even if a class is missing from one of the splits.
    y_onehot = np.eye(int(y_encoded.max()) + 1, dtype=int)[y_encoded]

    scores = []
    for train_index, test_index in skf.split(X_set, y_encoded):
        # preprocess
        X_train = preproc.fit_transform(X_set.iloc[train_index])
        X_test = preproc.transform(X_set.iloc[test_index])

        y_train = y_onehot[train_index]
        y_test = y_onehot[test_index]

        base_preds_train = []
        base_preds_test = []

        for factory in model_factories:
            model = factory()
            model.fit(X_train, y_train)

            # One label column per base model.
            base_preds_train.append(
                _row_majority_label(model.predict_class(X_train)).reshape(-1, 1)
            )
            base_preds_test.append(
                _row_majority_label(model.predict_class(X_test)).reshape(-1, 1)
            )

        stacked_train = np.hstack(base_preds_train)
        stacked_test = np.hstack(base_preds_test)

        # NOTE(review): the meta-model is fitted on the base models' in-sample
        # predictions; out-of-fold predictions would give a less optimistic
        # meta-training set - consider switching.
        meta_model = meta_model_factory()
        meta_model.fit(stacked_train, y_train)

        final_pred = _row_majority_label(meta_model.predict_class(stacked_test))

        scores.append(score(final_pred, y_test))

    return scores, np.mean(scores)
# Factories are used instead of model instances so that every fold gets
# freshly constructed base models.
model_factories = [
    lambda: BaseLogisticRegression(n_iters=5000, lr=0.001),
    lambda: BaseLogisticRegressionRegCombined(n_iters=4000, lr=0.001, batch_size=128, l=0.0010, alpha=0.50)
]

# Factory for the meta-model that is fitted on the base models' predictions.
meta_model_factory = lambda: BaseLogisticRegression(n_iters=3000, lr=0.001)

# 3-fold cross-validation of the stacking ensemble on the raw training data;
# preprocessing is performed inside each fold by the function itself.
scores, mean_score = make_cross_validation_predict_stacking(
    model_factories,
    meta_model_factory,
    modified_features_preprocessor,
    X_train,
    y_train,
    k=3
)

print("Fold scores:", scores)
print("Mean score:", mean_score)




VII. Mixture of Experts
Mixture of experts is a method in which a "supervisor" assigns a weight to each of the expert models embedded in it; the weight expresses how important (trustworthy) that expert is. This "supervisor" is called the gating network: it is the mechanism that assigns a trust value to each model.


In [None]:
# Rebuild the processed train/test features and targets with the
# modified-feature preprocessor (transform_in_pipeline is defined outside this
# section of the file).
X_train_proc1, X_test_proc1, y_train_proc1, y_test_proc1 = transform_in_pipeline(
    modified_features_preprocessor, X_train, X_test, y_train, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

class MixtureOfExperts:
    def __init__(self, model1, model2, num_classes):
        self.model1 = model1
        self.model2 = model2
        self.gating_model = RandomForestClassifier()
        self.num_classes = num_classes

    def fit(self, X_train, y_train):
        self.model1.fit(X_train, y_train)
        self.model2.fit(X_train, y_train)

        self.model1.predict_class(X_train)
        proba1 = self.model1.probabilities
        self.model2.predict_class(X_train)
        proba2 = self.model1.probabilities

        if len(y_train.shape) > 1:
            y_true = np.argmax(y_train, axis=1)
        else:
            y_true = y_train

        X_gating_train = []
        gating_labels = []

        for i in range(len(X_train)):
            pred1 = np.argmax(proba1[i])
            pred2 = np.argmax(proba2[i])

            acc1 = int(pred1 == y_true[i])
            acc2 = int(pred2 == y_true[i])

            if acc1 > acc2:
                label = 0
            elif acc2 > acc1:
                label = 1
            else:
                label = i % 2  #random

            features = np.concatenate([
                X_train[i].flatten(),
                proba1[i],
                proba2[i],
                np.abs(proba1[i] - proba2[i])
            ])
            X_gating_train.append(features)
            gating_labels.append(label)

        X_gating_train = np.array(X_gating_train)
        gating_labels = np.array(gating_labels)

        if len(gating_labels) == 0:
            return

        self.gating_model.fit(X_gating_train, gating_labels)

    def predict(self, X_test):
        self.model1.predict_class(X_test)
        proba1 = self.model1.probabilities
        self.model2.predict_class(X_test)
        proba2 = self.model2.probabilities

        X_gating_test = []
        for i in range(len(X_test)):
            features = np.concatenate([
                X_test[i].flatten(),
                proba1[i],
                proba2[i],
                np.abs(proba1[i] - proba2[i])
            ])
            X_gating_test.append(features)
        X_gating_test = np.array(X_gating_test)

        gating_probs = self.gating_model.predict_proba(X_gating_test)

        final_preds = []
        for i in range(len(X_test)):
            weighted_vote = gating_probs[i][0] * proba1[i] + gating_probs[i][1] * proba2[i]
            final_preds.append(np.argmax(weighted_vote))
        return np.array(final_preds)

    def score(self, X_test, y_test):
        preds = self.predict(X_test)
        if len(y_test.shape) > 1:
            y_true = np.argmax(y_test, axis=1)
        else:
            y_true = y_test
        return np.mean(preds == y_true)


First, we train our expert models independently. Then, for each record in the training data, we determine which expert made the correct prediction. Based on this, we build the training set for the gating model, which learns to identify which expert is more likely to be correct for a given input sample X.

During prediction, for each test record, we retrieve the class probability distributions from both experts. We then weight these probabilities using the gating model's output (i.e., its trust in each expert). Finally, we combine the weighted predictions and select the class with the highest final score as the predicted class.

In [None]:
# The two experts handed to the mixture.
m1 = BaseLogisticRegression(n_iters=7000, lr=0.001, batch_size=64)
m2 = BaseLogisticRegressionRegCombined(n_iters=4000, lr=0.001, batch_size=128, l=0.00101, alpha=0.70)
# Third argument is num_classes.
moe = MixtureOfExperts(m1, m2,3)
# Fit on the processed training split, then report accuracy on the processed test split.
moe.fit(X_train_proc1, y_train_proc1)
acc = moe.score(X_test_proc1, y_test_proc1)
print(f"Mixture of Experts accuracy: {acc:.4f}")


In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
# Reload the raw dataset and replace the target column with integer codes.
df = pd.read_csv("dataset.csv")
df["Target"] = LabelEncoder().fit_transform(df["Target"])
# Labels are the encoded target; features are every other column.
y = df["Target"]
X = df.drop(columns=["Target"])

In [None]:
# Numerical columns: fill missing values with the median, then standardise.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform time.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Route each column group to its pipeline (the *_modified feature lists are
# defined outside this section of the file).
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features_modified),
    ('cat', categorical_pipeline, categorical_features_modified)
])


# Preprocessing and the classifier are combined into a single estimator, so
# the preprocessor is refitted inside every cross-validation split.
# NOTE(review): no random_state is passed to RandomForestClassifier, so results
# may differ between runs - confirm whether that is intended.
random_forest_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier())
])

# Hyperparameter grid
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

# 80/20 train/test split with a fixed seed; this rebinds the notebook-level
# X_train / X_test / y_train / y_test names.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Grid Search
grid_search = GridSearchCV(random_forest_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Results
print( grid_search.best_params_)
print(grid_search.best_score_)