In [6]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from functions import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, confusion_matrix
import joblib
import streamlit as st
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
import numpy as np


# Loading the data set

In [2]:
mnist = fetch_openml('mnist_784', version=1, cache= True, as_frame= False)

x_org = mnist["data"]
y_org = mnist["target"].astype(np.uint8)

x = x_org[:30000]
y = y_org[:30000]


x_train_not_scaled, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.2, random_state=42)
x_val_not_scaled, x_test_not_scaled, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

x_train_not_scaled_70, x_temp_70, y_train_70, y_temp_70 = train_test_split(x_org, y_org, test_size=0.2, random_state=42)
x_val_not_scaled_70, x_test_not_scaled_70, y_val_70, y_test_70 = train_test_split(x_temp_70, y_temp_70, test_size=0.5, random_state=42)

# Scaling the data set

In [3]:
scaler_30 = StandardScaler()
x_train = scaler_30.fit_transform(x_train_not_scaled)
x_val = scaler_30.transform(x_val_not_scaled)
x_test = scaler_30.transform(x_test_not_scaled)

In [4]:
scaler_70 = StandardScaler()
x_train_70 = scaler_70.fit_transform(x_train_not_scaled_70)
x_val_70 = scaler_70.transform(x_val_not_scaled_70)
x_test_70 = scaler_70.transform(x_test_not_scaled_70)

In [7]:
np.save('x_test.npy', x_test)
np.save('y_test.npy', y_test)

In [5]:
joblib.dump(scaler_30, get_scaler_name())

['scaler.pkl']

# Creating SVM model

In [16]:
svm_pipeline = make_pipeline(StandardScaler(), SVC())
svm_param_grid = {'svc__C': [1, 10, 20], 'svc__gamma': [0.1, 0.01, 0.001], 'svc__kernel': ['rbf']}
svm_grid_search = GridSearchCV(svm_pipeline, svm_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
svm_grid_search.fit(x_train, y_train)
best_svm_model = svm_grid_search.best_estimator_

y_pred_val_svm = best_svm_model.predict(x_val)

svm_val_accuracy = accuracy_score(y_val, y_pred_val_svm)
svm_val_precision = precision_score(y_val, y_pred_val_svm, average='weighted')
svm_val_recall = recall_score(y_val, y_pred_val_svm, average='weighted')

print("Best SVM Model:", svm_grid_search.best_params_)
print("SVM Validation Accuracy:", svm_val_accuracy)
print("SVM Validation Precision:", svm_val_precision)
print("SVM Validation Recall:", svm_val_recall)
print("SVM validation Accuracy:", svm_val_accuracy)

Best SVM Model: {'svc__C': 20, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}
SVM Validation Accuracy: 0.9603333333333334
SVM Validation Precision: 0.9605408898270671
SVM Validation Recall: 0.9603333333333334
SVM validation Accuracy: 0.9603333333333334


In [None]:
joblib.dump(best_svm_model, get_svm_model_name())

# Creating Random Forest Classifier Model

In [18]:


rf_param_grid = {'n_estimators': [100, 200, 300], 
                 'max_depth': [None, 1, 5, 10, 20], 
                 'min_samples_split': [5, 10, 20],
                 'min_samples_leaf': [1, 2, 4]}
rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42, n_jobs=-1), rf_param_grid, cv=5, scoring='accuracy')
rf_grid_search.fit(x_train, y_train)
best_rf_model = rf_grid_search.best_estimator_

y_pred_val_rf = best_rf_model.predict(x_val)

rf_val_accuracy = accuracy_score(y_val, y_pred_val_rf)
rf_val_precision = precision_score(y_val, y_pred_val_rf, average='weighted')
rf_val_recall = recall_score(y_val, y_pred_val_rf, average='weighted')

print("Best Random Forest Model:", rf_grid_search.best_params_)
print("Random Forest Validation Accuracy:", rf_val_accuracy)
print("Random Forest Validation Precision:", rf_val_precision)
print("Random Forest Validation Recall:", rf_val_recall)

Best Random Forest Model: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Random Forest Validation Accuracy: 0.961
Random Forest Validation Precision: 0.9611020024420506
Random Forest Validation Recall: 0.961


In [None]:
joblib.dump(best_rf_model, get_rf_model_name())

# Logistic Regression

In [21]:
log_reg = LogisticRegression(solver='saga', max_iter=500, multi_class='multinomial', random_state=42)
param_grid_lr = {'C': [0.01, 0.1, 1], 'max_iter': [100, 500, 1000]}
grid_search_lr = GridSearchCV(log_reg, param_grid_lr, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

grid_search_lr.fit(x_train, y_train)

best_model_lr = grid_search_lr.best_estimator_
print(f"Best hyperparameters: {grid_search_lr.best_params_}")
y_pred_val_lr = best_model_lr.predict(x_val)

lr_accuracy = accuracy_score(y_val, y_pred_val_lr)

print(f"lr Accuracy: {lr_accuracy}")

print("Classification Report:")
print(classification_report(y_test, y_pred_val_lr))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_val_lr))

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best hyperparameters: {'C': 0.01, 'max_iter': 1000}
lr Accuracy: 0.9156666666666666
Classification Report:
              precision    recall  f1-score   support

           0       0.10      0.10      0.10       305
           1       0.10      0.11      0.11       333
           2       0.09      0.09      0.09       297
           3       0.10      0.10      0.10       320
           4       0.09      0.09      0.09       300
           5       0.10      0.10      0.10       259
           6       0.10      0.11      0.11       289
           7       0.14      0.14      0.14       315
           8       0.10      0.09      0.10       281
           9       0.10      0.10      0.10       301

    accuracy                           0.10      3000
   macro avg       0.10      0.10      0.10      3000
weighted avg       0.10      0.10      0.10      3000

Confusion Matrix:
[[29 51 29 29 29 29 30 28 25 26]
 [31 36 36 36 38 25 29 



In [None]:
joblib.dump(best_model_lr, get_lr_model_name())

In [23]:
models = [best_svm_model, best_model_lr, best_rf_model]

In [24]:
best_model_final = max(models, key=lambda x: get_score(x, x_val, y_val))

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(C=20, gamma=0.001))]), accuracy score: 0.9603333333333334 
LogisticRegression(C=0.01, max_iter=1000, multi_class='multinomial',
                   random_state=42, solver='saga'), accuracy score: 0.9156666666666666 
RandomForestClassifier(max_depth=20, min_samples_split=5, n_estimators=300,
                       n_jobs=-1, random_state=42), accuracy score: 0.961 


In [25]:
best_model_final

In [None]:
joblib.dump(best_model_final, get_best_model_name())

['best_model_final_30.pkl']

In [27]:
# now I want to test all models on the test set
y_pred_test_svm = best_svm_model.predict(x_test)
y_pred_test_rf = best_rf_model.predict(x_test)
y_pred_test_lr = best_model_lr.predict(x_test)

print("SVM Test Accuracy:", accuracy_score(y_test, y_pred_test_svm))
print("Random Forest Test Accuracy:", accuracy_score(y_test, y_pred_test_rf))
print("Logistic Regression Test Accuracy:", accuracy_score(y_test, y_pred_test_lr))


SVM Test Accuracy: 0.9623333333333334
Random Forest Test Accuracy: 0.952
Logistic Regression Test Accuracy: 0.919


In [None]:
# estimators = [
#     ("random_forest_clf", best_rf_model),
#     ("svm_clf", best_svm_model),
#     ("logistic_regression", best_model_lr)
# ]

# voting_clf = VotingClassifier(estimators, voting='hard')

