In [1]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Data preprocessing
train_df = pd.read_csv("mnist_train.csv")
test_df = pd.read_csv("mnist_test.csv")

y_train = train_df.iloc[:, 0].values
X_train = train_df.iloc[:, 1:].values

y_test = test_df.iloc[:, 0].values
X_test = test_df.iloc[:, 1:].values

X_train = X_train / 255.0
X_test = X_test / 255.0

# Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# --- Logistic Regression ---
print("Logistic Regression")

lr_model = LogisticRegression(max_iter=1000,)
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, lr_preds))
print(classification_report(y_test, lr_preds))

# --- Random Forest Classifier ---
print("Random Forest Classifier")

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, rf_preds))
print(classification_report(y_test, rf_preds))

# --- K-Nearest Neighbors ---
print("K-Nearest Neighbors")

knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
knn_preds = knn_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, knn_preds))
print(classification_report(y_test, knn_preds))

# --- Support Vector Machine ---
print("SVM (on subset due to slowness)")

# For performance reasons, train SVM on a smaller subset
X_train_svm, _, y_train_svm, _ = train_test_split(X_train, y_train, train_size=10000, stratify=y_train, random_state=42)
X_test_svm, _, y_test_svm, _ = train_test_split(X_test, y_test, train_size=2000, stratify=y_test, random_state=42)

svm_model = SVC(kernel='linear')
svm_model.fit(X_train_svm, y_train_svm)
svm_preds = svm_model.predict(X_test_svm)

print("Accuracy on subset:", accuracy_score(y_test_svm, svm_preds))
print(classification_report(y_test_svm, svm_preds))

Logistic Regression
Accuracy: 0.9214921492149215
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       980
           1       0.96      0.98      0.97      1135
           2       0.92      0.89      0.90      1032
           3       0.90      0.91      0.90      1010
           4       0.94      0.93      0.93       982
           5       0.89      0.87      0.88       892
           6       0.94      0.95      0.95       958
           7       0.93      0.92      0.92      1027
           8       0.87      0.88      0.88       974
           9       0.91      0.92      0.91      1009

    accuracy                           0.92      9999
   macro avg       0.92      0.92      0.92      9999
weighted avg       0.92      0.92      0.92      9999

Random Forest Classifier
Accuracy: 0.9686968696869687
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       980
           1       0.99      