In [12]:
# load necessary packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [13]:
# Load the train and test CSVs (paths are relative to the notebook's working directory)
train_df = pd.read_csv('digit-train.csv')
test_df = pd.read_csv('digit-test.csv')

# A cell only renders its LAST bare expression, so writing `train_df.head()`
# followed by `test_df.head()` silently discards the first preview.
# `display` is injected by the IPython/Jupyter kernel (no import needed) and
# renders both frames.
display(train_df.head(), test_df.head())

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Split the data into features and labels.
#
# The preview of test_df shows an 'Unnamed: 0' column next to 'label' and the
# pixel columns; its values (0, 1, 2, ...) look like a saved row index rather
# than image data. Dropping only 'label' would leave that column in the
# feature matrix, so it is removed here as well.
# errors='ignore' keeps this working if one of the files does not carry the
# extra column (only the test preview is visible) -- a missing 'label' still
# fails loudly on the explicit column lookup below.
NON_FEATURE_COLUMNS = ['label', 'Unnamed: 0']

X_train = train_df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
y_train = train_df['label']

X_test = test_df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
y_test = test_df['label']

In [15]:
# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [21]:
# KNN: grid-search n_neighbors over 1..10 with 5-fold cross-validation
params_knn = {'n_neighbors': range(1, 11)}
knn = KNeighborsClassifier()

# fit() returns the fitted search object, so construction and fitting chain
knn_gs = GridSearchCV(estimator=knn, param_grid=params_knn, cv=5).fit(X_train, y_train)

# Report the hyperparameters selected by the search
print("KNN Best Params:", knn_gs.best_params_)

# Score the fitted search on the held-out validation split
knn_val_acc = knn_gs.score(X_val, y_val)
print("KNN Validation Accuracy:", knn_val_acc)

# Score the fitted search on the test set
knn_test_acc = knn_gs.score(X_test, y_test)
print("KNN Test Accuracy:", knn_test_acc)

KNN Best Params: {'n_neighbors': 1}
KNN Validation Accuracy: 0.8761904761904762
KNN Test Accuracy: 0.8870890900428776


In [18]:
# SVM: grid-search C and gamma (3 x 4 combinations) with 5-fold cross-validation
params_svm = {
    'C': [0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1, 1],
}
svm = SVC()

# fit() returns the fitted search object, so construction and fitting chain
svm_gs = GridSearchCV(estimator=svm, param_grid=params_svm, cv=5).fit(X_train, y_train)

# Report the hyperparameters selected by the search
print("SVM Best Params:", svm_gs.best_params_)

# Score the fitted search on the held-out validation split
svm_val_acc = svm_gs.score(X_val, y_val)
print("SVM Validation Accuracy:", svm_val_acc)

# Score the fitted search on the test set
svm_test_acc = svm_gs.score(X_test, y_test)
print("SVM Test Accuracy:", svm_test_acc)

SVM Best Params: {'C': 10, 'gamma': 0.001}
SVM Validation Accuracy: 0.925
SVM Test Accuracy: 0.9282991900905193


In [19]:
# Random Forest: grid-search forest size, depth and split/leaf minimums
# with 5-fold cross-validation
params_rf = {
    'n_estimators': [100, 500],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}
# Fixed random_state so the forest is built the same way on every run
rf = RandomForestClassifier(random_state=42)

# fit() returns the fitted search object, so construction and fitting chain
rf_gs = GridSearchCV(estimator=rf, param_grid=params_rf, cv=5).fit(X_train, y_train)

# Report the hyperparameters selected by the search
print("Random Forest Best Params:", rf_gs.best_params_)

# Score the fitted search on the held-out validation split
rf_val_acc = rf_gs.score(X_val, y_val)
print("Random Forest Validation Accuracy:", rf_val_acc)

# Score the fitted search on the test set
rf_test_acc = rf_gs.score(X_test, y_test)
print("Random Forest Test Accuracy:", rf_test_acc)

Random Forest Best Params: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}
Random Forest Validation Accuracy: 0.9452380952380952
Random Forest Test Accuracy: 0.9359218675559791
