In [3]:
# Number of training images = 5000
import numpy as np
import pandas as pd
import cv2
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def load_data(data_dir):
    images = []
    labels = []
    for filename in os.listdir(data_dir):
        if filename.endswith(".jpg"):
            label = int(filename.split("_")[-1].split('.')[0])
            labels.append(label)
            image = cv2.imread(os.path.join(data_dir, filename), cv2.IMREAD_GRAYSCALE)
            images.append(image)
    return np.array(images), np.array(labels)


data_dir = 'Chinese_MINST_Dataset/data/data'

images, labels = load_data(data_dir)

images = images / 255.0

images = images.reshape((images.shape[0], 64 * 64))

X_train, X_test, y_train, y_test = train_test_split(images, labels, train_size=5000, test_size=1000, stratify=labels)

scaler = StandardScaler()
pca = PCA(n_components=100)

knn = KNeighborsClassifier(n_neighbors=3)
knn_pipeline = Pipeline([
    ('scaler', scaler),
    ('pca', pca),
    ('knn', knn)
])

dt = DecisionTreeClassifier()
dt_pipeline = Pipeline([
    ('scaler', scaler),
    ('pca', pca),
    ('dt', dt)
])

sgd = SGDClassifier(max_iter=250)
sgd_pipeline = Pipeline([
    ('scaler', scaler),
    ('pca', pca),
    ('sgd', sgd)
])

classifiers = {
    'KNN': knn_pipeline,
    'Decision Tree': dt_pipeline,
    'SGD': sgd_pipeline
}
results = {}

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    results[name] = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='macro'),
        'recall': recall_score(y_test, y_pred, average='macro'),
        'f1_score': f1_score(y_test, y_pred, average='macro'),
        'confusion_matrix': confusion_matrix(y_test, y_pred)
    }

for name, result in results.items():
    print(f"Results for {name}:")
    print(f"Accuracy: {result['accuracy']}")
    print(f"Precision: {result['precision']}")
    print(f"Recall: {result['recall']}")
    print(f"F1 Score: {result['f1_score']}")
    print(f"Confusion Matrix:\n{result['confusion_matrix']}\n")

Results for KNN:
Accuracy: 0.606
Precision: 0.6034469046110191
Recall: 0.6058344640434192
F1 Score: 0.5972598085000878
Confusion Matrix:
[[55  0  0  1  1  0  2  0  0  0  1  4  1  1  0]
 [ 1 62  1  0  0  0  1  0  0  0  0  0  2  0  0]
 [ 0  9 46  7  0  1  1  1  0  0  0  0  1  0  1]
 [ 2  4 26 30  0  3  0  0  0  0  0  1  0  0  0]
 [ 2  1  0  0 53  0  1  1  0  3  1  0  0  3  2]
 [ 1  0  5 14  3 33  0  2  2  2  0  4  0  0  1]
 [ 1  9  0  0  3  0 40  1  2  5  1  0  2  1  1]
 [ 2  0  2  4  3  2 11 29  1  9  1  1  0  0  2]
 [ 0  1  0  0  2  0  2  2 60  0  0  0  0  0  0]
 [ 2  1  0  1  3  4  5  4  2 29  1  0  0  3 12]
 [ 1  0  2  0  0  0  3  3  0  2 45  1 10  0  0]
 [ 6  0  0  2  6  4  1  4  2  2  1 27  0 12  0]
 [ 0  1  3  2  1  1  3  1  0  0 21  1 30  1  1]
 [ 9  0  0  0  5  1  5  2  0  3  0  8  1 33  0]
 [ 0  0  1  1 10  0  3  7  2  6  0  1  1  0 34]]

Results for Decision Tree:
Accuracy: 0.393
Precision: 0.39817979029181344
Recall: 0.3929745213327303
F1 Score: 0.39358537342826205
Confusion 



In [9]:
# Number of training images = 10000
import numpy as np
import pandas as pd
import cv2
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def load_data(data_dir):
    images = []
    labels = []
    for filename in os.listdir(data_dir):
        if filename.endswith(".jpg"):
            label = int(filename.split("_")[-1].split('.')[0])
            labels.append(label)
            image = cv2.imread(os.path.join(data_dir, filename), cv2.IMREAD_GRAYSCALE)
            images.append(image)
    return np.array(images), np.array(labels)


data_dir = 'Chinese_MINST_Dataset/data/data'

images, labels = load_data(data_dir)

images = images / 255.0

images = images.reshape((images.shape[0], 64 * 64))

X_train, X_test, y_train, y_test = train_test_split(images, labels, train_size=10000, test_size=1000, stratify=labels)

scaler = StandardScaler()
pca = PCA(n_components=100)

knn = KNeighborsClassifier(n_neighbors=3)
knn_pipeline = Pipeline([
    ('scaler', scaler),
    ('pca', pca),
    ('knn', knn)
])

dt = DecisionTreeClassifier()
dt_pipeline = Pipeline([
    ('scaler', scaler),
    ('pca', pca),
    ('dt', dt)
])

sgd = SGDClassifier(max_iter=250)
sgd_pipeline = Pipeline([
    ('scaler', scaler),
    ('pca', pca),
    ('sgd', sgd)
])

classifiers = {
    'KNN': knn_pipeline,
    'Decision Tree': dt_pipeline,
    'SGD': sgd_pipeline
}
results = {}

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    results[name] = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='macro'),
        'recall': recall_score(y_test, y_pred, average='macro'),
        'f1_score': f1_score(y_test, y_pred, average='macro'),
        'confusion_matrix': confusion_matrix(y_test, y_pred)
    }

for name, result in results.items():
    print(f"Results for {name}:")
    print(f"Accuracy: {result['accuracy']}")
    print(f"Precision: {result['precision']}")
    print(f"Recall: {result['recall']}")
    print(f"F1 Score: {result['f1_score']}")
    print(f"Confusion Matrix:\n{result['confusion_matrix']}\n")


Results for KNN:
Accuracy: 0.718
Precision: 0.7211243222575905
Recall: 0.7179255238956731
F1 Score: 0.7125351387937517
Confusion Matrix:
[[59  0  1  1  2  0  1  0  0  0  0  0  1  2  0]
 [ 0 65  1  0  0  0  1  0  0  0  0  0  0  0  0]
 [ 0 14 44  7  0  1  0  0  1  0  0  0  0  0  0]
 [ 1  7  9 42  0  5  0  0  0  0  0  0  1  1  0]
 [ 5  0  0  1 51  1  0  1  1  1  0  0  0  2  4]
 [ 0  0  6 10  1 43  0  1  1  2  0  2  0  0  0]
 [ 0  4  2  0  0  1 51  3  1  2  2  0  0  0  0]
 [ 0  0  4  2  2  6  5 40  2  3  0  0  0  0  3]
 [ 0  0  1  0  1  0  0  0 65  0  0  0  0  0  0]
 [ 4  1  0  0  2  0  8  9  1 30  1  1  0  1  9]
 [ 0  1  0  0  0  0  5  0  0  0 50  0 10  0  0]
 [ 1  0  0  0  1  4  1  1  1  1  0 45  0 11  1]
 [ 1  0  0  5  0  0  1  0  0  0 21  0 38  0  1]
 [ 2  0  0  0  3  0  7  1  1  0  0  4  0 49  0]
 [ 1  0  0  0  5  0  2  3  6  1  0  1  1  0 46]]

Results for Decision Tree:
Accuracy: 0.466
Precision: 0.47311342517990435
Recall: 0.4657771747323985
F1 Score: 0.4677725876567445
Confusion M

