#### Extracting the data set

In [43]:
import numpy as np

# Load data
data_train = np.load(r"Kannada_MNIST_datataset_paper\Kannada_MNIST_npz\Kannada_MNIST\X_kannada_MNIST_train.npz")
data_test = np.load(r"Kannada_MNIST_datataset_paper\Kannada_MNIST_npz\Kannada_MNIST\X_kannada_MNIST_test.npz")

# Extract arrays from npz file
X_train = data_train['arr_0']
X_test = data_test['arr_0']

# Load labels
y_train = np.load(r"Kannada_MNIST_datataset_paper\Kannada_MNIST_npz\Kannada_MNIST\y_kannada_MNIST_train.npz")['arr_0']
y_test = np.load(r"Kannada_MNIST_datataset_paper\Kannada_MNIST_npz\Kannada_MNIST\y_kannada_MNIST_test.npz")['arr_0']

# Reshape the data to flatten the images
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)


#### PCA is a dimensionality reduction technique that aims to capture the most significant patterns in the data.

In [49]:
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelBinarizer

def pca_and_evaluate(X_train_flat, X_test_flat, y_train, y_test, n_component):
    # Perform PCA
    pca = PCA(n_components=n_component)  
    X_train_pca = pca.fit_transform(X_train_flat)
    X_test_pca = pca.transform(X_test_flat)
    
    # Initialize lists to store results
    results = []

    # Initialize models
    models = {
        'Decision Tree': DecisionTreeClassifier(),
        'Random Forest': RandomForestClassifier(),
        'Naive Bayes': GaussianNB(),
        'K-NN': KNeighborsClassifier(),
        'SVM': SVC()
    }

    # Train and evaluate each model
    for name, model in models.items():
        model.fit(X_train_pca, y_train)
        y_pred = model.predict(X_test_pca)

        # Calculate metrics
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
        conf_matrix = confusion_matrix(y_test, y_pred)

        # Convert labels to binary format for ROC-AUC calculation
        lb = LabelBinarizer()
        y_test_bin = lb.fit_transform(y_test)
        y_pred_bin = lb.transform(y_pred)
        roc_auc = roc_auc_score(y_test_bin, y_pred_bin, average='macro')

        # Append results to the list
        results.append({
            'Model': name,
            'Precision': precision,
            'Recall': recall,
            'F1-score': f1,
            'Confusion Matrix': conf_matrix,
            'ROC-AUC Score': roc_auc
        })

    # Convert list of dictionaries to DataFrame
    results_df = pd.DataFrame(results)

    # Return the results DataFrame
    return results_df


#### Performing PCA (Principal Component Analysis)

In [50]:
pca_and_evaluate(X_train_flat, X_test_flat, y_train, y_test, 10)

Unnamed: 0,Model,Precision,Recall,F1-score,Confusion Matrix,ROC-AUC Score
0,Decision Tree,0.798218,0.7974,0.796624,"[[721, 146, 15, 38, 5, 4, 2, 12, 35, 22], [77,...",0.887444
1,Random Forest,0.877516,0.8744,0.873687,"[[783, 151, 3, 29, 3, 0, 2, 3, 19, 7], [37, 90...",0.930222
2,Naive Bayes,0.779827,0.7727,0.771329,"[[609, 229, 16, 70, 8, 0, 2, 13, 44, 9], [25, ...",0.873722
3,K-NN,0.882688,0.8794,0.878293,"[[772, 167, 3, 29, 2, 0, 2, 3, 16, 6], [15, 94...",0.933
4,SVM,0.890241,0.8868,0.886259,"[[811, 139, 1, 22, 6, 0, 1, 2, 15, 3], [20, 92...",0.937111


In [51]:
pca_and_evaluate(X_train_flat, X_test_flat, y_train, y_test, 15)

Unnamed: 0,Model,Precision,Recall,F1-score,Confusion Matrix,ROC-AUC Score
0,Decision Tree,0.804511,0.8044,0.803411,"[[711, 158, 13, 31, 8, 3, 5, 13, 43, 15], [89,...",0.891333
1,Random Forest,0.895206,0.892,0.891444,"[[797, 156, 1, 20, 4, 0, 2, 3, 14, 3], [21, 92...",0.94
2,Naive Bayes,0.787036,0.7837,0.782144,"[[602, 236, 19, 47, 11, 0, 2, 13, 61, 9], [17,...",0.879833
3,K-NN,0.913676,0.9101,0.90951,"[[801, 163, 2, 13, 5, 0, 1, 2, 10, 3], [15, 96...",0.950056
4,SVM,0.917075,0.9147,0.914221,"[[826, 128, 1, 17, 5, 0, 0, 5, 13, 5], [14, 95...",0.952611


In [52]:
pca_and_evaluate(X_train_flat, X_test_flat, y_train, y_test, 20)

Unnamed: 0,Model,Precision,Recall,F1-score,Confusion Matrix,ROC-AUC Score
0,Decision Tree,0.810308,0.811,0.810146,"[[737, 138, 6, 30, 4, 4, 10, 11, 38, 22], [83,...",0.895
1,Random Forest,0.903513,0.8998,0.899123,"[[801, 149, 1, 21, 4, 0, 2, 2, 14, 6], [12, 93...",0.944333
2,Naive Bayes,0.799701,0.7965,0.795104,"[[621, 230, 15, 47, 9, 0, 2, 11, 56, 9], [16, ...",0.886944
3,K-NN,0.924532,0.9207,0.920505,"[[816, 150, 1, 13, 7, 0, 0, 1, 6, 6], [10, 971...",0.955944
4,SVM,0.933504,0.9313,0.930951,"[[854, 113, 0, 15, 3, 0, 0, 3, 9, 3], [8, 964,...",0.961833


In [53]:
pca_and_evaluate(X_train_flat, X_test_flat, y_train, y_test, 25)

Unnamed: 0,Model,Precision,Recall,F1-score,Confusion Matrix,ROC-AUC Score
0,Decision Tree,0.80592,0.8063,0.805364,"[[734, 147, 8, 30, 5, 10, 5, 10, 35, 16], [83,...",0.892389
1,Random Forest,0.906518,0.9034,0.902858,"[[803, 148, 1, 19, 6, 0, 2, 2, 15, 4], [15, 93...",0.946333
2,Naive Bayes,0.808526,0.8053,0.8041,"[[632, 229, 16, 49, 8, 0, 2, 9, 48, 7], [14, 8...",0.891833
3,K-NN,0.92981,0.9261,0.925852,"[[822, 149, 0, 11, 5, 0, 0, 1, 8, 4], [5, 977,...",0.958944
4,SVM,0.936898,0.9351,0.934803,"[[860, 112, 0, 11, 3, 0, 0, 2, 9, 3], [9, 964,...",0.963944


In [54]:
pca_and_evaluate(X_train_flat, X_test_flat, y_train, y_test, 30)

Unnamed: 0,Model,Precision,Recall,F1-score,Confusion Matrix,ROC-AUC Score
0,Decision Tree,0.804441,0.8052,0.804188,"[[729, 145, 13, 20, 5, 7, 7, 8, 47, 19], [94, ...",0.891778
1,Random Forest,0.906352,0.9035,0.90264,"[[794, 153, 1, 24, 3, 0, 2, 3, 14, 6], [12, 94...",0.946389
2,Naive Bayes,0.815682,0.8128,0.811548,"[[639, 234, 17, 34, 10, 1, 2, 12, 43, 8], [11,...",0.896
3,K-NN,0.934321,0.9309,0.93059,"[[820, 156, 0, 8, 4, 0, 0, 2, 6, 4], [7, 981, ...",0.961611
4,SVM,0.94103,0.9394,0.939102,"[[863, 111, 0, 9, 2, 0, 1, 3, 8, 3], [8, 970, ...",0.966333
