In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Directory holding the pre-computed, dimension-reduced feature arrays and
# the matching label arrays (all stored as .npy files).
data_dir = 'dimension_reduced_data_100_res_95_var'

# Flags selecting which reduced feature set to load; at least one must be set.
use_pca = True
use_lda = True

# Fail fast on an invalid configuration. Without this guard, disabling both
# flags skips every branch below and the script only dies later with a
# NameError on X_train, which hides the real mistake.
if not (use_pca or use_lda):
    raise ValueError(
        'No feature set selected: set use_pca and/or use_lda to True.'
    )

# Pick the file-name suffix for the requested reduction and announce it.
if use_pca and use_lda:
    print('using pca and lda model')
    feature_suffix = 'pca_lda'
elif use_pca:
    print('using pca model')
    feature_suffix = 'pca'
else:
    print('using lda model')
    feature_suffix = 'lda'

# Train / validation features for the selected reduction.
X_train = np.load(f'{data_dir}/X_train_{feature_suffix}.npy')
X_val = np.load(f'{data_dir}/X_val_{feature_suffix}.npy')

# Labels are shared by all feature sets.
y_train = np.load(f'{data_dir}/y_train.npy')
y_val = np.load(f'{data_dir}/y_val.npy')

print(X_train.shape)

def _majority_vote(*predictions):
    """Combine per-learner label predictions by hard majority voting.

    Args:
        *predictions: One 1-D array of predicted labels per learner, all of
            the same length. Labels must be non-negative integers because
            they are counted with ``np.bincount``.

    Returns:
        A 1-D array holding, for each sample, the label predicted by the
        most learners. When several labels tie for the most votes, the
        smallest label wins (``np.argmax`` returns the first maximum).
    """
    # One row per sample, one column per learner.
    votes_per_sample = np.stack(predictions, axis=1)
    return np.array([np.argmax(np.bincount(votes)) for votes in votes_per_sample])


# Define base learners
svm = SVC(kernel='rbf', C=1, gamma='auto')
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)

# Train base learners
svm.fit(X_train, y_train)
random_forest.fit(X_train, y_train)
knn.fit(X_train, y_train)

# Make predictions on the training data
yp_train_svm = svm.predict(X_train)
yp_train_rf = random_forest.predict(X_train)
yp_train_knn = knn.predict(X_train)

# Ensemble predictions (majority voting)
# NOTE(review): with three voters, a three-way disagreement falls back to the
# smallest label index; confirm that this tie-break is acceptable.
yp_train_ensemble = _majority_vote(yp_train_svm, yp_train_rf, yp_train_knn)

# Calculate training accuracy
training_accuracy = accuracy_score(y_train, yp_train_ensemble)

# Make predictions on validation data
yp_val_svm = svm.predict(X_val)
yp_val_rf = random_forest.predict(X_val)
yp_val_knn = knn.predict(X_val)

# Ensemble predictions for validation data
yp_val_ensemble = _majority_vote(yp_val_svm, yp_val_rf, yp_val_knn)

# Calculate validation accuracy
validation_accuracy = accuracy_score(y_val, yp_val_ensemble)



# Class name -> integer label used in y_train / y_val.
label_mapping = {
    'amarisian': 0,
    'banmingkai': 1,
    'chenziang': 2,
    'chientingwei': 3,
    'gowdarachandrashekarappasrivarsha': 4,
    'huangjiaoyan': 5,
    'kodipunzulanandini': 6,
    'lishumeng': 7,
    'liuhongji': 8,
    'lozanoroberto': 9,
    'manglaniroshanlakhi': 10,
    'mendonakshay': 11,
    'negiparth': 12,
    'oraisisaac': 13,
    'perambuduruvishnu': 14,
    'pereiranerissagodfrey': 15,
    'ravijayanthidhanasekar': 16,
    'sampagaonrahul': 17,
    'selinayu': 18,
    'shahmanali': 19,
    'sivarajusairevanth': 20,
    'somaniachal': 21,
    'upadhyevaishnavi': 22,
    'vanderlindenilona': 23,
    'vennavellirajashekarreddy': 24,
    'virvadianisargjyotin': 25,
    'wukaiyue': 26,
    'yashasvi': 27,
    'zhangyuanzhen': 28,
    'zhouchuandi': 29,
    'zotaharsh': 30,
    'zuluagagonzalezisabel': 31,
}

# Class names ordered by their integer label, used as heatmap tick labels.
labels_ordered = sorted(label_mapping, key=label_mapping.get)

# Confusion matrix for the validation split, with one row/column per known
# class so that classes absent from y_val still appear.
class_indices = range(len(labels_ordered))
cm = confusion_matrix(y_val, yp_val_ensemble, labels=class_indices)
print(y_val,yp_val_ensemble)

# Draw the confusion matrix as an annotated heatmap.
plt.figure(figsize=(10, 7))
heatmap_axes = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    xticklabels=labels_ordered,
    yticklabels=labels_ordered,
)
heatmap_axes.set_title("Confusion Matrix")
heatmap_axes.set_ylabel("True Label")
heatmap_axes.set_xlabel("Predicted Label")
plt.show()

# Training accuracy first, then validation accuracy.
print("Accuracy:", training_accuracy, validation_accuracy)


using pca and lda model


FileNotFoundError: [Errno 2] No such file or directory: 'dimension_reduced_data_100_res_90_var/X_train_pca_lda.npy'