In [1]:
import struct
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import joblib
import struct
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.svm import SVC
import time
from sklearn.ensemble import BaggingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler


In [2]:
# Read the pre-processed EMNIST arrays from the Kaggle input dataset:
# the image array first, then the matching label array.
images, labels = (
    np.load(npy_path)
    for npy_path in (
        "/kaggle/input/emnist-preprocess/images.npy",
        "/kaggle/input/emnist-preprocess/labels.npy",
    )
)

In [None]:
# Sanity check: show the dimensions of both loaded arrays (images, then labels).
for loaded_array in (images, labels):
    print(loaded_array.shape)

In [4]:
# One hot encoding
# The cell rebinds `labels`, so it is guarded to stay idempotent: only raw class
# ids (a 1-D vector or an (N, 1) column) are encoded. Re-running the cell on an
# already one-hot matrix would otherwise encode the 0/1 entries a second time.
if labels.ndim == 1 or labels.shape[-1] == 1:
    labels = to_categorical(labels)

In [6]:
# Hold out 20% of the samples as the test set; the fixed seed keeps the split reproducible.
split_parts = train_test_split(
    images,
    labels,
    test_size=0.2,
    random_state=42,
)
train_x, test_x, train_y, test_y = split_parts

# Model setup

## CNN

In [8]:
# Small CNN: one convolution + max-pooling stage feeding two dense layers.
# The softmax width is taken from the one-hot training labels rather than being
# hard-coded, so the output layer always matches what to_categorical produced
# (categorical_crossentropy requires the two to agree).
num_classes = train_y.shape[1]

cnn_model = Sequential([
    Input(shape=(28, 28, 1)),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(512, activation='relu'),
    Dense(128, activation='relu'),
    Dense(num_classes, activation='softmax'),
])
cnn_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Display the Keras summary of the CNN built above.
cnn_model.summary()

In [None]:
# Stop once validation accuracy has not improved for 10 epochs and roll back to
# the best weights seen.
ERS = EarlyStopping(monitor='val_accuracy', min_delta=0, verbose=0, restore_best_weights=True, patience=10, mode='max')

# Early stopping is monitored on a slice of the TRAINING data, not on the test
# split. Selecting the restored weights on test_x/test_y would leak the test set
# into model selection and inflate every test metric reported afterwards.
# validation_split takes the last 10% of the samples; train_x was already
# shuffled by train_test_split, so that tail is a random subset.
history = cnn_model.fit(
    train_x.reshape(train_x.shape[0], 28, 28, 1),
    train_y,
    epochs=100,
    validation_split=0.1,
    callbacks=[ERS],
    verbose=2,
)

In [11]:
# Persist the trained CNN under /kaggle/working as a .keras file.
cnn_model.save('/kaggle/working/emnist_cnn_model.keras')

In [None]:
# Training curves from the Keras history: accuracy on the left, loss on the right.
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 6))
epoch_metrics = history.history

# Left panel: train vs. validation accuracy per epoch.
acc_ax.plot(epoch_metrics['accuracy'], label='Train Accuracy')
acc_ax.plot(epoch_metrics['val_accuracy'], label='Validation Accuracy')
acc_ax.set_title('Train and Validation Accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

# Right panel: train vs. validation loss per epoch.
loss_ax.plot(epoch_metrics['loss'], label='Train Loss')
loss_ax.plot(epoch_metrics['val_loss'], label='Validation Loss')
loss_ax.set_title('Train and Validation Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# Tidy the spacing between the panels and render the figure.
fig.tight_layout()
plt.show()

## KNN

In [None]:
# Flatten each 28x28 image into a 784-long feature vector for the classical models.
train_x_reshaped = train_x.reshape(len(train_x), 784)
# Collapse the one-hot rows back into integer class ids.
train_y_int = train_y.argmax(axis=1)
# Neighbour counts to evaluate (3 through 11) and the list their CV scores go into.
k_values = range(3, 12)
cv_scores = list()

In [None]:
# Score every candidate k with 5-fold cross-validated accuracy.
# cv_scores is reset here because it is otherwise only initialised in an earlier
# cell: re-running this cell alone would append a second batch of scores, leaving
# cv_scores longer than k_values and breaking the plot and best-k lookup below.
cv_scores = []
for k in k_values:
    knn_model = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn_model, train_x_reshaped, train_y_int, cv=5, scoring='accuracy')
    cv_scores.append(scores.mean())

In [None]:
# Mean cross-validation accuracy as a function of k.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, cv_scores, marker='o')
ax.set_title('KNN Cross-Validation Performance with Different k Values')
ax.set_xlabel('k (Number of Neighbors)')
ax.set_ylabel('Mean Accuracy (from Cross-Validation)')
ax.set_xticks(k_values)  # one tick per evaluated k
ax.grid(True)
plt.show()

In [None]:
# Choose the k with the highest mean CV accuracy, refit on the full training
# set (all CPU cores), and persist the fitted classifier.
top_score_position = int(np.argmax(cv_scores))
best_k = k_values[top_score_position]
best_knn_model = KNeighborsClassifier(n_neighbors=best_k, n_jobs=-1).fit(
    train_x_reshaped, train_y_int
)
joblib.dump(best_knn_model, '/kaggle/working/emnist_knn_model.pkl')

In [None]:
# Prepare the test split the same way as the training split:
# flattened 784-long vectors and integer class ids.
test_x_reshaped = test_x.reshape(len(test_x), 784)
test_y_int = test_y.argmax(axis=1)

# Fraction of test samples the selected KNN model labels correctly.
test_predictions = best_knn_model.predict(test_x_reshaped)
test_accuracy = (test_predictions == test_y_int).mean()

print(f"Test Accuracy with k={best_k}: {test_accuracy * 100:.2f}%")

## SVM

In [None]:
# Candidate SVC kernels and C values swept by the training loop in the next cell.
kernels = ['linear', 'rbf']
C_values = [1, 0.1]

In [None]:
# Manual sweep: train and persist one SVC per (kernel, C) combination,
# in the same order as nesting C inside kernel.
svm_settings = [(kernel_name, c_value) for kernel_name in kernels for c_value in C_values]

for kernel, C in svm_settings:
    print(f"Training SVM with kernel={kernel} and C={C}")

    # Build the classifier and train it in one step (fit returns the estimator).
    model = SVC(kernel=kernel, C=C, random_state=42).fit(train_x_reshaped, train_y_int)

    # Save the fitted model under a file name that encodes its settings.
    model_filename = f'svm_model_{kernel}_C{C}.pkl'
    joblib.dump(model, model_filename)
    print(f"Model saved as {model_filename}\n")

In [None]:
from sklearn.model_selection import GridSearchCV

# Cross-validated search over the C / kernel grid.
# refit=True retrains the winning configuration, exposed as grid.best_estimator_.
param_grid = dict(C=[0.1, 1], kernel=['linear', 'rbf'])
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True, verbose=2)
grid.fit(train_x_reshaped, train_y_int)
print(grid.best_params_)

# Ensemble method (voting)

In [None]:
from sklearn.ensemble import VotingClassifier  # import retained; no longer used in this cell


class PrefitHardVotingEnsemble:
    """Hard (majority) voting over an already-trained Keras CNN and scikit-learn models.

    sklearn's VotingClassifier cannot be used here: it clones and refits every
    member, but a Keras model is not a scikit-learn estimator, and the CNN needs
    image-shaped inputs with one-hot targets rather than the flattened vectors
    and integer labels the other members use. This wrapper reuses the fitted
    members as they are and only combines their predictions.

    Parameters
    ----------
    keras_model : object whose ``predict`` returns per-class scores of shape (N, n_classes).
    sklearn_models : iterable of fitted classifiers whose ``predict`` returns integer
        class ids for flattened inputs.
    image_shape : per-sample shape the Keras model expects.
    """

    def __init__(self, keras_model, sklearn_models, image_shape=(28, 28, 1)):
        self.keras_model = keras_model
        self.sklearn_models = list(sklearn_models)
        self.image_shape = tuple(image_shape)

    def fit(self, X=None, y=None):
        """No-op kept for call compatibility: every member is already trained."""
        return self

    def predict(self, X):
        """Return the majority-vote class id for each sample in ``X``.

        ``X`` may be flattened or image-shaped. Ties are resolved in favour of
        the lowest class id, as with a bincount/argmax vote.
        """
        flat_x = np.asarray(X).reshape(len(X), -1)

        # The CNN votes with the arg-max of its softmax output.
        cnn_scores = self.keras_model.predict(flat_x.reshape((-1,) + self.image_shape))
        member_votes = [np.argmax(cnn_scores, axis=1)]
        member_votes.extend(
            np.asarray(member.predict(flat_x)).astype(int) for member in self.sklearn_models
        )

        # Tally one vote per member for every sample, then take the most voted class.
        vote_counts = np.zeros((flat_x.shape[0], cnn_scores.shape[1]), dtype=int)
        sample_index = np.arange(flat_x.shape[0])
        for votes in member_votes:
            np.add.at(vote_counts, (sample_index, votes), 1)
        return vote_counts.argmax(axis=1)


ensemble_model = PrefitHardVotingEnsemble(
    cnn_model,
    [best_knn_model, grid.best_estimator_],
)
ensemble_model.fit(train_x_reshaped, train_y_int)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall of the CNN on the held-out test split.
# The inputs are reshaped to (N, 28, 28, 1) exactly as at training time, so the
# prediction call does not depend on how images.npy happens to be laid out.
test_predictions = cnn_model.predict(test_x.reshape(test_x.shape[0], 28, 28, 1))
print(classification_report(test_y_int, test_predictions.argmax(axis=1)))

# Evaluation

## CNN Model

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Evaluate CNN model on test data.
# The inputs are reshaped to (N, 28, 28, 1) to match the shape used in
# cnn_model.fit; the raw test_x layout is not guaranteed to be image-shaped.
cnn_test_predictions = cnn_model.predict(test_x.reshape(test_x.shape[0], 28, 28, 1))
# Softmax scores -> predicted class id per sample.
cnn_test_predictions_classes = cnn_test_predictions.argmax(axis=1)

# Print classification report
print("CNN Model Classification Report:")
print(classification_report(test_y_int, cnn_test_predictions_classes))

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(confusion_matrix(test_y_int, cnn_test_predictions_classes), annot=True, fmt='d', cmap='Blues')
plt.title('CNN Model Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

## KNN Model

In [None]:
# Predict the flattened test images with the selected KNN model.
knn_test_predictions = best_knn_model.predict(test_x_reshaped)

# Text report: per-class precision / recall / F1.
print(f"KNN Model Classification Report (k={best_k}):")
print(classification_report(test_y_int, knn_test_predictions))

# Confusion matrix rendered as an annotated heatmap.
knn_confusion = confusion_matrix(test_y_int, knn_test_predictions)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(knn_confusion, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title(f'KNN Model Confusion Matrix (k={best_k})')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()

## SVM Model

In [None]:
# Use the estimator that the grid search refit with its best parameters.
best_svm_model = grid.best_estimator_

# Describe the model by the parameters the search actually selected instead of
# a hard-coded "kernel=rbf, C=1", which would mislabel the report and the plot
# whenever a different combination wins.
best_svm_params = grid.best_params_
svm_description = f"kernel={best_svm_params['kernel']}, C={best_svm_params['C']}"

# Evaluate SVM model on test data
svm_test_predictions = best_svm_model.predict(test_x_reshaped)

# Print classification report
print(f"SVM Model Classification Report ({svm_description}):")
print(classification_report(test_y_int, svm_test_predictions))

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(confusion_matrix(test_y_int, svm_test_predictions), annot=True, fmt='d', cmap='Blues')
plt.title(f'SVM Model Confusion Matrix ({svm_description})')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

## Ensemble Model

In [None]:
# Predict the flattened test images with the voting ensemble.
ensemble_test_predictions = ensemble_model.predict(test_x_reshaped)

# Text report: per-class precision / recall / F1.
print("Ensemble Model Classification Report:")
print(classification_report(test_y_int, ensemble_test_predictions))

# Confusion matrix rendered as an annotated heatmap.
ensemble_confusion = confusion_matrix(test_y_int, ensemble_test_predictions)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(ensemble_confusion, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Ensemble Model Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()