## MODULE FOR TRAINING DIFFERENT CLASSIFICATION ALGORITHMS AND OBTAINING RESULTS IN LINE WITH THE PROFESSOR'S REQUIREMENTS

In [None]:
# Number of samples handled per chunk; small chunks keep the pairwise
# distance matrix from exhausting memory.
BATCH_SIZE = 10


# Minimum Euclidean Distance Classifier with dual batch processing
def minimum_euclidean_classifier(train_data, train_labels, test_data, batch_size=BATCH_SIZE):
    """Label each test sample with the label of its closest training sample.

    Both the test set and the training set are walked in chunks of
    ``batch_size`` rows, so only a ``batch_size x batch_size`` distance
    matrix exists at any one time.

    Parameters
    ----------
    train_data : 2-D array, one training sample per row.
    train_labels : labels indexed in step with the rows of ``train_data``.
    test_data : 2-D array, one test sample per row.
    batch_size : number of rows per chunk (used for both sets).

    Returns
    -------
    1-D float array with one predicted label per test row.
    """
    total_test = test_data.shape[0]
    total_train = train_data.shape[0]
    predicted_labels = np.zeros(total_test)

    for test_lo in range(0, total_test, batch_size):
        test_hi = min(test_lo + batch_size, total_test)
        queries = test_data[test_lo:test_hi]
        chunk_len = test_hi - test_lo

        # Running best (distance, label) for every query in this chunk
        nearest_dist = np.full(chunk_len, np.inf)
        nearest_label = np.zeros(chunk_len)

        for train_lo in range(0, total_train, batch_size):
            train_hi = min(train_lo + batch_size, total_train)
            references = train_data[train_lo:train_hi]

            # Pairwise Euclidean distances: rows are queries, columns are references
            pairwise = np.linalg.norm(queries[:, np.newaxis] - references, axis=2)

            local_best = np.argmin(pairwise, axis=1)
            local_dist = pairwise[np.arange(chunk_len), local_best]

            # Strict '<' means an earlier training sample wins a distance tie
            improved = local_dist < nearest_dist
            for pos in np.flatnonzero(improved):
                nearest_dist[pos] = local_dist[pos]
                nearest_label[pos] = train_labels[train_lo + local_best[pos]]

        predicted_labels[test_lo:test_hi] = nearest_label

    return predicted_labels

# KNN Classifier with batch processing
def knn_classifier(train_data, train_labels, test_data, k=15, batch_size=10):
    """Classify test samples by majority vote among their k nearest training samples.

    Distances are Euclidean. The test set is processed in chunks of
    ``batch_size`` rows so the distance matrix stays small.

    Parameters
    ----------
    train_data : 2-D array, one training sample per row.
    train_labels : integer-valued labels (cast with ``astype(int)``), one per
        training row. Negative labels such as -1/+1 are supported.
    test_data : 2-D array, one test sample per row.
    k : number of neighbours that vote; if it exceeds the number of training
        samples, every training sample votes.
    batch_size : number of test rows handled per chunk.

    Returns
    -------
    1-D float array with one predicted label per test row. When several
    labels tie for the most votes, the smallest label wins.

    Raises
    ------
    ValueError
        If ``k`` or ``batch_size`` is smaller than 1, or if there are test
        samples but no training samples.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    n_test = test_data.shape[0]
    predicted_labels = np.zeros(n_test)
    if n_test == 0:
        return predicted_labels
    if train_data.shape[0] == 0:
        raise ValueError("train_data must contain at least one sample")

    # Encode labels as 0..C-1 once. np.bincount cannot count negative values,
    # so votes are counted over the codes and mapped back to real labels.
    # np.unique returns sorted values, so the lowest code is the lowest label.
    class_values, class_codes = np.unique(
        np.asarray(train_labels).astype(int).ravel(), return_inverse=True
    )
    class_codes = class_codes.ravel()

    for i in range(0, n_test, batch_size):
        end_idx = min(i + batch_size, n_test)
        test_batch = test_data[i:end_idx]

        # Distances between every test row in the chunk and every training row
        distances = np.linalg.norm(test_batch[:, np.newaxis] - train_data, axis=2)

        # Indices of the k closest training samples for each test row
        knn_indices = np.argsort(distances, axis=1)[:, :k]

        # Majority vote; argmax returns the first (smallest) code on ties
        for offset, votes in enumerate(class_codes[knn_indices]):
            predicted_labels[i + offset] = class_values[np.bincount(votes).argmax()]

    return predicted_labels

# Neural Network Classifier with sigmoid activation function
def neural_network(X_train, y_train, X_test, hidden_layer_size=15, num_epochs=500, learning_rate=3):
    """Train a one-hidden-layer network and predict labels for ``X_test``.

    The network uses a sigmoid hidden layer and a softmax output trained with
    full-batch gradient descent on the cross-entropy loss.

    Parameters
    ----------
    X_train : 2-D array of training features, one sample per row.
    y_train : integer-valued training labels (cast with ``astype(int)``).
        Labels do not have to be contiguous or start at 0.
    X_test : 2-D array of test features with the same number of columns.
    hidden_layer_size : number of hidden units.
    num_epochs : number of full-batch gradient steps.
    learning_rate : gradient-descent step size.

    Returns
    -------
    1-D integer array of predicted labels, drawn from the labels seen in
    ``y_train``.

    Raises
    ------
    ValueError
        If ``y_train`` is empty.

    Notes
    -----
    Calls ``np.random.seed(42)``, i.e. it reseeds NumPy's global generator so
    that weight initialisation is reproducible.
    """
    np.random.seed(42)

    # Work in float so object-dtype inputs do not break np.exp
    X_train = np.asarray(X_train, dtype=float)
    X_test = np.asarray(X_test, dtype=float)

    # Map labels to column indices 0..C-1. Using the raw labels as indices
    # fails (or silently trains the wrong column) when labels are not 0..C-1.
    class_values, y_index = np.unique(
        np.asarray(y_train).astype(int).ravel(), return_inverse=True
    )
    y_index = y_index.ravel()
    n_samples = len(y_index)
    if n_samples == 0:
        raise ValueError("y_train must contain at least one label")
    sample_rows = np.arange(n_samples)

    n_input = X_train.shape[1]
    n_output = len(class_values)

    # Initialize weights and biases
    W1 = 0.01 * np.random.randn(n_input, hidden_layer_size)
    b1 = np.zeros((1, hidden_layer_size))
    W2 = 0.01 * np.random.randn(hidden_layer_size, n_output)
    b2 = np.zeros((1, n_output))

    # Training loop
    for _ in range(num_epochs):
        # Forward pass with sigmoid activation
        hidden_layer = 1 / (1 + np.exp(-(np.dot(X_train, W1) + b1)))
        scores = np.dot(hidden_layer, W2) + b2

        # Softmax; subtracting the row maximum leaves the result unchanged
        # mathematically but prevents overflow in np.exp for large scores.
        exp_scores = np.exp(scores - np.max(scores, axis=1, keepdims=True))
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

        # Gradient of the mean cross-entropy w.r.t. the scores: (probs - onehot) / N
        dscores = probs
        dscores[sample_rows, y_index] -= 1
        dscores /= n_samples

        # Backpropagation
        dW2 = np.dot(hidden_layer.T, dscores)
        db2 = np.sum(dscores, axis=0, keepdims=True)
        dhidden = np.dot(dscores, W2.T) * hidden_layer * (1 - hidden_layer)  # sigmoid derivative

        dW1 = np.dot(X_train.T, dhidden)
        db1 = np.sum(dhidden, axis=0, keepdims=True)

        # Update weights
        W1 -= learning_rate * dW1
        b1 -= learning_rate * db1
        W2 -= learning_rate * dW2
        b2 -= learning_rate * db2

    # Prediction for test data (softmax is monotonic, so argmax of scores suffices)
    hidden_layer = 1 / (1 + np.exp(-(np.dot(X_test, W1) + b1)))
    scores = np.dot(hidden_layer, W2) + b2

    # Translate column indices back to the original label values
    return class_values[np.argmax(scores, axis=1)]

# Function to evaluate performance of classifiers
def evaluate_performance(X_train, y_train, X_test, y_test):
    """Run the three classifiers and tabulate accuracy and timing.

    Each classifier is invoked through a single call that returns predictions
    for ``X_test``, so only one wall-clock interval per classifier can be
    measured. That interval is attributed as follows:

    * "Min Euclidean" and "KNN" have no training step (they only compare test
      samples against the stored training data), so the interval is reported
      as "Test Time" and "Train Time" is 0.
    * "Neural Network" trains and predicts inside one call; the interval is
      reported as "Train Time" and "Test Time" is 0.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : test features and the true test labels.

    Returns
    -------
    pandas.DataFrame indexed by classifier name with the columns
    "Accuracy", "Train Time" and "Test Time" (times in seconds).
    """
    # (name, zero-argument runner, whether the measured interval is training)
    classifiers = (
        ("Min Euclidean",
         lambda: minimum_euclidean_classifier(X_train, y_train, X_test),
         False),
        ("KNN",
         lambda: knn_classifier(X_train, y_train, X_test, k=15, batch_size=10),
         False),
        ("Neural Network",
         lambda: neural_network(X_train, y_train, X_test),
         True),
    )

    performance_table = {}
    for name, run, interval_is_training in classifiers:
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start = time.perf_counter()
        predicted_labels = run()
        elapsed = time.perf_counter() - start

        performance_table[name] = {
            "Accuracy": np.mean(predicted_labels == y_test),
            "Train Time": elapsed if interval_is_training else 0.0,
            "Test Time": 0.0 if interval_is_training else elapsed,
        }

    # Convert performance table to DataFrame for better readability
    return pd.DataFrame.from_dict(performance_table, orient='index')

## FUNCTION FOR PREPARING DATA FOR TRAINING

In [None]:
# function for preparing the dataset for training
def split_data(df, target_column, train_size=0.7, random_state=None):
    """Shuffle a DataFrame and split it into train/test feature and label arrays.

    Parameters
    ----------
    df : pandas.DataFrame holding the features and the label column.
    target_column : name of the label column. If no column with this name
        exists, the last column is used as the label (the previous behaviour
        of this function, which always used the last column).
    train_size : fraction of rows (0 to 1) that goes into the training set.
    random_state : optional seed forwarded to ``DataFrame.sample`` so the
        shuffle can be reproduced; ``None`` gives a different shuffle each
        call.

    Returns
    -------
    (X_train, y_train, X_test, y_test) as NumPy arrays. The feature arrays
    contain every column except the label column, in their original order.

    Raises
    ------
    ValueError
        If ``train_size`` is outside the range [0, 1].
    """
    if not 0 <= train_size <= 1:
        raise ValueError("train_size must be between 0 and 1")

    # Shuffle the rows
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Convert the DataFrame to a NumPy array
    data = shuffled.to_numpy()
    n_columns = data.shape[1]

    # Locate the label column by name, falling back to the last column
    column_names = list(shuffled.columns)
    if target_column in column_names:
        label_pos = column_names.index(target_column)
    else:
        label_pos = n_columns - 1
    feature_pos = [pos for pos in range(n_columns) if pos != label_pos]

    X = data[:, feature_pos]
    y = data[:, label_pos]

    # Row index at which the training part ends
    split_index = int(len(data) * train_size)

    X_train, X_test = X[:split_index, :], X[split_index:, :]
    y_train, y_test = y[:split_index], y[split_index:]

    return X_train, y_train, X_test, y_test

## PREPARE DATA USING THE ABOVE FUNCTION
The input dataset must be a DataFrame whose label column is the last column.

In [None]:
# Shuffle nX and divide it 70/30 into training and test arrays
X_train, y_train, X_test, y_test = split_data(
    nX,
    target_column='label',
    train_size=0.7,
)

# Report the dimensions of each resulting array
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

## TRAIN AND PRINT PERFORMANCE REPORT

In [None]:
# Benchmark every classifier on the prepared split and show the summary table
performance_results = evaluate_performance(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
print(performance_results)