In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
import time
import os

# Set the global NumPy and TensorFlow random seeds so repeated runs are
# reproducible (both libraries are seeded with the same fixed value, 42)
np.random.seed(42)
tf.random.set_seed(42)

# Configuration class to hold hyperparameters
class Config:
    """Hyperparameters and dataset constants shared across the experiment."""

    # --- Dataset ---
    IMG_SIZE = 28  # MNIST image side length in pixels
    NUM_CLASSES = 10  # Digits 0-9
    VALIDATION_SPLIT = 0.1  # Fraction of the training data held out for validation

    # --- Optimisation ---
    BATCH_SIZE = 128
    EPOCHS = 20
    LEARNING_RATE = 0.001

    # --- Patch extraction ---
    PATCH_SIZE = 7  # Side length of each local patch
    NUM_PATCHES = 16  # Patches extracted per image

    # --- Network widths ---
    LOCAL_FEATURE_DIM = 32  # Per-patch (local) feature size
    GLOBAL_FEATURE_DIM = 128  # Aggregated (global) feature size
    HIDDEN_UNITS = 256  # Units in the fully connected classifier layer
    DROPOUT_RATE = 0.3


# Shared instance read by the rest of the file
config = Config()

# Function to load and preprocess the MNIST dataset
def load_and_preprocess_data():
    """Load MNIST via Keras and return model-ready train/test splits.

    Pixel values are converted to float32 and scaled to [0, 1], the images are
    reshaped to (-1, 28, 28, 1) to add a channel axis, and the labels are
    one-hot encoded over 10 classes.

    Returns:
        ((x_train, y_train), (x_test, y_test))
    """
    print("Loading MNIST dataset...")
    train_split, test_split = tf.keras.datasets.mnist.load_data()

    def _prepare(images, labels):
        # Scale to [0, 1], then append the trailing channel dimension
        scaled = (images.astype('float32') / 255.0).reshape(-1, 28, 28, 1)
        one_hot = tf.keras.utils.to_categorical(labels, 10)
        return scaled, one_hot

    x_train, y_train = _prepare(*train_split)
    x_test, y_test = _prepare(*test_split)

    summary = (
        ("Training data", x_train),
        ("Training labels", y_train),
        ("Test data", x_test),
        ("Test labels", y_test),
    )
    for title, array in summary:
        print(f"{title} shape: {array.shape}")

    return (x_train, y_train), (x_test, y_test)

# Extract local patches from images
# class PatchExtractor(tf.keras.layers.Layer):
#     def __init__(self, patch_size, num_patches):
#         super(PatchExtractor, self).__init__()
#         self.patch_size = patch_size
#         self.num_patches = num_patches

#     def call(self, images):
#         batch_size = tf.shape(images)[0]

#         # Initialize an array to store our patches
#         patches = []

#         for _ in range(self.num_patches):
#             # Generate random starting positions for patches
#             h_start = tf.random.uniform(shape=[], maxval=config.IMG_SIZE - self.patch_size, dtype=tf.int32)
#             w_start = tf.random.uniform(shape=[], maxval=config.IMG_SIZE - self.patch_size, dtype=tf.int32)

#             # Extract patch
#             patch = images[:, h_start:h_start + self.patch_size, w_start:w_start + self.patch_size, :]
#             patches.append(patch)

#         # Stack all patches along a new dimension
#         patches = tf.stack(patches, axis=1)  # Shape: [batch_size, num_patches, patch_size, patch_size, channels]

#         # Flatten each patch
#         flat_patches = tf.reshape(patches, [batch_size, self.num_patches, self.patch_size * self.patch_size])

#         return flat_patches

# class PatchExtractor(tf.keras.layers.Layer):
#     def __init__(self, patch_size, num_patches):
#         super(PatchExtractor, self).__init__()
#         self.patch_size = patch_size
#         self.num_patches = num_patches

#     def call(self, images):
#         batch_size = tf.shape(images)[0]

#         # Initialize an array to store our patches
#         patches = []

#         for _ in range(self.num_patches):
#             # Generate random starting positions for patches
#             h_start = tf.random.uniform(shape=[], maxval=config.IMG_SIZE - self.patch_size, dtype=tf.int32)
#             w_start = tf.random.uniform(shape=[], maxval=config.IMG_SIZE - self.patch_size, dtype=tf.int32)

#             # Extract patch
#             patch = images[:, h_start:h_start + self.patch_size, w_start:w_start + self.patch_size, :]
#             patches.append(patch)

#         # Stack all patches along a new dimension
#         patches = tf.stack(patches, axis=1)  # Shape: [batch_size, num_patches, patch_size, patch_size, channels]

#         # Flatten each patch
#         flat_patches = tf.reshape(patches, [batch_size, self.num_patches, self.patch_size * self.patch_size * patches.shape[-1]])

#         return flat_patches


# class PatchExtractor(tf.keras.layers.Layer):
#     def __init__(self, patch_size, num_patches):
#         super(PatchExtractor, self).__init__()
#         self.patch_size = patch_size
#         self.num_patches = num_patches

#     def call(self, images):
#         batch_size = tf.shape(images)[0]

#         # Initialize an array to store our patches
#         patches = []

#         for _ in range(self.num_patches):
#             # Generate random starting positions for patches
#             h_start = tf.random.uniform(shape=[], maxval=config.IMG_SIZE - self.patch_size, dtype=tf.int32)
#             w_start = tf.random.uniform(shape=[], maxval=config.IMG_SIZE - self.patch_size, dtype=tf.int32)

#             # Extract patch
#             patch = images[:, h_start:h_start + self.patch_size, w_start:w_start + self.patch_size, :]
#             patches.append(patch)

#         # Stack all patches along a new dimension
#         patches = tf.stack(patches, axis=1)  # Shape: [batch_size, num_patches, patch_size, patch_size, channels]

#         # Flatten each patch
#         flat_patches = tf.reshape(patches, [batch_size, self.num_patches, self.patch_size * self.patch_size * patches.shape[-1]])

#         return flat_patches

class PatchExtractor(tf.keras.layers.Layer):
    """Crop random square patches from a batch of images and flatten them.

    Each time the layer runs, `num_patches` top-left corners are drawn
    uniformly at random. Every corner is a scalar shared by all images in the
    batch, and the layer draws them regardless of training/inference mode.

    Args:
        patch_size: Side length of each square patch, in pixels.
        num_patches: Number of patches cropped per image.
        **kwargs: Forwarded to `tf.keras.layers.Layer` (e.g. `name`).
    """

    def __init__(self, patch_size, num_patches, **kwargs):
        super().__init__(**kwargs)
        self.patch_size = patch_size
        self.num_patches = num_patches

    def call(self, images):
        """Return patches flattened to [batch, num_patches, patch_size**2 * channels].

        Args:
            images: 4-D tensor indexed as [batch, height, width, channels]; the
                channel dimension must be statically known for the reshape.
        """
        image_shape = tf.shape(images)
        batch_size = image_shape[0]

        # `maxval` is exclusive, so add 1: the last valid start index is
        # (image size - patch size), which places the patch flush with the edge.
        # Sizes come from the input itself rather than a global constant.
        h_limit = image_shape[1] - self.patch_size + 1
        w_limit = image_shape[2] - self.patch_size + 1

        patches = []
        for _ in range(self.num_patches):
            # Random top-left corner for this patch
            h_start = tf.random.uniform(shape=[], maxval=h_limit, dtype=tf.int32)
            w_start = tf.random.uniform(shape=[], maxval=w_limit, dtype=tf.int32)

            patch = images[:, h_start:h_start + self.patch_size, w_start:w_start + self.patch_size, :]
            patches.append(patch)

        # Shape: [batch_size, num_patches, patch_size, patch_size, channels]
        patches = tf.stack(patches, axis=1)

        # Flatten each patch; python ints keep the last dimension static so
        # downstream Dense layers can build their kernels.
        flat_dim = self.patch_size * self.patch_size * patches.shape[-1]
        return tf.reshape(patches, [batch_size, self.num_patches, flat_dim])

# Local Feature Encoder
class LocalFeatureEncoder(tf.keras.layers.Layer):
    """Encode each flattened patch with a two-layer ReLU MLP and layer norm.

    Args:
        feature_dim: Width of the second dense layer, i.e. the size of each
            encoded patch feature.
    """

    def __init__(self, feature_dim):
        super().__init__()
        self.feature_dim = feature_dim
        self.dense1 = tf.keras.layers.Dense(64, activation='relu')
        self.dense2 = tf.keras.layers.Dense(feature_dim, activation='relu')
        self.norm = tf.keras.layers.LayerNormalization()

    def call(self, patches):
        """Apply dense1 -> dense2 -> layer norm to `patches` and return the result."""
        # Dense layers act on the last axis, so the same weights are applied to every patch
        hidden = self.dense1(patches)
        encoded = self.dense2(hidden)
        return self.norm(encoded)

# Global Feature Aggregator
class GlobalFeatureAggregator(tf.keras.layers.Layer):
    """Mix patch features with self-attention, then mean-pool them into one vector.

    Args:
        feature_dim: Width of the dense projection applied after attention,
            i.e. the size of the returned global representation.
    """

    def __init__(self, feature_dim):
        super().__init__()
        self.attention = tf.keras.layers.MultiHeadAttention(
            num_heads=4, key_dim=32)
        self.dense = tf.keras.layers.Dense(feature_dim, activation='relu')
        self.norm1 = tf.keras.layers.LayerNormalization()
        self.norm2 = tf.keras.layers.LayerNormalization()

    def call(self, local_features):
        """Return the mean over axis 1 of the attended, projected features."""
        # Self-attention: the patch features serve as both query and value
        attended = self.attention(local_features, local_features)

        # Residual connection followed by layer normalisation
        mixed = self.norm1(local_features + attended)

        # Project to the global feature width and normalise again
        projected = self.norm2(self.dense(mixed))

        # Plain (unweighted) average across the patch axis
        return tf.reduce_mean(projected, axis=1)

# The full model combining local and global feature extraction
class LocalGlobalDigitClassifier(tf.keras.Model):
    """Digit classifier: patches -> local encoder -> global aggregator -> MLP head.

    Args:
        config: Object providing PATCH_SIZE, NUM_PATCHES, LOCAL_FEATURE_DIM,
            GLOBAL_FEATURE_DIM, HIDDEN_UNITS, DROPOUT_RATE and NUM_CLASSES.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        # Feature-extraction pipeline
        self.patch_extractor = PatchExtractor(
            patch_size=config.PATCH_SIZE, num_patches=config.NUM_PATCHES)
        self.local_encoder = LocalFeatureEncoder(feature_dim=config.LOCAL_FEATURE_DIM)
        self.global_aggregator = GlobalFeatureAggregator(feature_dim=config.GLOBAL_FEATURE_DIM)

        # Classification head
        self.dense1 = tf.keras.layers.Dense(config.HIDDEN_UNITS, activation='relu')
        self.dropout = tf.keras.layers.Dropout(config.DROPOUT_RATE)
        self.norm = tf.keras.layers.LayerNormalization()
        self.classifier = tf.keras.layers.Dense(config.NUM_CLASSES, activation='softmax')

    def call(self, images, training=False):
        """Return softmax class probabilities for a batch of images.

        Args:
            images: Batch of input images passed to the patch extractor.
            training: Forwarded to the dropout layer only.
        """
        # Images -> flattened patches -> per-patch features -> one vector per image
        local_features = self.local_encoder(self.patch_extractor(images))
        pooled = self.global_aggregator(local_features)

        # Head: dense -> dropout -> layer norm -> softmax
        hidden = self.dense1(pooled)
        hidden = self.dropout(hidden, training=training)
        hidden = self.norm(hidden)
        return self.classifier(hidden)

# Simple baseline model for comparison
def create_baseline_model():
    """Build the (uncompiled) fully connected baseline classifier.

    Architecture: flatten (28, 28, 1) inputs -> Dense(256, relu) -> Dropout(0.3)
    -> Dense(128, relu) -> Dropout(0.3) -> Dense(10, softmax).

    Returns:
        A `tf.keras.Sequential` model.
    """
    return tf.keras.Sequential([
        # Declare the input with an explicit Input object; passing `input_shape`
        # to the first layer makes recent Keras versions emit a UserWarning.
        tf.keras.Input(shape=(28, 28, 1)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])

# Function to visualize the patches extracted by the model
def visualize_patches(model, images, num_samples=3):
    """Plot images next to the patches the model crops from them.

    One row is drawn per image: the original first, followed by each extracted
    patch. The figure is saved as "patches_visualization.png".

    Args:
        model: Object exposing a `patch_extractor` layer with `patch_size` and
            `num_patches` attributes.
        images: Batch of images; the reshape below assumes single-channel
            inputs, matching how the patch extractor flattens its output.
        num_samples: Maximum number of images to plot (clamped to the number
            of images supplied).

    Raises:
        ValueError: If there is no image to plot.
    """
    patch_extractor = model.patch_extractor
    # Read the grid geometry from the extractor actually in use, so the reshape
    # stays correct even if the model was built with a non-default config.
    patch_size = patch_extractor.patch_size
    num_patches = patch_extractor.num_patches

    # Never index past the end of the supplied batch
    num_samples = min(num_samples, len(images))
    if num_samples <= 0:
        raise ValueError("visualize_patches requires at least one image")

    columns = num_patches + 1  # original image + its patches

    plt.figure(figsize=(15, num_samples * 3))
    for i in range(num_samples):
        # Keep the batch dimension: the extractor expects a 4-D input
        image = images[i:i+1]

        patches = patch_extractor(image)
        patches_reshaped = tf.reshape(patches, [num_patches, patch_size, patch_size])

        # Original image in the first column of the row
        plt.subplot(num_samples, columns, i * columns + 1)
        plt.imshow(tf.squeeze(image), cmap='gray')
        plt.title(f"Image {i}")
        plt.axis('off')

        # Patches in the remaining columns
        for j in range(num_patches):
            plt.subplot(num_samples, columns, i * columns + j + 2)
            plt.imshow(patches_reshaped[j], cmap='gray')
            plt.title(f"Patch {j}")
            plt.axis('off')

    plt.tight_layout()
    plt.savefig("patches_visualization.png")
    plt.close()

# Training function
def train_model(model, x_train, y_train, x_test, y_test, config):
    """Compile, fit and evaluate `model`, timing both phases.

    Training uses Adam with categorical cross-entropy, early stopping on
    `val_loss` (patience 5, best weights restored) and learning-rate halving
    on plateau (patience 3, floor 1e-6).

    Args:
        model: Keras model to train.
        x_train, y_train: Training inputs and one-hot labels.
        x_test, y_test: Test inputs and one-hot labels.
        config: Provides LEARNING_RATE, BATCH_SIZE, EPOCHS and VALIDATION_SPLIT.

    Returns:
        (history, test_accuracy, training_time, test_time), times in seconds.
    """
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=config.LEARNING_RATE),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=5, restore_best_weights=True)
    lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

    # Fit, measuring wall-clock training time
    fit_started = time.time()
    history = model.fit(
        x_train,
        y_train,
        batch_size=config.BATCH_SIZE,
        epochs=config.EPOCHS,
        validation_split=config.VALIDATION_SPLIT,
        callbacks=[early_stopping, lr_schedule],
        verbose=1,
    )
    training_time = time.time() - fit_started

    # Evaluate on the held-out test set, measuring wall-clock time
    eval_started = time.time()
    _, test_accuracy = model.evaluate(x_test, y_test, verbose=0)
    test_time = time.time() - eval_started

    print(f"Test accuracy: {test_accuracy:.4f}")
    print(f"Training time: {training_time:.2f} seconds")
    print(f"Inference time on test set: {test_time:.2f} seconds")

    return history, test_accuracy, training_time, test_time

# Function to analyze predictions and compute metrics
def analyze_results(model, x_test, y_test):
    """Score `model` on the test set, save diagnostic plots and return metrics.

    Writes "confusion_matrix.png", "class_performance.png" and, when at least
    one sample is misclassified, "misclassified_examples.png" (first 10 errors).

    Args:
        model: Trained model exposing `predict`.
        x_test: Test inputs.
        y_test: One-hot test labels, shape (num_samples, num_classes).

    Returns:
        Dict with keys 'accuracy', 'precision', 'recall', 'f1' (macro-averaged
        scores), 'confusion_matrix' and 'misclassified_indices'.
    """
    # Fix the label set from the one-hot width. Without it, the confusion matrix
    # and per-class scores only cover classes that happen to occur, so their
    # shape can vary between models and break plotting or matrix subtraction.
    class_labels = np.arange(y_test.shape[1])

    # Get predictions
    y_pred_probs = model.predict(x_test)
    y_pred = np.argmax(y_pred_probs, axis=1)
    y_true = np.argmax(y_test, axis=1)

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Generate classification report
    report = classification_report(y_true, y_pred, digits=4)
    print("\nClassification Report:")
    print(report)

    # Plot confusion matrix (always num_classes x num_classes)
    plt.figure(figsize=(10, 8))
    cm = confusion_matrix(y_true, y_pred, labels=class_labels)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.savefig("confusion_matrix.png")
    plt.close()

    # Plot examples of misclassifications (up to 10, in a 2x5 grid)
    misclassified_indices = np.where(y_pred != y_true)[0]
    if len(misclassified_indices) > 0:
        plt.figure(figsize=(12, 8))
        for position, idx in enumerate(misclassified_indices[:10], start=1):
            plt.subplot(2, 5, position)
            plt.imshow(np.squeeze(x_test[idx]), cmap='gray')
            plt.title(f"True: {y_true[idx]}, Pred: {y_pred[idx]}")
            plt.axis('off')
        plt.savefig("misclassified_examples.png")
        plt.close()

    # Analyze class-wise performance (one bar per class in `class_labels`)
    plt.figure(figsize=(12, 5))
    class_precision = precision_score(y_true, y_pred, labels=class_labels, average=None)
    class_recall = recall_score(y_true, y_pred, labels=class_labels, average=None)

    plt.subplot(1, 2, 1)
    plt.bar(class_labels, class_precision)
    plt.title('Precision by Class')
    plt.xlabel('Digit')
    plt.ylabel('Precision')
    plt.ylim([0, 1])

    plt.subplot(1, 2, 2)
    plt.bar(class_labels, class_recall)
    plt.title('Recall by Class')
    plt.xlabel('Digit')
    plt.ylabel('Recall')
    plt.ylim([0, 1])

    plt.tight_layout()
    plt.savefig("class_performance.png")
    plt.close()

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm,
        'misclassified_indices': misclassified_indices
    }

# Plot training history
def plot_training_history(history):
    """Save side-by-side training/validation accuracy and loss curves.

    Args:
        history: Object whose `history` dict holds 'accuracy', 'val_accuracy',
            'loss' and 'val_loss' series (as returned by `model.fit`).

    The figure is written to "training_history.png".
    """
    panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))

    plt.figure(figsize=(12, 5))
    for position, (metric, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[metric], label='Training')
        plt.plot(history.history[f'val_{metric}'], label='Validation')
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(title)
        plt.legend()

    plt.tight_layout()
    plt.savefig("training_history.png")
    plt.close()


# Compare models function
def compare_models(custom_results, baseline_results):
    """Save a figure comparing the baseline and the local-global model.

    Panels: grouped score bars (accuracy, precision, recall, F1), training vs.
    inference time bars, and a heatmap of the confusion-matrix difference
    (local-global minus baseline). Written to "model_comparison.png".

    Args:
        custom_results: Result dict for the local-global model.
        baseline_results: Result dict for the baseline model.
            Both must contain 'accuracy', 'precision', 'recall', 'f1',
            'training_time', 'test_time' and 'confusion_matrix'.
    """
    models = ['Baseline', 'Local-Global Model']

    # Plot name -> key in the result dicts (inference time is stored as 'test_time')
    source_keys = {
        'accuracy': 'accuracy',
        'precision': 'precision',
        'recall': 'recall',
        'f1': 'f1',
        'training_time': 'training_time',
        'inference_time': 'test_time',
    }
    metrics = {
        name: [baseline_results[key], custom_results[key]]
        for name, key in source_keys.items()
    }

    positions = np.arange(len(models))
    bar_width = 0.2

    plt.figure(figsize=(14, 10))

    # Top-left: the four scores as grouped bars around each model position
    plt.subplot(2, 2, 1)
    score_bars = (
        (-1.5, 'accuracy', 'Accuracy'),
        (-0.5, 'precision', 'Precision'),
        (0.5, 'recall', 'Recall'),
        (1.5, 'f1', 'F1'),
    )
    for offset, name, label in score_bars:
        plt.bar(positions + offset * bar_width, metrics[name], bar_width, label=label)
    plt.xlabel('Models')
    plt.ylabel('Score')
    plt.title('Performance Metrics')
    plt.xticks(positions, models)
    plt.ylim([0, 1])
    plt.legend()

    # Top-right: wall-clock cost
    plt.subplot(2, 2, 2)
    plt.bar(positions - 0.15, metrics['training_time'], width=0.3, label='Training Time (s)')
    plt.bar(positions + 0.15, metrics['inference_time'], width=0.3, label='Inference Time (s)')
    plt.xlabel('Models')
    plt.ylabel('Time (s)')
    plt.title('Computational Efficiency')
    plt.xticks(positions, models)
    plt.legend()

    # Bottom: element-wise confusion-matrix difference
    plt.subplot(2, 1, 2)
    cm_delta = custom_results['confusion_matrix'] - baseline_results['confusion_matrix']
    sns.heatmap(cm_delta, annot=True, fmt='d', cmap='coolwarm', center=0)
    plt.title('Confusion Matrix Difference (Local-Global - Baseline)')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')

    plt.tight_layout()
    plt.savefig("model_comparison.png")
    plt.close()

# Generate report on mathematical model and approach
def generate_model_report():
    """Write the Markdown description of the model to "model_report.md".

    The report text is dedented before use: with the source indentation left
    in place every line would start with four spaces, which Markdown renders
    as a code block instead of headings and paragraphs. The file is written
    as UTF-8 because the text contains non-ASCII mathematical symbols that
    some platform default encodings cannot represent.

    Returns:
        The report text exactly as written to the file.
    """
    import textwrap  # local import: only this function needs it

    report = textwrap.dedent("""
    # Mathematical Model Report: Local-Global Feature Architecture for MNIST Classification

    ## 1. Introduction
    This report details a novel neural network architecture designed to leverage both local and global features
    in image data for improved classification of handwritten digits from the MNIST dataset.

    ## 2. Mathematical Formulation

    ### 2.1 Local Feature Extraction

    Given an input image X ∈ ℝ^(28×28), we extract N patches {p_1, p_2, ..., p_N} where each
    p_i ∈ ℝ^(k×k) represents a local region of the image (k = patch size).

    For each patch p_i, we compute a local feature representation:

    f_i = φ(W_2 · ReLU(W_1 · flatten(p_i) + b_1) + b_2)

    where:
    - flatten(p_i) ∈ ℝ^(k²) is the flattened patch
    - W_1 ∈ ℝ^(64×k²), b_1 ∈ ℝ^64 are parameters of the first dense layer
    - W_2 ∈ ℝ^(d_local×64), b_2 ∈ ℝ^d_local are parameters of the second dense layer
    - φ represents the layer normalization operation
    - d_local is the local feature dimension

    This produces a set of local feature vectors F = {f_1, f_2, ..., f_N} where each f_i ∈ ℝ^d_local.

    ### 2.2 Global Feature Aggregation

    To capture relationships between local features and learn global structure, we employ a self-attention mechanism:

    A = MultiHeadAttention(F, F)
    F' = LayerNorm(F + A)

    The multi-head attention operation can be expressed as:

    MultiHeadAttention(Q, K, V) = Concat(head_1, ..., head_h)W^O

    where each head_i = Attention(QW_i^Q, KW_i^K, VW_i^V)

    The attention function is:

    Attention(Q, K, V) = softmax((QK^T)/√d_k)V

    After applying attention, we further process the attended features:

    G = LayerNorm(W_G · F' + b_G)

    where W_G ∈ ℝ^(d_global×d_local), b_G ∈ ℝ^d_global are learnable parameters, and d_global is the global feature dimension.

    Finally, we aggregate the global features into a single representation:

    g = 1/N · ∑_i=1^N G_i

    where g ∈ ℝ^d_global is the final global representation.

    ### 2.3 Classification

    The global representation g is then passed through a final classification network:

    h = ReLU(W_h · g + b_h)
    h' = Dropout(h)
    h'' = LayerNorm(h')
    y = softmax(W_y · h'' + b_y)

    where:
    - W_h ∈ ℝ^(d_hidden×d_global), b_h ∈ ℝ^d_hidden are parameters of the hidden layer
    - W_y ∈ ℝ^(10×d_hidden), b_y ∈ ℝ^10 are parameters of the output layer
    - y ∈ ℝ^10 is the final class probability distribution

    ## 3. Model Training

    The model is trained using categorical cross-entropy loss:

    L(y, ŷ) = -∑_c y_c log(ŷ_c)

    where y is the one-hot encoded true label and ŷ is the predicted probability distribution.

    The optimization is performed using Adam with an initial learning rate of 0.001,
    which is adaptively reduced when validation loss plateaus.
    """).strip() + "\n"

    # Explicit UTF-8: the platform default encoding may not cover ℝ, ∈, ∑, ŷ, ...
    with open("model_report.md", "w", encoding="utf-8") as f:
        f.write(report)

    return report

# Main function to run the experiment
def _archive_outputs(prefix, filenames, out_dir="results"):
    """Move output files from the working directory into `out_dir`.

    Files that do not exist are skipped. When `prefix` is non-empty the moved
    file is renamed to "<prefix>_<name>".
    """
    for name in filenames:
        if os.path.exists(name):
            target = f"{prefix}_{name}" if prefix else name
            os.replace(name, os.path.join(out_dir, target))


def main():
    """Run the full experiment: baseline vs. local-global model on MNIST.

    Steps: load data, train and analyse the baseline, train and analyse the
    local-global model, compare both, and write the model report.

    The analysis and plotting helpers write fixed filenames into the working
    directory, so each model's files are moved into "results/" under a
    model-specific prefix right after they are produced. Otherwise the second
    model's plots would overwrite the baseline's.
    """
    # Files written per model by analyze_results / plot_training_history
    per_model_outputs = [
        "confusion_matrix.png",
        "misclassified_examples.png",  # only written when errors exist
        "class_performance.png",
        "training_history.png",
    ]
    # Files written once for the whole experiment
    shared_outputs = [
        "patches_visualization.png",
        "model_comparison.png",
        "model_report.md",
    ]

    # Create results directory
    os.makedirs("results", exist_ok=True)

    # Load and preprocess data
    (x_train, y_train), (x_test, y_test) = load_and_preprocess_data()

    # Create and train the baseline model
    print("\n=== Training Baseline Model ===")
    baseline_model = create_baseline_model()
    baseline_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=config.LEARNING_RATE),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    baseline_start_time = time.time()
    baseline_history = baseline_model.fit(
        x_train, y_train,
        batch_size=config.BATCH_SIZE,
        epochs=config.EPOCHS,
        validation_split=config.VALIDATION_SPLIT,
        callbacks=[
            tf.keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=5,
                restore_best_weights=True
            )
        ],
        verbose=1
    )
    baseline_training_time = time.time() - baseline_start_time

    baseline_test_start_time = time.time()
    baseline_loss, baseline_accuracy = baseline_model.evaluate(x_test, y_test, verbose=0)
    baseline_test_time = time.time() - baseline_test_start_time

    print(f"Baseline Test Accuracy: {baseline_accuracy:.4f}")
    print(f"Baseline Training Time: {baseline_training_time:.2f} seconds")
    print(f"Baseline Inference Time: {baseline_test_time:.2f} seconds")

    # Analyze baseline results
    print("\n=== Analyzing Baseline Results ===")
    baseline_results = analyze_results(baseline_model, x_test, y_test)
    baseline_results['training_time'] = baseline_training_time
    baseline_results['test_time'] = baseline_test_time

    # Plot baseline training history
    plot_training_history(baseline_history)

    # Preserve the baseline plots before the custom model regenerates the same filenames
    _archive_outputs("baseline", per_model_outputs)

    # Create and train the custom model
    print("\n=== Training Local-Global Feature Model ===")
    custom_model = LocalGlobalDigitClassifier(config)

    # Visualize some patches before training
    visualize_patches(custom_model, x_train[:10])

    # Train the custom model
    custom_history, custom_accuracy, custom_training_time, custom_test_time = train_model(
        custom_model, x_train, y_train, x_test, y_test, config
    )

    # Analyze custom model results
    print("\n=== Analyzing Local-Global Model Results ===")
    custom_results = analyze_results(custom_model, x_test, y_test)
    custom_results['training_time'] = custom_training_time
    custom_results['test_time'] = custom_test_time

    # Plot custom model training history
    plot_training_history(custom_history)
    _archive_outputs("local_global", per_model_outputs)

    # Compare models
    print("\n=== Comparing Models ===")
    compare_models(custom_results, baseline_results)

    # Generate mathematical model report
    print("\n=== Generating Mathematical Model Report ===")
    generate_model_report()

    # Collect the experiment-wide outputs next to the per-model ones
    _archive_outputs("", shared_outputs)

    print("\nExperiment complete. Results and visualizations saved to disk.")

if __name__ == "__main__":
    main()

Loading MNIST dataset...
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Training data shape: (60000, 28, 28, 1)
Training labels shape: (60000, 10)
Test data shape: (10000, 28, 28, 1)
Test labels shape: (10000, 10)

=== Training Baseline Model ===


  super().__init__(**kwargs)


Epoch 1/20
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 23ms/step - accuracy: 0.7814 - loss: 0.7056 - val_accuracy: 0.9632 - val_loss: 0.1253
Epoch 2/20
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - accuracy: 0.9413 - loss: 0.1924 - val_accuracy: 0.9728 - val_loss: 0.0920
Epoch 3/20
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 19ms/step - accuracy: 0.9577 - loss: 0.1376 - val_accuracy: 0.9773 - val_loss: 0.0760
Epoch 4/20
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - accuracy: 0.9667 - loss: 0.1080 - val_accuracy: 0.9773 - val_loss: 0.0752
Epoch 5/20
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - accuracy: 0.9721 - loss: 0.0926 - val_accuracy: 0.9785 - val_loss: 0.0717
Epoch 6/20
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.9748 - loss: 0.0821 - val_accuracy: 0.9795 - val_loss: 0.0736
Epoch 7/20
[1m422/422