In [11]:
%%writefile tpu_training.py

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras import layers, Model
import os
import json
from datetime import datetime

class TPUTrainingPipeline:
    def __init__(self, num_classes=5):
        self.num_classes = num_classes
        self.history = {}

        # Initialize TPU strategy
        try:
            resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
            tf.config.experimental_connect_to_cluster(resolver)
            tf.tpu.experimental.initialize_tpu_system(resolver)
            self.strategy = tf.distribute.TPUStrategy(resolver)
            print(f"Training on TPU: {resolver.cluster_spec().as_dict()}")
        except ValueError:
            print("No TPU detected. Using CPU/GPU strategy")
            self.strategy = tf.distribute.MirroredStrategy()

        print(f'Number of replicas: {self.strategy.num_replicas_in_sync}')

        # TPU-optimized batch size
        self.batch_size = 128 * self.strategy.num_replicas_in_sync
        self.image_size = (224, 224)

    def create_model(self):
        """Create model within TPU strategy scope"""
        with self.strategy.scope():
            base_model = MobileNetV2(
                weights='imagenet',
                include_top=False,
                input_shape=(*self.image_size, 3)
            )

            base_model.trainable = False

            x = layers.GlobalAveragePooling2D()(base_model.output)

            # TPU-optimized attention mechanism
            attention = layers.Dense(512, activation='relu')(x)
            attention = layers.Dense(x.shape[-1], activation='sigmoid')(attention)
            attention_output = layers.Multiply()([x, attention])

            # TPU-friendly layer sizes (powers of two)
            x = layers.Dense(512, activation='relu')(attention_output)
            x = layers.BatchNormalization()(x)
            x = layers.Dense(256, activation='relu')(x)
            x = layers.BatchNormalization()(x)
            outputs = layers.Dense(self.num_classes, activation='softmax')(x)

            model = Model(base_model.input, outputs)

            optimizer = tf.keras.optimizers.Adam(learning_rate=0.002)

            model.compile(
                optimizer=optimizer,
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy']
            )

            return model

    def create_dataset(self, dataset, is_training=True):
        """Create TPU-optimized dataset pipeline"""
        AUTOTUNE = tf.data.AUTOTUNE

        @tf.function
        def preprocess(image, label):
            image = tf.cast(image, tf.float32)
            image = tf.image.resize(image, self.image_size)
            image = tf.keras.applications.mobilenet_v2.preprocess_input(image)
            return image, label

        @tf.function
        def augment(image, label):
            image = tf.image.random_flip_left_right(image)
            image = tf.image.random_brightness(image, 0.2)
            return image, label

        dataset = dataset.map(preprocess, num_parallel_calls=AUTOTUNE)

        if is_training:
            dataset = dataset.map(augment, num_parallel_calls=AUTOTUNE)
            dataset = dataset.shuffle(10000)

        dataset = dataset.batch(self.batch_size, drop_remainder=True)
        dataset = dataset.prefetch(AUTOTUNE)
        return dataset

    def train(self, train_dataset, val_dataset, epochs=10):
        """Train the model using TPU-optimized settings"""
        callbacks = [
            tf.keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=3,
                restore_best_weights=True
            ),
            tf.keras.callbacks.ModelCheckpoint(
                'best_model.h5',
                monitor='val_accuracy',
                save_best_only=True,
                mode='max'
            ),
            tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.2,
                patience=2,
                min_lr=0.0001
            )
        ]

        model = self.create_model()
        history = model.fit(
            train_dataset,
            validation_data=val_dataset,
            epochs=epochs,
            callbacks=callbacks
        )

        # Return results in same format as base pipeline for fair comparison
        return {
            'accuracy': [float(x) for x in history.history['accuracy']],
            'val_accuracy': [float(x) for x in history.history['val_accuracy']],
            'loss': [float(x) for x in history.history['loss']],
            'val_loss': [float(x) for x in history.history['val_loss']]
        }

Writing tpu_training.py


In [12]:
%%writefile baseline_training.py

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras import layers, Model
import multiprocessing
import os

class BaseTrainingPipeline:
    def __init__(self, num_classes=5):
        self.num_classes = num_classes
        self.history = {}
        self.model = None
        self.train_dataset = None
        self.val_dataset = None
        self.quantized_model = None
        # Add batch size as class attribute for benchmarking
        self.batch_size = 32
        self.image_size = (224, 224)

    def create_model(self):  # Remove num_classes parameter as it's a class attribute
        """Create a fine-tunable MobileNetV2 model"""
        base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(*self.image_size, 3))

        # Freeze the base model
        base_model.trainable = False

        # Add custom classification head
        x = layers.GlobalAveragePooling2D()(base_model.output)
        x = layers.Dense(128, activation='relu')(x)
        x = layers.Dropout(0.2)(x)
        outputs = layers.Dense(self.num_classes, activation='softmax')(x)

        model = Model(base_model.input, outputs)

        # Compile model here to match benchmark requirements
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model  # Return model instead of setting self.model

    def preprocess_data(self, image, label):
        """Preprocess images for MobileNetV2"""
        image = tf.cast(image, tf.float32)
        image = tf.image.resize(image, self.image_size)
        image = tf.keras.applications.mobilenet_v2.preprocess_input(image)
        return image, label

    def create_dataset(self, dataset, is_training=True):  # Add is_training parameter to match TPU pipeline
        """Create an optimized tf.data pipeline"""
        AUTOTUNE = tf.data.AUTOTUNE

        dataset = dataset.map(self.preprocess_data, num_parallel_calls=AUTOTUNE)

        if is_training:
            dataset = dataset.cache().shuffle(1000)

        return dataset.batch(self.batch_size).prefetch(AUTOTUNE)

    def train(self, train_dataset, val_dataset, epochs=5):  # Modified to match benchmark interface
        """Train the model using an efficient training loop"""
        model = self.create_model()

        # Use multiple workers for training
        train_workers = min(multiprocessing.cpu_count(), 4)

        history = model.fit(
            train_dataset,
            validation_data=val_dataset,
            epochs=epochs,
            workers=train_workers,
            use_multiprocessing=True
        )

        # Return results in the same format as TPU pipeline
        return {
            'accuracy': [float(x) for x in history.history['accuracy']],
            'val_accuracy': [float(x) for x in history.history['val_accuracy']],
            'loss': [float(x) for x in history.history['loss']],
            'val_loss': [float(x) for x in history.history['val_loss']]
        }

    def quantize_model(self, model, train_dataset):  # Add train_dataset parameter
        """Convert model to TFLite format with quantization"""
        converter = tf.lite.TFLiteConverter.from_keras_model(model)

        # Enable quantization
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
        converter.inference_input_type = tf.uint8
        converter.inference_output_type = tf.uint8

        # Representative dataset for quantization
        def representative_dataset():
            for data in train_dataset.take(100):
                yield [tf.dtypes.cast(data[0], tf.float32)]

        converter.representative_dataset = representative_dataset

        # Convert the model
        quantized_tflite_model = converter.convert()

        # Save the quantized model
        with open('quantized_model.tflite', 'wb') as f:
            f.write(quantized_tflite_model)

        return quantized_tflite_model  # Return instead of setting self.quantized_model

Writing baseline_training.py


In [1]:
import tensorflow as tf
import time
import json
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from pathlib import Path
import tensorflow_datasets as tfds

class PerformanceBenchmark:
    def __init__(self, base_pipeline, tpu_pipeline):
        self.base_pipeline = base_pipeline
        self.tpu_pipeline = tpu_pipeline
        self.results = {}

    def benchmark_training_step(self, model, dataset, num_steps=100):
        """Measure average training step time"""
        step_times = []

        for batch in dataset.take(num_steps):
            start_time = time.perf_counter()
            _ = model.train_on_batch(*batch)
            step_times.append(time.perf_counter() - start_time)

        return {
            'avg_step_time': np.mean(step_times),
            'std_step_time': np.std(step_times),
            'min_step_time': np.min(step_times),
            'max_step_time': np.max(step_times)
        }

    def measure_throughput(self, model, dataset, pipeline, num_steps=100):
        """Calculate images processed per second"""
        batch_size = pipeline.batch_size
        total_images = batch_size * num_steps

        start_time = time.perf_counter()
        for batch in dataset.take(num_steps):
            _ = model.predict_on_batch(batch[0])
        total_time = time.perf_counter() - start_time

        return {
            'images_per_second': total_images / total_time,
            'batch_size': batch_size,
            'total_time': total_time
        }

    def _benchmark_pipeline(self, pipeline, train_ds, val_ds, epochs, name):
        """Run benchmark for a single pipeline"""
        # Prepare datasets
        train_dataset = pipeline.create_dataset(train_ds, is_training=True)
        val_dataset = pipeline.create_dataset(val_ds, is_training=False)

        # Create model
        model = pipeline.create_model()

        # Measure throughput
        throughput_metrics = self.measure_throughput(model, val_dataset, pipeline)

        # Measure Step metrics
        step_metrics = self.benchmark_training_step(model, train_dataset)

        # Train model and measure time
        start_time = time.perf_counter()
        history = pipeline.train(train_dataset, val_dataset, epochs)
        training_time = time.perf_counter() - start_time

        return {
            'name': name,
            'throughput_metrics': throughput_metrics,
            'training_metrics': {
                'total_time': training_time,
                'epochs': epochs,
                'history': history
            },
            'model_metrics': {
                'total_params': model.count_params(),
                'batch_size': pipeline.batch_size
            },
            'step_metrics': step_metrics
        }

    def run_full_benchmark(self, train_dataset, val_dataset, epochs=2):
        """Run comprehensive benchmark comparing both pipelines"""
        results = {
            'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'hardware_info': self._get_hardware_info(),
            'pipelines': {}
        }

        # Benchmark base pipeline
        print("Benchmarking base pipeline...")
        base_metrics = self._benchmark_pipeline(
            self.base_pipeline, train_dataset, val_dataset,
            epochs, "Base Pipeline"
        )
        results['pipelines']['base'] = base_metrics

        # Benchmark TPU pipeline
        print("\nBenchmarking TPU-optimized pipeline...")
        tpu_metrics = self._benchmark_pipeline(
            self.tpu_pipeline, train_dataset, val_dataset,
            epochs, "TPU Pipeline"
        )
        results['pipelines']['tpu'] = tpu_metrics

        # Save results
        self._save_results(results)

        # Generate visualizations
        self._generate_visualizations(results)

        return results

    def _get_hardware_info(self):
        """Gather information about available hardware"""
        gpu_devices = tf.config.list_physical_devices('GPU')
        tpu_devices = tf.config.list_physical_devices('TPU')

        return {
            'num_gpus': len(gpu_devices),
            'num_tpus': len(tpu_devices),
            'tpu_type': self._get_tpu_type() if tpu_devices else None,
            'tensorflow_version': tf.__version__
        }

    def _get_tpu_type(self):
        """Get TPU type if available"""
        try:
            resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
            return str(resolver.cluster_spec())
        except:
            return None

    def _save_results(self, results):
        """Save benchmark results to file"""
        Path('benchmark_results').mkdir(exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        with open(f'benchmark_results/benchmark_{timestamp}.json', 'w') as f:
            json.dump(results, f, indent=4)

    def _generate_visualizations(self, results):
        """Generate performance comparison visualizations"""
        Path('benchmark_plots').mkdir(exist_ok=True)

        # Training time comparison
        self._plot_training_comparison(results)

        # Throughput comparison
        self._plot_throughput_comparison(results)

        # Training metrics over time
        self._plot_training_metrics(results)

    def _plot_training_comparison(self, results):
        """Plot training time comparisons"""
        plt.figure(figsize=(10, 6))

        pipelines = list(results['pipelines'].keys())
        times = [results['pipelines'][p]['training_metrics']['total_time']
                for p in pipelines]

        plt.bar(pipelines, times)
        plt.title('Total Training Time Comparison')
        plt.ylabel('Time (seconds)')
        plt.savefig('benchmark_plots/training_time_comparison.png')
        plt.close()

    def _plot_throughput_comparison(self, results):
        """Plot throughput comparisons"""
        plt.figure(figsize=(10, 6))

        pipelines = list(results['pipelines'].keys())
        throughputs = [results['pipelines'][p]['throughput_metrics']['images_per_second']
                      for p in pipelines]

        plt.bar(pipelines, throughputs)
        plt.title('Image Processing Throughput Comparison')
        plt.ylabel('Images per Second')
        plt.savefig('benchmark_plots/throughput_comparison.png')
        plt.close()

    def _plot_training_metrics(self, results):
        """Plot training metrics over time"""
        plt.figure(figsize=(12, 8))

        for pipeline_name, pipeline_data in results['pipelines'].items():
            history = pipeline_data['training_metrics']['history']

            plt.plot(history['val_accuracy'],
                    label=f'{pipeline_name} Validation Accuracy')

        plt.title('Training Progress Comparison')
        plt.xlabel('Epoch')
        plt.ylabel('Validation Accuracy')
        plt.legend()
        plt.savefig('benchmark_plots/training_progress.png')
        plt.close()

In [2]:
from baseline_training import BaseTrainingPipeline
from tpu_training import TPUTrainingPipeline
import tensorflow_datasets as tfds

# Load dataset
(train_ds, val_ds), dataset_info = tfds.load(
    'tf_flowers',
    split=['train[:80%]', 'train[20%:]'],
    with_info=True,
    as_supervised=True,
)

# Initialize pipelines
base_pipeline = BaseTrainingPipeline(num_classes=5)
tpu_pipeline = TPUTrainingPipeline(num_classes=5)

# Initialize and run benchmark
benchmark = PerformanceBenchmark(base_pipeline, tpu_pipeline)
results = benchmark.run_full_benchmark(train_ds, val_ds, epochs=2)

# Print summary
print("\nBenchmark Summary:")
print("-" * 50)
for pipeline_name, pipeline_data in results['pipelines'].items():
    print(f"\n{pipeline_name} Pipeline:")
    print(f"Total training time: {pipeline_data['training_metrics']['total_time']:.2f} seconds")
    print(f"Images per second: {pipeline_data['throughput_metrics']['images_per_second']:.2f}")
    print(f"Total Model Parameters: {pipeline_data['model_metrics']['total_params']}")
    print(f"Batch Size: {pipeline_data['model_metrics']['batch_size']}")
    print(f"Average step time: {pipeline_data['step_metrics']['avg_step_time'] * 1000:.2f} ms")

Training on TPU: {}
Number of replicas: 8
Benchmarking base pipeline...
Epoch 1/2
Epoch 2/2

Benchmarking TPU-optimized pipeline...
Epoch 1/2

  saving_api.save_model(


Epoch 2/2

Benchmark Summary:
--------------------------------------------------

base Pipeline:
Total training time: 76.22 seconds
Images per second: 142.42
Total Model Parameters: 2422597
Batch Size: 32
Average step time: 264.41 ms

tpu Pipeline:
Total training time: 39.69 seconds
Images per second: 11337.29
Total Model Parameters: 4362053
Batch Size: 1024
Average step time: 7081.33 ms


In [9]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

def load_and_analyze_results(file_path):
    """Load and analyze benchmark results"""
    with open(file_path, 'r') as f:
        results = json.load(f)

    # Create analysis sections
    print("=" * 50)
    print("Edge TPU Performance Analysis")
    print("=" * 50)

    # 1. Basic Performance Metrics
    print("\n1. Basic Performance Metrics:")
    print("-" * 30)
    metrics_comparison = pd.DataFrame({
        'Metric': ['First Inference Time (ms)', 'Average Inference Time (ms)', 'Inference Time Std (ms)', 'Throughput (FPS)'],
        'With Warmup': [
            results['warmup']['detailed_performance_metrics']['inference_times'][0],
            results['warmup']['summary']['average_inference_time_ms'],
            results['warmup']['summary']['inference_time_std_ms'],
            results['warmup']['summary']['throughput_fps']
        ],
        'Without Warmup': [
            results['warmup']['detailed_performance_metrics']['inference_times'][0],
            results['no_warmup']['summary']['average_inference_time_ms'],
            results['no_warmup']['summary']['inference_time_std_ms'],
            results['no_warmup']['summary']['throughput_fps']
        ]
    })
    print(metrics_comparison.to_string(index=False))

    # 2. Detailed Timing Analysis
    print("\n2. Detailed Timing Analysis:")
    print("-" * 30)

    def analyze_timing_component(name, warm_times, no_warm_times):
        warm_stats = {
            'mean': np.mean(warm_times),
            'std': np.std(warm_times),
            'p95': np.percentile(warm_times, 95),
            'p99': np.percentile(warm_times, 99)
        }
        no_warm_stats = {
            'mean': np.mean(no_warm_times),
            'std': np.std(no_warm_times),
            'p95': np.percentile(no_warm_times, 95),
            'p99': np.percentile(no_warm_times, 99)
        }
        return pd.DataFrame({
            'Metric': [f'{name} Mean (ms)', f'{name} Std (ms)', f'{name} P95 (ms)', f'{name} P99 (ms)'],
            'With Warmup': [warm_stats['mean'], warm_stats['std'], warm_stats['p95'], warm_stats['p99']],
            'Without Warmup': [no_warm_stats['mean'], no_warm_stats['std'], no_warm_stats['p95'], no_warm_stats['p99']]
        })

    timing_components = ['preprocessing_times', 'inference_times', 'invoke_times', 'overhead_times']
    all_timing_stats = pd.DataFrame()

    for component in timing_components:
      if component in results['warmup']['detailed_performance_metrics'] and component in results['no_warmup']['detailed_performance_metrics']:
        warm_times = results['warmup']['detailed_performance_metrics'][component]
        no_warm_times = results['no_warmup']['detailed_performance_metrics'][component]
        component_stats = analyze_timing_component(component.replace('_times', ''), warm_times, no_warm_times)
        all_timing_stats = pd.concat([all_timing_stats, component_stats])

    print(all_timing_stats.to_string(index=False))

    # 3. Performance Stability Analysis
    print("\n3. Performance Stability Analysis:")
    print("-" * 30)

    def analyze_stability(times):
        return {
            'coefficient_of_variation': np.std(times) / np.mean(times),
            'range': np.max(times) - np.min(times),
            'iqr': np.percentile(times, 75) - np.percentile(times, 25)
        }

    warm_stability = analyze_stability(results['warmup']['detailed_performance_metrics']['inference_times'])
    no_warm_stability = analyze_stability(results['no_warmup']['detailed_performance_metrics']['inference_times'])

    stability_df = pd.DataFrame({
        'Metric': ['Coefficient of Variation', 'Range (ms)', 'IQR (ms)'],
        'With Warmup': [
            warm_stability['coefficient_of_variation'],
            warm_stability['range'],
            warm_stability['iqr']
        ],
        'Without Warmup': [
            no_warm_stability['coefficient_of_variation'],
            no_warm_stability['range'],
            no_warm_stability['iqr']
        ]
    })
    print(stability_df.to_string(index=False))

    # 4. Visualization
    plot_dir = Path('analysis_plots')
    plot_dir.mkdir(exist_ok=True)

    # Inference Time Distribution
    plt.figure(figsize=(12, 6))
    plt.hist(results['warmup']['detailed_performance_metrics']['inference_times'],
             bins=50, alpha=0.5, label='With Warmup')
    plt.hist(results['no_warmup']['detailed_performance_metrics']['inference_times'],
             bins=50, alpha=0.5, label='Without Warmup')
    plt.title('Inference Time Distribution')
    plt.xlabel('Time (ms)')
    plt.ylabel('Frequency')
    plt.legend()
    plt.savefig(plot_dir / 'inference_time_distribution.png')
    plt.close()

    # Time Series Plot
    plt.figure(figsize=(15, 6))
    plt.plot(results['warmup']['detailed_performance_metrics']['inference_times'][:100],
             label='With Warmup', alpha=0.7)
    plt.plot(results['no_warmup']['detailed_performance_metrics']['inference_times'][:100],
             label='Without Warmup', alpha=0.7)
    plt.title('First 100 Inference Times')
    plt.xlabel('Inference Number')
    plt.ylabel('Time (ms)')
    plt.legend()
    plt.savefig(plot_dir / 'inference_time_series.png')
    plt.close()

    # Component Breakdown
    component_means = {
        # 'Preprocessing': (np.mean(results['warmup']['detailed_performance_metrics']['preprocessing_times']),
        #                 np.mean(results['no_warmup']['detailed_performance_metrics']['preprocessing_times'])),
        'Invoke': (np.mean(results['warmup']['detailed_performance_metrics']['invoke_times']),
                  np.mean(results['no_warmup']['detailed_performance_metrics']['invoke_times'])),
        'Overhead': (np.mean(results['warmup']['detailed_performance_metrics']['overhead_times']),
                    np.mean(results['no_warmup']['detailed_performance_metrics']['overhead_times']))
    }

    plt.figure(figsize=(10, 6))
    x = np.arange(len(component_means))
    width = 0.35

    plt.bar(x - width/2, [v[0] for v in component_means.values()], width, label='With Warmup')
    plt.bar(x + width/2, [v[1] for v in component_means.values()], width, label='Without Warmup')
    plt.title('Average Time Components')
    plt.xlabel('Component')
    plt.ylabel('Time (ms)')
    plt.xticks(x, component_means.keys())
    plt.legend()
    plt.savefig(plot_dir / 'time_components.png')
    plt.close()

    return results

In [10]:
# Run analysis
results = load_and_analyze_results('edge_tpu_inference.json')

Edge TPU Performance Analysis

1. Basic Performance Metrics:
------------------------------
                     Metric  With Warmup  Without Warmup
  First Inference Time (ms)     3.296054        3.296054
Average Inference Time (ms)     3.660090        3.674524
    Inference Time Std (ms)     0.317419        0.447161
           Throughput (FPS)   273.217293      272.144074

2. Detailed Timing Analysis:
------------------------------
             Metric  With Warmup  Without Warmup
inference Mean (ms)     3.660090        3.674524
 inference Std (ms)     0.317419        0.447161
 inference P95 (ms)     4.202994        4.247833
 inference P99 (ms)     4.308867        4.317633
   invoke Mean (ms)     3.046791        3.070708
    invoke Std (ms)     0.293177        0.412730
    invoke P95 (ms)     3.476361        3.661354
    invoke P99 (ms)     3.686069        3.701945
 overhead Mean (ms)     0.613299        0.603816
  overhead Std (ms)     0.098051        0.108635
  overhead P95 (ms)    