# Thinking in Tensors and Graphs
- Explain why Python loops slow down ML workloads.
- Rewrite naïve tensor operations using vectorization.
- Use TensorFlow-native ops instead of NumPy inside pipelines.
- Understand how @tf.function compiles Python into optimized graphs.
- Recognize performance patterns used in production ML systems.

In [18]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import time
import matplotlib.pyplot as plt

print("TensorFlow version:", tf.__version__)

gpus = tf.config.list_physical_devices("GPU")
print("GPUs detected:", gpus)


TensorFlow version: 2.9.1
GPUs detected: []


In [19]:
# Vectorization Example
# Vectorized tensor operations are dramatically faster than Python loops because they execute 
# in optimized C++ kernels and can leverage parallel hardware.

import numpy as np
import time

x = tf.random.uniform((10000, 100))

# Python loop (slow)
start = time.time()
result = []
for i in range(10000):
    result.append(tf.reduce_sum(x[i]))
loop_time = time.time() - start

# Vectorized (fast)
start = time.time()
result2 = tf.reduce_sum(x, axis=1)
vector_time = time.time() - start

print("Loop time:", loop_time)
print("Vectorized time:", vector_time)


Loop time: 2.1757538318634033
Vectorized time: 0.0


In [20]:
# Simulated Image Batch

BATCH_SIZE = 64
IMG_SIZE = 128

images = tf.random.uniform((BATCH_SIZE, IMG_SIZE, IMG_SIZE, 3))

# Naïve Python Loop (Bad Practice)

def slow_center(images):
    output = []
    for img in images:
        mean = tf.reduce_mean(img)
        output.append(img - mean)
    return tf.stack(output)

start = time.time()
_ = slow_center(images)
slow_time = time.time() - start

print("Slow version time:", slow_time)

# Vectorized Version (Good Practice)

def fast_center(images):
    mean = tf.reduce_mean(images, axis=(1,2,3), keepdims=True)
    return images - mean

start = time.time()
_ = fast_center(images)
fast_time = time.time() - start

print("Fast version time:", fast_time)


Slow version time: 0.02286243438720703
Fast version time: 0.0030002593994140625


In [23]:
# TensorFlow Ops vs NumPy Ops
# Mixing NumPy inside a TensorFlow pipeline forces execution back to Python.

# NumPy in tf.data (Bad Pattern)

def numpy_preprocess(image, label):
    image = image.numpy() / 255.0
    return image, label

# TensorFlow-Native Version (Good Pattern)

def tf_preprocess(image, label):
    image = tf.cast(image, tf.float32) / 255.0
    return image, label


In [24]:
# tf.function Speedup
# TensorFlow can convert Python functions into optimized computation graphs for faster execution

@tf.function
def compute(x):
    return tf.reduce_sum(tf.square(x))

x = tf.random.uniform((1000, 1000))

%timeit compute(x)

544 μs ± 7.01 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [25]:
# From Eager to Graph Execution (@tf.function)
# TensorFlow 2 runs eagerly by default.
# @tf.function: 
# --> Traces the function
# --> Builds computation graph
# --> Optimizes execution
# --> Removes Python overhead
# Production systems rely on graph execution.

# Eager Function

def compute_loss(x):
    return tf.reduce_sum(tf.square(x))

x = tf.random.uniform((1000, 1000))

start = time.time()
_ = compute_loss(x)
eager_time = time.time() - start

print("Eager execution time:", eager_time)

# Graph-Compiled Function

@tf.function
def compute_loss_graph(x):
    return tf.reduce_sum(tf.square(x))

# First call includes tracing cost
compute_loss_graph(x)

start = time.time()
_ = compute_loss_graph(x)
graph_time = time.time() - start

print("Graph execution time:", graph_time)


Eager execution time: 0.0010116100311279297
Graph execution time: 0.0


In [26]:
import tensorflow as tf
import time

# Simple dense model

model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation="relu"),
    tf.keras.layers.Dense(10)
])

optimizer = tf.keras.optimizers.Adam()
loss_fn = tf.keras.losses.MeanSquaredError()

# Synthetic batch

x = tf.random.uniform((64, 100))
y = tf.random.uniform((64, 10))

# Eager Training Step

def train_step_eager(x, y):
    with tf.GradientTape() as tape:
        preds = model(x, training=True)
        loss = loss_fn(y, preds)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss

# Graph-Compiled Training Step

@tf.function
def train_step_graph(x, y):
    with tf.GradientTape() as tape:
        preds = model(x, training=True)
        loss = loss_fn(y, preds)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss

# Benchmark Function

def benchmark(func, x, y, n_iters=500):
    start = time.time()
    for _ in range(n_iters):
        func(x, y)
    return time.time() - start

# Warmup (Important for Fair Comparison) - first call includes tracing cost

train_step_graph(x, y)

# Run Benchmark

eager_time = benchmark(train_step_eager, x, y)
graph_time = benchmark(train_step_graph, x, y)

print(f"Eager training time (500 steps): {eager_time:.4f} sec")
print(f"Graph training time (500 steps): {graph_time:.4f} sec")


Eager training time (500 steps): 3.2288 sec
Graph training time (500 steps): 0.4553 sec


In [27]:
# Applying @tf.function to a Training Step

# Minimal Model

model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(128,128,3)),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(5, activation="softmax")
])

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam()

# Custom Training Step

@tf.function
def train_step(images, labels):
    with tf.GradientTape() as tape:
        predictions = model(images, training=True)
        loss = loss_fn(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss


# TensorFlow Performance Checklist

When building ML pipelines:

✅ Batch operations

✅ Use tf.data

✅ Avoid Python loops over tensors

✅ Prefer TensorFlow ops over NumPy

✅ Use @tf.function for custom training steps

✅ Profile before optimizing

✅ Keep preprocessing on-device (not Python)