PyTorch Tensor
Part 1 — Thinking in Tensors
Tensors are PyTorch’s core data structure (like NumPy arrays, but with GPU & autograd support).
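As a quick sketch of those two points (a CUDA GPU may or may not be present, so the device is chosen at runtime), tensors convert to and from NumPy and can be moved between devices:

import numpy as np
import torch

arr = np.array([1.0, 2.0, 3.0])
t = torch.from_numpy(arr)                   # shares memory with the NumPy array
device = "cuda" if torch.cuda.is_available() else "cpu"
t_dev = t.to(device)                        # copies the tensor to the GPU when one is available
print(t_dev.device)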
import torch

# Scalars
a = torch.tensor(3.14)                  # default dtype: float32
a * a

b = torch.tensor(5, dtype=torch.int32)  # integer tensor
b
# 1D vector
v = torch.tensor([1, 2, 3])
# 2D matrix
m = torch.tensor([[1., 2.], [3., 4.]])
torch.tensor([[1, 2], [3, 4]]) # dtype: torch.int64
torch.tensor([[1., 2.], [3., 4.]]) # dtype: torch.float32
m = torch.tensor([[1., 2.], [3., 4.]])
m = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32)

# Random tensors
r = torch.rand(2, 3) # uniform [0,1)
n = torch.randn(2, 3) # normal dist.
# Shape & dtype
print(r.shape, r.dtype)
x = torch.ones(3, 1) # shape (3,1)
y = torch.ones(1, 4) # shape (1,4)
z = x + y # shape (3,4) after broadcasting
print(z)
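If you only want to check what the broadcast result would be, torch.broadcast_shapes (available in recent PyTorch versions) applies the same rule to the shapes alone:

print(torch.broadcast_shapes((3, 1), (1, 4)))   # torch.Size([3, 4])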
t = torch.arange(6).reshape(2, 3)
view_t = t.view(3, 2) # reshapes without copying
copy_t = t.clone().reshape(3, 2) # makes a copy
t = torch.arange(6)
t
t = torch.arange(6).reshape(2,3)
t
view_t = t.view(3, 2) # reshapes without copying
t
view_t[0, 0] = 99
print(t) # the original tensor `t` will now also have 99 in the same memory spot
copy_t = t.clone().reshape(3, 2) # makes a copy
copy_t
copy_t[0, 0] = 77
print(t) # original stays unchanged
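A related difference worth knowing: view() requires the tensor's memory layout to be contiguous, while reshape() quietly makes a copy when a view is impossible. A minimal sketch:

t2 = torch.arange(6).reshape(2, 3).t()   # transposing makes the layout non-contiguous
# t2.view(6)                             # would raise a RuntimeError here
flat = t2.reshape(6)                     # works: reshape copies when it has to
print(flat)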
PyTorch tracks operations on tensors that have requires_grad=True, so gradients can be computed automatically with .backward().
x = torch.tensor(2.0, requires_grad=True)
y = x**2 + 3*x + 1
y.backward() # dy/dx computed automatically
print(x.grad) # derivative at x=2 → 2*x + 3 = 7
with torch.no_grad():
    y = x**2  # no gradient history is recorded
Student: what is gradient accumulation?
Teacher C:
By default, every call to .backward() adds the new gradient into .grad instead of overwriting it. That accumulation is why gradients must be zeroed between updates, and it is also a feature: you can accumulate gradients over several small batches and only then take an optimizer step, simulating a larger batch size. For example:
import torch

x = torch.tensor(2.0, requires_grad=True)
for _ in range(2):
    y = x * 3
    y.backward()
    print("Grad after backward:", x.grad)   # 3.0, then 6.0 (accumulated)
optimizer.zero_grad() # Preferred when using optimizers
# or
x.grad.zero_() # For manual param updates
accum_steps = 4
optimizer.zero_grad()
for i, (inputs, targets) in enumerate(data_loader):
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward()                      # gradients accumulate in .grad
    if (i + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
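One common refinement (a convention, not something PyTorch requires): divide the loss by accum_steps so the accumulated gradient matches the average over the effective batch rather than the sum of the small batches:

loss = criterion(outputs, targets) / accum_steps
loss.backward()   # accumulated grads now correspond to one large averaged batch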
Student: just to confirm gradient is percentage change in y for 1% change in X--right?
Teacher C:
Not quite. The gradient dy/dx is an absolute rate of change: how much y changes per one-unit change in x, evaluated at a point. A "% change in y per 1% change in x" is an elasticity, which would be (dy/dx) * (x / y). In the example above, at x = 2 the gradient is 7, regardless of what percentage of x or y that represents.
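A quick numeric check of that distinction, reusing y = x**2 + 3*x + 1 from above:

x = torch.tensor(2.0, requires_grad=True)
y = x**2 + 3*x + 1
y.backward()
print(x.grad)                     # tensor(7.) -> absolute rate of change dy/dx
print((x.grad * x / y).item())    # ~1.27     -> elasticity: % change in y per 1% change in x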
---
Student: explain gradient clipping -- what is the procedure to prevent exploding gradients? Explain torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) word by word.
Teacher C:
Gradient clipping rescales the gradients after backward() but before the optimizer step whenever their combined magnitude exceeds a threshold. This prevents exploding gradients, where one bad batch produces huge gradients that throw the weights far off. The usual call is:

torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
for batch_inputs, batch_labels in data_loader:
    optimizer.zero_grad()
    outputs = model(batch_inputs)
    loss = criterion(outputs, batch_labels)
    loss.backward()
    # Clip gradients BEFORE the optimizer step
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
Word by word:
torch.nn.utils: PyTorch's utilities submodule.
clip_grad_norm_: clips the gradients in place (the trailing underscore marks an in-place operation) so that their total norm does not exceed the limit; it returns the total norm measured before clipping.
model.parameters(): the iterable of parameters whose .grad tensors get rescaled.
max_norm=1.0: the maximum allowed total gradient norm; if the actual norm is larger, every gradient is scaled by max_norm / total_norm.
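Since clip_grad_norm_ returns the total norm measured before clipping, you can log it to watch for exploding gradients (same assumed model/optimizer names as above):

total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
print("Gradient norm before clipping:", float(total_norm))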
# Data
X = torch.rand(100, 1)
y = 3 * X + 2 + 0.1 * torch.randn(100, 1) # add small noise
# Params (start random, require grads)
w = torch.randn(1, requires_grad=True)
b = torch.randn(1, requires_grad=True)
lr = 0.1
for epoch in range(100):
    # Forward pass
    y_pred = w * X + b
    loss = torch.mean((y_pred - y)**2)
    # Backward pass
    loss.backward()
    # Update
    with torch.no_grad():
        w -= lr * w.grad
        b -= lr * b.grad
    # Zero grads
    w.grad.zero_()
    b.grad.zero_()
print(f"Learned w: {w.item():.3f}, b: {b.item():.3f}")Student: b = torch.randn(1, requires_grad=True)--explain this word-by-word. b is constant --why is measuring gradient with it
Teacher C:
b = torch.randn(1, requires_grad=True)
Word by word: b is the bias parameter; torch.randn(1) creates a 1-element tensor drawn from a standard normal distribution, so b starts at a random value; requires_grad=True tells autograd to track every operation involving b so that loss.backward() can fill b.grad. b is not a constant: it is a learnable parameter, and its gradient tells us how to nudge it to reduce the loss. After training it converges toward the true intercept (about 2 in this example).
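A tiny standalone check (not part of the regression loop) showing that b really does collect a gradient:

b = torch.randn(1, requires_grad=True)
out = (3 * b).sum()
out.backward()
print(b.grad)   # tensor([3.]) -> d(out)/db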
Student: can you explain the # Zero grads lines, w.grad.zero_() and b.grad.zero_() -- what are these two lines doing?
Teacher C:
loss.backward()
# Zero grads
w.grad.zero_()
b.grad.zero_()
w.grad.zero_() resets the stored gradient to zero (and b.grad.zero_() does the same for b). Without it, gradients from successive backward() calls add up:

# Suppose the gradient from batch 1 is 0.3
# and the gradient from batch 2 is 0.2
# First backward:  w.grad = 0.3
# Second backward: w.grad = 0.5   (0.3 + 0.2, accumulated!)

If you use an optimizer instead of manual updates, optimizer.zero_grad() does the same job for every registered parameter:
optimizer = torch.optim.SGD([w, b], lr=0.1)
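A sketch of one full iteration with the optimizer, reusing w, b, X, y from the regression example above; step() and zero_grad() replace the manual update and manual zeroing:

optimizer = torch.optim.SGD([w, b], lr=0.1)
y_pred = w * X + b
loss = torch.mean((y_pred - y) ** 2)
optimizer.zero_grad()   # clear any stale gradients
loss.backward()         # fill w.grad and b.grad
optimizer.step()        # applies w -= lr * w.grad and b -= lr * b.grad in place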
# Dummy binary data
X = torch.randn(100, 2)
y = (X[:, 0] + X[:, 1] > 0).float().view(-1, 1) # label=1 if sum>0
w = torch.randn(2, 1, requires_grad=True)
b = torch.randn(1, requires_grad=True)
lr = 0.1
def sigmoid(z):
    return 1 / (1 + torch.exp(-z))
for epoch in range(200):
    # Forward
    z = X @ w + b
    y_pred = sigmoid(z)
    loss = -torch.mean(y * torch.log(y_pred + 1e-8) + (1 - y) * torch.log(1 - y_pred + 1e-8))
    # Backward
    loss.backward()
    # Update
    with torch.no_grad():
        w -= lr * w.grad
        b -= lr * b.grad
    w.grad.zero_()
    b.grad.zero_()
print("Final loss:", loss.item())y = (X[:, 0] + X[:, 1] > 0).float().view(-1, 1) # label=1 if sum>0
import torch
X = torch.tensor([
[ 2.0, 1.0], # sum = 3.0 (>0) → 1.0
[-1.0, 0.5], # sum = -0.5 (<=0) → 0.0
[-2.0, -3.0], # sum = -5.0 (<=0) → 0.0
[ 0.5, 0.5] # sum = 1.0 (>0) → 1.0
])
y = (X[:, 0] + X[:, 1] > 0).float().view(-1, 1)
print(y)
If you wanted labels of -1 instead of 0, torch.where can do that:

y = torch.where(X[:, 0] + X[:, 1] > 0, 1.0, -1.0).view(-1, 1)

Student: view(-1, 1) → reshapes from shape (N,) to (N, 1) but does not turn zeros into -1. Is this shape (N,) a vector and (N, 1) a matrix -- what is the difference?
Teacher C:
Yes. A shape of (N,) is a 1-D tensor: a flat vector with N elements and a single dimension. A shape of (N, 1) is a 2-D tensor: a matrix with N rows and one column. The values are the same; only the number of dimensions differs, and that matters for broadcasting and matrix multiplication (a (100, 1) target lines up element-wise with a (100, 1) prediction, whereas a (100,) target would broadcast differently). Concretely:
v = torch.tensor([1., 2., 3.])
print(v.shape) # torch.Size([3])
m = torch.tensor([[1.], [2.], [3.]])
print(m.shape) # torch.Size([3, 1])
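Converting between the two shapes is a one-liner: unsqueeze adds a size-1 dimension and squeeze removes one:

print(v.unsqueeze(1).shape)   # torch.Size([3, 1])  -> column matrix
print(m.squeeze(1).shape)     # torch.Size([3])     -> flat vector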
Student: z = X @ w + b---what is @ here
Teacher C:
In z = X @ w + b, the @ symbol is Python's matrix multiplication operator: X @ w is a matrix product, equivalent to torch.matmul(X, w), and the bias b is then added via broadcasting. A small example with explicit shapes:
import torch
X = torch.tensor([[1., 2.],
[3., 4.],
[5., 6.]]) # shape (3, 2)
w = torch.tensor([[0.5],
[1.0]]) # shape (2, 1)
b = torch.tensor([0.1]) # shape (1,)
z = X @ w + b
print(z)
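Since @ is shorthand for torch.matmul, this computes exactly the same thing:

z2 = torch.matmul(X, w) + b
print(torch.allclose(z, z2))   # True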
Student: loss = -torch.mean(y * torch.log(y_pred + 1e-8) + (1-y) * torch.log(1-y_pred + 1e-8))--what is 1e-8 here
Teacher C:
In loss = -torch.mean(y * torch.log(y_pred + 1e-8) + (1-y) * torch.log(1-y_pred + 1e-8)), the 1e-8 is a tiny epsilon added purely for numerical stability: if y_pred were exactly 0 or exactly 1, torch.log(0) would return -inf and the loss would become inf or NaN. Adding 1e-8 keeps the log argument strictly positive without meaningfully changing the loss value. In practice, the built-in losses handle this stability internally:
torch.nn.BCELoss()
# or
torch.nn.BCEWithLogitsLoss()
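As a sketch of how the built-in loss replaces the manual epsilon trick: BCEWithLogitsLoss takes the raw logits z (no sigmoid needed) and handles the numerical stability internally. Reusing X, w, b, y from the logistic-regression example above:

criterion = torch.nn.BCEWithLogitsLoss()
z = X @ w + b              # raw logits, no sigmoid applied
loss = criterion(z, y)     # numerically stable binary cross-entropy
loss.backward()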