PyTorch Tensor
Part 1 — Thinking in Tensors
Tensors are PyTorch’s core data structure (like NumPy arrays, but with GPU & autograd support).
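As a quick sketch of those two points (a CUDA GPU may or may not be present, so the device is chosen at runtime), tensors convert to and from NumPy and can be moved between devices:

import numpy as np
import torch

arr = np.array([1.0, 2.0, 3.0])
t = torch.from_numpy(arr)                   # shares memory with the NumPy array
device = "cuda" if torch.cuda.is_available() else "cpu"
t_dev = t.to(device)                        # copies the tensor to the GPU when one is available
print(t_dev.device)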
import torch

# Scalars
a = torch.tensor(3.14)                  # default dtype: float32
a * a

b = torch.tensor(5, dtype=torch.int32)  # integer tensor
b
# 1D vector
v = torch.tensor([1, 2, 3])
# 2D matrix
m = torch.tensor([[1., 2.], [3., 4.]])
torch.tensor([[1, 2], [3, 4]]) # dtype: torch.int64
torch.tensor([[1., 2.], [3., 4.]]) # dtype: torch.float32
m = torch.tensor([[1., 2.], [3., 4.]])
m = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32)

# Random tensors
r = torch.rand(2, 3) # uniform [0,1)
n = torch.randn(2, 3) # normal dist.
# Shape & dtype
print(r.shape, r.dtype)
x = torch.ones(3, 1) # shape (3,1)
y = torch.ones(1, 4) # shape (1,4)
z = x + y # shape (3,4) after broadcasting
print(z)
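If you only want to check what the broadcast result would be, torch.broadcast_shapes (available in recent PyTorch versions) applies the same rule to the shapes alone:

print(torch.broadcast_shapes((3, 1), (1, 4)))   # torch.Size([3, 4])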
t = torch.arange(6).reshape(2, 3)
view_t = t.view(3, 2) # reshapes without copying
copy_t = t.clone().reshape(3, 2) # makes a copy
t = torch.arange(6)
t
t = torch.arange(6).reshape(2,3)
t
view_t = t.view(3, 2) # reshapes without copying
t
view_t[0, 0] = 99
print(t) # the original tensor `t` will now also have 99 in the same memory spot
copy_t = t.clone().reshape(3, 2) # makes a copy
copy_t
copy_t[0, 0] = 77
print(t) # original stays unchanged
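A related difference worth knowing: view() requires the tensor's memory layout to be contiguous, while reshape() quietly makes a copy when a view is impossible. A minimal sketch:

t2 = torch.arange(6).reshape(2, 3).t()   # transposing makes the layout non-contiguous
# t2.view(6)                             # would raise a RuntimeError here
flat = t2.reshape(6)                     # works: reshape copies when it has to
print(flat)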
PyTorch tracks operations on tensors that have requires_grad=True, so gradients can be computed automatically with .backward().
x = torch.tensor(2.0, requires_grad=True)
y = x**2 + 3*x + 1
y.backward() # dy/dx computed automatically
print(x.grad) # derivative at x=2 → 2*x + 3 = 7
with torch.no_grad():
    y = x**2  # no gradient history is recorded
Student: what is gradient accumulation?
Teacher C:
By default, every call to .backward() adds the new gradient into .grad instead of overwriting it. That accumulation is why gradients must be zeroed between updates, and it is also a feature: you can accumulate gradients over several small batches and only then take an optimizer step, simulating a larger batch size. For example:
import torch

x = torch.tensor(2.0, requires_grad=True)
for _ in range(2):
    y = x * 3
    y.backward()
    print("Grad after backward:", x.grad)   # 3.0, then 6.0 (accumulated)
optimizer.zero_grad() # Preferred when using optimizers
# or
x.grad.zero_() # For manual param updates
accum_steps = 4
optimizer.zero_grad()
for i, (inputs, targets) in enumerate(data_loader):
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward()                      # gradients accumulate in .grad
    if (i + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
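One common refinement (a convention, not something PyTorch requires): divide the loss by accum_steps so the accumulated gradient matches the average over the effective batch rather than the sum of the small batches:

loss = criterion(outputs, targets) / accum_steps
loss.backward()   # accumulated grads now correspond to one large averaged batch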
Student: just to confirm gradient is percentage change in y for 1% change in X--right?
Teacher C:
Not quite. The gradient dy/dx is an absolute rate of change: how much y changes per one-unit change in x, evaluated at a point. A "% change in y per 1% change in x" is an elasticity, which would be (dy/dx) * (x / y). In the example above, at x = 2 the gradient is 7, regardless of what percentage of x or y that represents.
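A quick numeric check of that distinction, reusing y = x**2 + 3*x + 1 from above:

x = torch.tensor(2.0, requires_grad=True)
y = x**2 + 3*x + 1
y.backward()
print(x.grad)                     # tensor(7.) -> absolute rate of change dy/dx
print((x.grad * x / y).item())    # ~1.27     -> elasticity: % change in y per 1% change in x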
---
Student: explain gradient clipping -- what is the procedure to prevent exploding gradients? Explain torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) word by word.
Teacher C:
Gradient clipping rescales the gradients after backward() but before the optimizer step whenever their combined magnitude exceeds a threshold. This prevents exploding gradients, where one bad batch produces huge gradients that throw the weights far off. The usual call is:

torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
for batch_inputs, batch_labels in data_loader:
    optimizer.zero_grad()
    outputs = model(batch_inputs)
    loss = criterion(outputs, batch_labels)
    loss.backward()
    # Clip gradients BEFORE the optimizer step
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
Word by word:
torch.nn.utils: PyTorch's utilities submodule.
clip_grad_norm_: clips the gradients in place (the trailing underscore marks an in-place operation) so that their total norm does not exceed the limit; it returns the total norm measured before clipping.
model.parameters(): the iterable of parameters whose .grad tensors get rescaled.
max_norm=1.0: the maximum allowed total gradient norm; if the actual norm is larger, every gradient is scaled by max_norm / total_norm.
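Since clip_grad_norm_ returns the total norm measured before clipping, you can log it to watch for exploding gradients (same assumed model/optimizer names as above):

total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
print("Gradient norm before clipping:", float(total_norm))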
# Data
X = torch.rand(100, 1)
y = 3 * X + 2 + 0.1 * torch.randn(100, 1) # add small noise
# Params (start random, require grads)
w = torch.randn(1, requires_grad=True)
b = torch.randn(1, requires_grad=True)
lr = 0.1
for epoch in range(100):
    # Forward pass
    y_pred = w * X + b
    loss = torch.mean((y_pred - y)**2)
    # Backward pass
    loss.backward()
    # Update
    with torch.no_grad():
        w -= lr * w.grad
        b -= lr * b.grad
    # Zero grads
    w.grad.zero_()
    b.grad.zero_()
print(f"Learned w: {w.item():.3f}, b: {b.item():.3f}")Student: b = torch.randn(1, requires_grad=True)--explain this word-by-word. b is constant --why is measuring gradient with it
Teacher C:
b = torch.randn(1, requires_grad=True)
Word by word: b is the bias parameter; torch.randn(1) creates a 1-element tensor drawn from a standard normal distribution, so b starts at a random value; requires_grad=True tells autograd to track every operation involving b so that loss.backward() can fill b.grad. b is not a constant: it is a learnable parameter, and its gradient tells us how to nudge it to reduce the loss. After training it converges toward the true intercept (about 2 in this example).
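A tiny standalone check (not part of the regression loop) showing that b really does collect a gradient:

b = torch.randn(1, requires_grad=True)
out = (3 * b).sum()
out.backward()
print(b.grad)   # tensor([3.]) -> d(out)/db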
Student: can you explain the # Zero grads lines, w.grad.zero_() and b.grad.zero_() -- what are these two lines doing?
Teacher C:
loss.backward()
# Zero grads
w.grad.zero_()
b.grad.zero_()
w.grad.zero_() resets the stored gradient to zero (and b.grad.zero_() does the same for b). Without it, gradients from successive backward() calls add up:

# Suppose the gradient from batch 1 is 0.3
# and the gradient from batch 2 is 0.2
# First backward:  w.grad = 0.3
# Second backward: w.grad = 0.5   (0.3 + 0.2, accumulated!)

If you use an optimizer instead of manual updates, optimizer.zero_grad() does the same job for every registered parameter:
optimizer = torch.optim.SGD([w, b], lr=0.1)
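A sketch of one full iteration with the optimizer, reusing w, b, X, y from the regression example above; step() and zero_grad() replace the manual update and manual zeroing:

optimizer = torch.optim.SGD([w, b], lr=0.1)
y_pred = w * X + b
loss = torch.mean((y_pred - y) ** 2)
optimizer.zero_grad()   # clear any stale gradients
loss.backward()         # fill w.grad and b.grad
optimizer.step()        # applies w -= lr * w.grad and b -= lr * b.grad in place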
# Dummy binary data
X = torch.randn(100, 2)
y = (X[:, 0] + X[:, 1] > 0).float().view(-1, 1) # label=1 if sum>0
w = torch.randn(2, 1, requires_grad=True)
b = torch.randn(1, requires_grad=True)
lr = 0.1
def sigmoid(z):
    return 1 / (1 + torch.exp(-z))
for epoch in range(200):
    # Forward
    z = X @ w + b
    y_pred = sigmoid(z)
    loss = -torch.mean(y * torch.log(y_pred + 1e-8) + (1 - y) * torch.log(1 - y_pred + 1e-8))
    # Backward
    loss.backward()
    # Update
    with torch.no_grad():
        w -= lr * w.grad
        b -= lr * b.grad
    w.grad.zero_()
    b.grad.zero_()
print("Final loss:", loss.item())y = (X[:, 0] + X[:, 1] > 0).float().view(-1, 1) # label=1 if sum>0
import torch
X = torch.tensor([
[ 2.0, 1.0], # sum = 3.0 (>0) → 1.0
[-1.0, 0.5], # sum = -0.5 (<=0) → 0.0
[-2.0, -3.0], # sum = -5.0 (<=0) → 0.0
[ 0.5, 0.5] # sum = 1.0 (>0) → 1.0
])
y = (X[:, 0] + X[:, 1] > 0).float().view(-1, 1)
print(y)
If you wanted labels of -1 instead of 0, torch.where can do that:

y = torch.where(X[:, 0] + X[:, 1] > 0, 1.0, -1.0).view(-1, 1)

Student: view(-1, 1) → reshapes from shape (N,) to (N, 1) but does not turn zeros into -1. Is this shape (N,) a vector and (N, 1) a matrix -- what is the difference?
Teacher C:
Yes. A shape of (N,) is a 1-D tensor: a flat vector with N elements and a single dimension. A shape of (N, 1) is a 2-D tensor: a matrix with N rows and one column. The values are the same; only the number of dimensions differs, and that matters for broadcasting and matrix multiplication (a (100, 1) target lines up element-wise with a (100, 1) prediction, whereas a (100,) target would broadcast differently). Concretely:
v = torch.tensor([1., 2., 3.])
print(v.shape) # torch.Size([3])
m = torch.tensor([[1.], [2.], [3.]])
print(m.shape) # torch.Size([3, 1])
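Converting between the two shapes is a one-liner: unsqueeze adds a size-1 dimension and squeeze removes one:

print(v.unsqueeze(1).shape)   # torch.Size([3, 1])  -> column matrix
print(m.squeeze(1).shape)     # torch.Size([3])     -> flat vector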
Student: z = X @ w + b---what is @ here
Teacher C:
In z = X @ w + b, the @ symbol is Python's matrix multiplication operator: X @ w is a matrix product, equivalent to torch.matmul(X, w), and the bias b is then added via broadcasting. A small example with explicit shapes:
import torch
X = torch.tensor([[1., 2.],
[3., 4.],
[5., 6.]]) # shape (3, 2)
w = torch.tensor([[0.5],
[1.0]]) # shape (2, 1)
b = torch.tensor([0.1]) # shape (1,)
z = X @ w + b
print(z)
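Since @ is shorthand for torch.matmul, this computes exactly the same thing:

z2 = torch.matmul(X, w) + b
print(torch.allclose(z, z2))   # True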
Student: loss = -torch.mean(y * torch.log(y_pred + 1e-8) + (1-y) * torch.log(1-y_pred + 1e-8))--what is 1e-8 here
Teacher C:
In loss = -torch.mean(y * torch.log(y_pred + 1e-8) + (1-y) * torch.log(1-y_pred + 1e-8)), the 1e-8 is a tiny epsilon added purely for numerical stability: if y_pred were exactly 0 or exactly 1, torch.log(0) would return -inf and the loss would become inf or NaN. Adding 1e-8 keeps the log argument strictly positive without meaningfully changing the loss value. In practice, the built-in losses handle this stability internally:
torch.nn.BCELoss()
# or
torch.nn.BCEWithLogitsLoss()
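As a sketch of how the built-in loss replaces the manual epsilon trick: BCEWithLogitsLoss takes the raw logits z (no sigmoid needed) and handles the numerical stability internally. Reusing X, w, b, y from the logistic-regression example above:

criterion = torch.nn.BCEWithLogitsLoss()
z = X @ w + b              # raw logits, no sigmoid applied
loss = criterion(z, y)     # numerically stable binary cross-entropy
loss.backward()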