
PyTorch Tensors


Part 1 — Thinking in Tensors

Tensors are PyTorch’s core data structure: like NumPy arrays, but with GPU and autograd support.
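To make the NumPy comparison concrete, here is a minimal sketch (assuming NumPy is installed and using the usual "cuda" device name) showing conversion in both directions and moving a tensor to the GPU when one is available:

import numpy as np
import torch

arr = np.array([1.0, 2.0, 3.0])
t = torch.from_numpy(arr)        # shares memory with the NumPy array
back = t.numpy()                 # back to NumPy (CPU tensors only)

device = "cuda" if torch.cuda.is_available() else "cpu"
t_gpu = torch.ones(2, 2).to(device)   # moves to the GPU when available
print(t, back, t_gpu.device)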

Creating tensors

import torch

# Scalars
a = torch.tensor(3.14)    # default dtype: float32
a*a

Output:

tensor(9.8596)
import torch

# Scalars
b = torch.tensor(5, dtype=torch.int32)   # integer tensor
b
Output:

tensor(5, dtype=torch.int32)
# 1D vector
v = torch.tensor([1, 2, 3])
# 2D matrix
m = torch.tensor([[1., 2.], [3., 4.]])
torch.tensor([[1, 2], [3, 4]])     # dtype: torch.int64
torch.tensor([[1., 2.], [3., 4.]]) # dtype: torch.float32
m = torch.tensor([[1., 2.], [3., 4.]])
m = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32)
# Random tensors
r = torch.rand(2, 3)                     # uniform [0,1)
n = torch.randn(2, 3)                    # normal dist.

# Shape & dtype
print(r.shape, r.dtype)
Output:

torch.Size([2, 3]) torch.float32
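A few other factory functions come up constantly alongside torch.tensor, torch.rand and torch.randn (a quick sketch, not an exhaustive list):

zeros = torch.zeros(2, 3)        # all zeros, shape (2, 3)
ones  = torch.ones(2, 3)         # all ones
rng   = torch.arange(0, 10, 2)   # tensor([0, 2, 4, 6, 8])
lin   = torch.linspace(0, 1, 5)  # 5 evenly spaced points from 0 to 1
eye   = torch.eye(3)             # 3x3 identity matrix
print(zeros.shape, rng, lin, eye.shape)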

Broadcasting

Broadcasting lets tensors with different but compatible shapes be combined: dimensions of size 1 are automatically stretched to match the other tensor.
x = torch.ones(3, 1)    # shape (3,1)
y = torch.ones(1, 4)    # shape (1,4)
z = x + y               # shape (3,4) after broadcasting
print(z)
Output:

tensor([[2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.]])
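Broadcasting only applies when the shapes are compatible: comparing dimensions from the right, each pair must be equal or contain a 1. A small sketch of what happens otherwise:

a = torch.ones(3, 2)
b = torch.ones(3)        # trailing dims: 2 vs 3 -> incompatible
try:
    a + b
except RuntimeError as e:
    print("Broadcast failed:", e)

c = torch.ones(2)        # trailing dim 2 matches -> broadcasts to (3, 2)
print((a + c).shape)     # torch.Size([3, 2])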
t = torch.arange(6).reshape(2, 3)
view_t = t.view(3, 2)        # reshapes without copying
copy_t = t.clone().reshape(3, 2)  # makes a copy
t = torch.arange(6)
t

Output:

tensor([0, 1, 2, 3, 4, 5])
t = torch.arange(6).reshape(2,3)
t

Output:

tensor([[0, 1, 2],
        [3, 4, 5]])
view_t = t.view(3, 2)  # reshapes without copying
t
Output:

tensor([[0, 1, 2],
        [3, 4, 5]])
view_t[0, 0] = 99
print(t)   # the original tensor `t` will now also have 99 in the same memory spot
Output:

tensor([[99,  1,  2],
        [ 3,  4,  5]])
copy_t = t.clone().reshape(3, 2)  # makes a copy
copy_t
copy_t[0, 0] = 77
print(t)   # original stays unchanged

Part 2 — Autograd Basics

PyTorch tracks operations on tensors with requires_grad=True.

.requires_grad

x = torch.tensor(2.0, requires_grad=True)
y = x**2 + 3*x + 1
y.backward()   # dy/dx computed automatically
print(x.grad)  # derivative at x=2 → 2*x + 3 = 7

Output:

tensor(7.)
with torch.no_grad():
    y = x**2  # no gradient history is recorded

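A small sketch of what this means in practice: results computed inside torch.no_grad() carry no gradient history, so you cannot back-propagate through them.

x = torch.tensor(2.0, requires_grad=True)

y = x**2
print(y.requires_grad)   # True: autograd recorded the squaring

with torch.no_grad():
    z = x**2
print(z.requires_grad)   # False: no history recorded, z.backward() would fail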

Student: What is gradient accumulation?

Teacher C: Gradient accumulation is what happens because .backward() adds new gradients into .grad instead of overwriting it: unless you reset the gradients between calls, they keep summing up. You can see it with a tiny example:

import torch

x = torch.tensor(2.0, requires_grad=True)

for _ in range(2):
    y = x * 3
    y.backward()
    print("Grad after backward:", x.grad)
Output:

Grad after backward: tensor(3.)
Grad after backward: tensor(6.)

The second value is 6, not 3, because the new gradient was added to the one already stored. To stop gradients from piling up, reset them before the next backward pass:
optimizer.zero_grad()  # Preferred when using optimizers
# or
x.grad.zero_()         # For manual param updates
Sometimes the accumulation is used deliberately: you can simulate a larger batch size by calling .backward() on several mini-batches and only stepping the optimizer every accum_steps batches:
accum_steps = 4
optimizer.zero_grad()

for i, (inputs, targets) in enumerate(data_loader):
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward()  # accumulate grads

    if (i + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
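One detail the loop above omits: the loss is usually divided by accum_steps so that the accumulated gradient matches the average over the larger effective batch rather than its sum. A sketch with that one change (same hypothetical model, criterion, optimizer and data_loader names as above):

accum_steps = 4
optimizer.zero_grad()

for i, (inputs, targets) in enumerate(data_loader):
    outputs = model(inputs)
    loss = criterion(outputs, targets) / accum_steps  # scale so accumulated grads average out
    loss.backward()

    if (i + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()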

Student: Just to confirm, the gradient is the percentage change in y for a 1% change in x, right?

Teacher C: Not quite. The gradient dy/dx is the rate of change of y per one-unit change in x, not per 1% change. The "percent for percent" quantity is the elasticity, (dy/dx) * (x/y), which coincides with the gradient only when x/y = 1.
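A tiny sketch of the difference (my own example, assuming y = x**2 at x = 2): the gradient is 4, while a 1% increase in x raises y by roughly 2%, i.e. the elasticity is 2.

import torch

x = torch.tensor(2.0, requires_grad=True)
y = x**2
y.backward()

grad = x.grad                                # dy/dx = 2x = 4
elasticity = grad * x.detach() / y.detach()  # (dy/dx) * x / y = 2
print(grad, elasticity)                      # tensor(4.) tensor(2.)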

Student: Explain gradient clipping. What is the procedure to prevent exploding gradients? Explain torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) word by word.

Teacher C: Gradient clipping caps how large the gradients are allowed to get. It is applied after loss.backward() and before optimizer.step(), so that one unlucky batch with huge gradients cannot produce a destructive parameter update (the "exploding gradients" problem).

torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
Word by word: torch.nn.utils is PyTorch's utilities submodule; clip_grad_norm_ ends with an underscore because it modifies the gradients in place; model.parameters() supplies every parameter whose .grad should be considered; and max_norm=1.0 is the threshold. The function computes the total L2 norm over all of those gradients and, if it exceeds max_norm, scales every gradient by max_norm / total_norm so the combined norm becomes exactly max_norm. A typical training loop looks like this:
for batch_inputs, batch_labels in data_loader:
    optimizer.zero_grad()
    outputs = model(batch_inputs)
    loss = criterion(outputs, batch_labels)
    loss.backward()

    # Clip gradients BEFORE the optimiser step
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    optimizer.step()
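For intuition, here is a rough sketch of the arithmetic clip_grad_norm_ performs (a simplified illustration of the idea, not the library implementation, which also handles edge cases such as non-finite norms):

import torch

def clip_grad_norm_sketch(parameters, max_norm):
    grads = [p.grad for p in parameters if p.grad is not None]
    # Global L2 norm over all gradients, as if they were one long vector
    total_norm = torch.sqrt(sum((g ** 2).sum() for g in grads))
    if total_norm > max_norm:
        scale = max_norm / total_norm
        for g in grads:
            g.mul_(scale)   # rescale every gradient in place
    return total_norm

# e.g. clip_grad_norm_sketch(model.parameters(), 1.0) after loss.backward()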
# Data
X = torch.rand(100, 1)
y = 3 * X + 2 + 0.1 * torch.randn(100, 1)  # add small noise

# Params (start random, require grads)
w = torch.randn(1, requires_grad=True)
b = torch.randn(1, requires_grad=True)

lr = 0.1

for epoch in range(100):
    # Forward pass
    y_pred = w * X + b
    loss = torch.mean((y_pred - y)**2)

    # Backward pass
    loss.backward()

    # Update
    with torch.no_grad():
        w -= lr * w.grad
        b -= lr * b.grad

    # Zero grads
    w.grad.zero_()
    b.grad.zero_()

print(f"Learned w: {w.item():.3f}, b: {b.item():.3f}")

Student: Explain b = torch.randn(1, requires_grad=True) word by word. b is a constant, so why are we measuring a gradient with respect to it?

Teacher C:

b = torch.randn(1, requires_grad=True)

Word by word: torch.randn(1) draws one value from a standard normal distribution, so b starts at a random number; requires_grad=True tells autograd to track every operation involving b so that loss.backward() can compute dloss/db, stored in b.grad. So b is not a constant: it is the learnable bias of the model, and its gradient tells us which direction to nudge it to reduce the loss.

Student: Can you explain the two lines under # Zero grads, w.grad.zero_() and b.grad.zero_()? What are they doing?

Each call to loss.backward() adds the freshly computed gradients into w.grad and b.grad; PyTorch never clears them automatically. The two lines

# Zero grads
w.grad.zero_()
b.grad.zero_()

reset those stored gradients to zero in place (the trailing underscore marks an in-place operation), so the next iteration starts from a clean slate. Without them the gradients would accumulate across iterations:
# Suppose gradient from batch 1 is 0.3
# Gradient from batch 2 is 0.2

First backward: w.grad = 0.3
Second backward: w.grad = 0.5  # 0.3 + 0.2 (accumulated!)

The same pattern applies when an optimizer manages the parameters; you create one with, for example:
optimizer = torch.optim.SGD([w, b], lr=0.1)
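and then optimizer.zero_grad() replaces the manual w.grad.zero_() / b.grad.zero_() calls. A minimal sketch of the linear-regression loop from above rewritten this way (same X, y, w, b as before):

optimizer = torch.optim.SGD([w, b], lr=0.1)

for epoch in range(100):
    y_pred = w * X + b
    loss = torch.mean((y_pred - y) ** 2)

    optimizer.zero_grad()   # clear old gradients
    loss.backward()         # compute new gradients
    optimizer.step()        # update w and b using w.grad and b.grad

print(f"Learned w: {w.item():.3f}, b: {b.item():.3f}")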

Logistic Regression (LR)

Logistic regression follows the same recipe as the linear model above, but squashes the linear output through a sigmoid and trains with binary cross-entropy:
# Dummy binary data
X = torch.randn(100, 2)
y = (X[:, 0] + X[:, 1] > 0).float().view(-1, 1)  # label=1 if sum>0

w = torch.randn(2, 1, requires_grad=True)
b = torch.randn(1, requires_grad=True)
lr = 0.1

def sigmoid(z):
    return 1 / (1 + torch.exp(-z))

for epoch in range(200):
    # Forward
    z = X @ w + b
    y_pred = sigmoid(z)
    loss = -torch.mean(y * torch.log(y_pred + 1e-8) + (1-y) * torch.log(1-y_pred + 1e-8))

    # Backward
    loss.backward()

    # Update
    with torch.no_grad():
        w -= lr * w.grad
        b -= lr * b.grad

    w.grad.zero_()
    b.grad.zero_()

print("Final loss:", loss.item())

Explain

y = (X[:, 0] + X[:, 1] > 0).float().view(-1, 1)  # label=1 if sum>0
X[:, 0] + X[:, 1] sums the two features of every row; > 0 turns that sum into a Boolean tensor; .float() converts True/False to 1.0/0.0; and .view(-1, 1) reshapes the result from (N,) to (N, 1) so it lines up with the model output. For example:
import torch

X = torch.tensor([
    [ 2.0,  1.0],   # sum =  3.0 (>0) → 1.0
    [-1.0,  0.5],   # sum = -0.5 (<=0) → 0.0
    [-2.0, -3.0],   # sum = -5.0 (<=0) → 0.0
    [ 0.5,  0.5]    # sum =  1.0 (>0) → 1.0
])

y = (X[:, 0] + X[:, 1] > 0).float().view(-1, 1)
print(y)
Output:

tensor([[1.],
        [0.],
        [0.],
        [1.]])

If you wanted labels of +1 and -1 instead of 1 and 0, you could use torch.where instead:
y = torch.where(X[:, 0] + X[:, 1] > 0, 1.0, -1.0).view(-1, 1)

Student: view(-1, 1) reshapes from shape (N,) to (N, 1) but does not turn zeros into -1. Is shape (N,) a vector and (N, 1) a matrix? What is the difference?

Teacher C: Exactly. A tensor of shape (N,) has one dimension (a plain vector of N numbers), while (N, 1) has two dimensions (a column matrix with N rows and one column):

v = torch.tensor([1., 2., 3.])
print(v.shape)  # torch.Size([3])
image
m = torch.tensor([[1.], [2.], [3.]])
print(m.shape)  # torch.Size([3, 1])
The distinction matters for broadcasting and matrix operations: model outputs like X @ w + b have shape (N, 1), so the labels must be (N, 1) too; an (N,) target would silently broadcast against an (N, 1) prediction into an unintended (N, N) result, as the sketch below shows.
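A quick sketch of that trap (hypothetical tensors, just to show the broadcasting behaviour):

import torch

pred = torch.rand(4, 1)               # shape (4, 1), like X @ w + b
target_flat = torch.rand(4)           # shape (4,)
target_col = target_flat.view(-1, 1)  # shape (4, 1)

print((pred - target_flat).shape)  # torch.Size([4, 4])  <- silent broadcasting bug
print((pred - target_col).shape)   # torch.Size([4, 1])  <- what we actually want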

Student: In z = X @ w + b, what is @?

Teacher C:

The @ operator is matrix multiplication (the same as torch.matmul). Here it multiplies the (N, 2) data matrix X by the (2, 1) weight vector w, giving an (N, 1) result, to which the bias b is then broadcast-added:
z = X @ w + b
For example:
import torch

X = torch.tensor([[1., 2.],
                  [3., 4.],
                  [5., 6.]])   # shape (3, 2)
w = torch.tensor([[0.5],
                  [1.0]])      # shape (2, 1)
b = torch.tensor([0.1])        # shape (1,)

z = X @ w + b
print(z)
Output:

tensor([[2.6000],
        [5.6000],
        [8.6000]])

Student: In loss = -torch.mean(y * torch.log(y_pred + 1e-8) + (1-y) * torch.log(1-y_pred + 1e-8)), what is 1e-8?

Teacher C:

1e-8 is a tiny constant (an epsilon) added inside the logarithms for numerical stability: if y_pred ever reached exactly 0 or 1, log(0) would be -inf and the loss would become NaN, so the epsilon keeps the argument of the log strictly positive.
loss = -torch.mean(y * torch.log(y_pred + 1e-8) + (1-y) * torch.log(1-y_pred + 1e-8))
In practice you would normally use PyTorch's built-in losses, which handle this stability internally:
torch.nn.BCELoss()
# or
torch.nn.BCEWithLogitsLoss()
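As a sketch, the logistic-regression loop above could be rewritten around nn.BCEWithLogitsLoss, which applies the sigmoid and a numerically stable cross-entropy internally, so no manual epsilon is needed (reusing the same X, y, w, b and lr from the example above):

import torch

criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD([w, b], lr=lr)

for epoch in range(200):
    z = X @ w + b            # raw logits, no explicit sigmoid needed
    loss = criterion(z, y)   # sigmoid + binary cross-entropy in one stable step

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print("Final loss:", loss.item())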
