# 01 - PyTorch Fundamentals for Chatbot Development

**Duration:** 1-2 hours | **Difficulty:** Beginner

## 🎯 Learning Objectives
- Understand tensor creation, manipulation, and operations
- Learn automatic differentiation with PyTorch's autograd
- Implement basic gradient descent
- Build a simple linear regression model

## 📚 Contents
1. Introduction to Tensors (creation and operations)
2. Automatic Differentiation (Autograd)
3. Device Management
4. Building Neural Network Models
5. Complete Example: Linear Regression

In [43]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np

# Seed PyTorch's global random number generator so random draws are repeatable.
torch.manual_seed(42)
# Report the installed PyTorch build and whether a CUDA device is visible.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

PyTorch version: 2.8.0+cpu
CUDA available: False


## 1. Introduction to Tensors

Tensors are the fundamental data structure in PyTorch. Think of them as multi-dimensional arrays that can be stored and processed on a GPU as well as on the CPU.

In [44]:
# Creating tensors from Python literals:
# a bare number gives a 0-dimensional tensor, a list gives a 1-D tensor,
# and a list of lists gives a 2-D tensor.
scalar = torch.tensor(3.14)
vector = torch.tensor([1, 2, 3, 4, 5])
matrix = torch.tensor([[1, 2], [3, 4]])

# .shape reports the size along each dimension (empty for the scalar).
print(f"Scalar: {scalar}, shape: {scalar.shape}")
print(f"Vector: {vector}, shape: {vector.shape}")
print(f"Matrix:\n{matrix}\nShape: {matrix.shape}")

# Common tensor creation functions
zeros = torch.zeros(2, 3)  # 2x3 tensor filled with 0.0
random = torch.randn(2, 3)  # Normal distribution
range_tensor = torch.arange(0, 10, 2)  # 0, 2, 4, 6, 8 (end value 10 is excluded)

print(f"\nZeros:\n{zeros}")
print(f"Random:\n{random}")
print(f"Range: {range_tensor}")

Scalar: 3.140000104904175, shape: torch.Size([])
Vector: tensor([1, 2, 3, 4, 5]), shape: torch.Size([5])
Matrix:
tensor([[1, 2],
        [3, 4]])
Shape: torch.Size([2, 2])

Zeros:
tensor([[0., 0., 0.],
        [0., 0., 0.]])
Random:
tensor([[ 0.3367,  0.1288,  0.2345],
        [ 0.2303, -1.1229, -0.1863]])
Range: tensor([0, 2, 4, 6, 8])


In [45]:
# Tensor operations: element-wise arithmetic, matrix products, and broadcasting.
# torch.float is an alias for torch.float32.
a = torch.tensor([[1, 2], [3, 4]], dtype=torch.float)
b = torch.tensor([[5, 6], [7, 8]], dtype=torch.float)

# Same-shaped tensors are combined entry by entry.
elementwise_sum = torch.add(a, b)
print("Element-wise addition:")
print(elementwise_sum)

# A (2, 3) matrix times a (3, 2) matrix yields a (2, 2) matrix.
print("\nMatrix multiplication:")
c = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float)
d = torch.tensor([[7, 8], [9, 10], [11, 12]], dtype=torch.float)
result = c @ d  # same as torch.mm(c, d) for 2-D tensors
print(f"Result shape: {result.shape}")
print(result)

# Broadcasting: the length-3 vector y is added to every row of the (2, 3) tensor x.
x = torch.tensor([[1, 2, 3], [4, 5, 6]])
y = torch.tensor([10, 20, 30])
broadcast_sum = x + y
print(f"\nBroadcasting: {x.shape} + {y.shape} = {broadcast_sum.shape}")
print(broadcast_sum)

Element-wise addition:
tensor([[ 6.,  8.],
        [10., 12.]])

Matrix multiplication:
Result shape: torch.Size([2, 2])
tensor([[ 58.,  64.],
        [139., 154.]])

Broadcasting: torch.Size([2, 3]) + torch.Size([3]) = torch.Size([2, 3])
tensor([[11, 22, 33],
        [14, 25, 36]])


## 2. Automatic Differentiation (Autograd)

Autograd automatically computes gradients, which is essential for training neural networks.

In [46]:
# Basic autograd example
# requires_grad=True asks autograd to record operations on x so a gradient
# with respect to x can be computed later.
x = torch.tensor([2.0], requires_grad=True)
y = x**2 + 3*x + 1  # y = x² + 3x + 1

print(f"x = {x.item()}")
print(f"y = {y.item()}")

# Compute gradient: dy/dx = 2x + 3
# backward() stores dy/dx, evaluated at the current x (2.0), in x.grad.
y.backward()
print(f"dy/dx = {x.grad.item()} (expected: {2*2 + 3})")

# Example with multiple variables
# w and b are tracked by autograd; x is rebound here to a tensor created
# without requires_grad, so it acts as a constant input.
w = torch.tensor([2.0], requires_grad=True)
b = torch.tensor([1.0], requires_grad=True)
x = torch.tensor([3.0])

# One backward() call fills the .grad of every tracked tensor used to build y.
y = w * x + b
y.backward()

print(f"\nLinear function: y = w*x + b")
print(f"dy/dw = {w.grad.item()} (should be x = {x.item()})")
print(f"dy/db = {b.grad.item()} (should be 1)")

x = 2.0
y = 11.0
dy/dx = 7.0 (expected: 7)

Linear function: y = w*x + b
dy/dw = 3.0 (should be x = 3.0)
dy/db = 1.0 (should be 1)


In [47]:
# Gradient descent demonstration
def gradient_descent_demo(learning_rate=0.1, num_iterations=8, start=0.0):
    """Minimise f(x) = (x - 3)² + 1 with plain gradient descent, printing each step.

    The defaults reproduce the original demonstration (8 steps of size 0.1
    starting from x = 0). Raise ``num_iterations`` or ``learning_rate`` to see
    x get closer to the true minimum at x = 3.

    Args:
        learning_rate: Step size multiplied by the gradient at every update.
        num_iterations: Number of gradient steps to take (0 prints only the
            header and the starting point as the final value).
        start: Initial value of x.

    Returns:
        None. A table of iteration, x, f(x) and gradient is printed, followed
        by the final x.
    """
    x = torch.tensor([float(start)], requires_grad=True)

    print("Finding minimum of f(x) = (x-3)² + 1")
    print("Iteration | x value | f(x) | gradient")

    for i in range(num_iterations):
        f_x = (x - 3)**2 + 1

        # backward() accumulates into x.grad, so clear the previous step's
        # gradient first (x.grad is None before the first backward pass).
        if x.grad is not None:
            x.grad.zero_()
        f_x.backward()

        print(f"{i:9d} | {x.item():7.3f} | {f_x.item():6.3f} | {x.grad.item():8.3f}")

        # The parameter update itself must not be recorded by autograd.
        with torch.no_grad():
            x -= learning_rate * x.grad

    print(f"Final x: {x.item():.3f} (target: 3.000)")


gradient_descent_demo()

Finding minimum of f(x) = (x-3)² + 1
Iteration | x value | f(x) | gradient
        0 |   0.000 | 10.000 |   -6.000
        1 |   0.600 |  6.760 |   -4.800
        2 |   1.080 |  4.686 |   -3.840
        3 |   1.464 |  3.359 |   -3.072
        4 |   1.771 |  2.510 |   -2.458
        5 |   2.017 |  1.966 |   -1.966
        6 |   2.214 |  1.618 |   -1.573
        7 |   2.371 |  1.396 |   -1.258
Final x: 2.497 (target: 3.000)


## 3. Device Management

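PyTorch can run computations on the CPU, on an NVIDIA GPU via CUDA, or on Apple-silicon GPUs via the MPS (Metal Performance Shaders) backend. Tensors and models must be placed on the same device before they can be used together.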

In [48]:
# Device detection
def get_device():
    """Return the preferred ``torch.device``: CUDA first, then Apple MPS, then CPU."""
    if torch.cuda.is_available():
        return torch.device("cuda")

    # Guard against builds where torch.backends exposes no `mps` attribute.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")

    return torch.device("cpu")

# Select the device once; later cells reuse this `device` variable.
device = get_device()
print(f"Using device: {device}")

# Moving tensors to device
# torch.randn without a device argument creates the tensor on the CPU;
# .to(device) returns a tensor placed on the selected device.
cpu_tensor = torch.randn(3, 4)
device_tensor = cpu_tensor.to(device)

print(f"CPU tensor device: {cpu_tensor.device}")
print(f"Device tensor device: {device_tensor.device}")

# Create tensor directly on device
# Passing device= places the new tensor on the target device at creation,
# so no separate .to(device) call is needed.
direct_tensor = torch.randn(3, 4, device=device)
print(f"Direct device tensor: {direct_tensor.device}")

Using device: cpu
CPU tensor device: cpu
Device tensor device: cpu
Direct device tensor: cpu


## 4. Building Neural Network Models

Using `nn.Module` to create reusable model components.

In [49]:
class SimpleLinearModel(nn.Module):
    """Single affine layer computing y = Wx + b.

    Args:
        input_size: Number of features in each input row.
        output_size: Number of values produced for each input row.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # Registered as an attribute so its weight and bias appear in
        # model.parameters() under the names "linear.weight" / "linear.bias".
        self.linear = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        prediction = self.linear(x)
        return prediction

# Create and test model
# 2 input features -> 1 output value; the test batch holds two rows.
model = SimpleLinearModel(2, 1)
test_input = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
# Calling the module like a function runs its forward() method.
output = model(test_input)

print(f"Input shape: {test_input.shape}")
print(f"Output shape: {output.shape}")
print(f"Output:\n{output}")

# Examine parameters
# named_parameters() yields (name, tensor) pairs for the learnable weights.
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")

Input shape: torch.Size([2, 2])
Output shape: torch.Size([2, 1])
Output:
tensor([[-0.7027],
        [-1.5789]], grad_fn=<AddmmBackward0>)
linear.weight: torch.Size([1, 2])
linear.bias: torch.Size([1])


## 5. Complete Example: Linear Regression

A full training loop demonstrating the concepts we've learned.

In [53]:
# Generate synthetic data
# Re-seed so the dataset and the model's initial weights are repeatable.
torch.manual_seed(42)
n_samples = 100
true_w, true_b = 2.0, 1.0

# Targets follow y = true_w * X + true_b plus normal noise scaled by 0.1.
X = torch.randn(n_samples, 1)
y = true_w * X + true_b + 0.1 * torch.randn(n_samples, 1)

print(f"Data shape: X={X.shape}, y={y.shape}")
print(f"True parameters: w={true_w}, b={true_b}")

# Create model
# The model and the data are both moved to `device` (selected in the
# device-management cell) so they live on the same device.
model = SimpleLinearModel(1, 1).to(device)
X, y = X.to(device), y.to(device)

# Training setup
# Mean-squared-error loss minimised with plain SGD at learning rate 0.01.
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
epochs = 100

# Training loop
# Every epoch uses the full dataset (no mini-batches).
losses = []
for epoch in range(epochs):
    # Forward pass
    predictions = model(X)
    loss = criterion(predictions, y)
    
    # Backward pass
    # Clear old gradients, compute new ones, then update the parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # Store the loss as a plain Python float for plotting later.
    losses.append(loss.item())
    
    if epoch % 20 == 0:
        print(f"Epoch {epoch:3d}, Loss: {loss.item():.4f}")

# Final parameters
# .item() works here because the 1-in/1-out layer holds exactly one weight
# and one bias value.
learned_w = model.linear.weight.item()
learned_b = model.linear.bias.item()
print(f"\nLearned parameters: w={learned_w:.3f}, b={learned_b:.3f}")
print(f"True parameters:    w={true_w:.3f}, b={true_b:.3f}")

Data shape: X=torch.Size([100, 1]), y=torch.Size([100, 1])
True parameters: w=2.0, b=1.0
Epoch   0, Loss: 2.3003
Epoch  20, Loss: 1.0446
Epoch  40, Loss: 0.4779
Epoch  60, Loss: 0.2215
Epoch  80, Loss: 0.1052

Learned parameters: w=1.786, b=1.007
True parameters:    w=2.000, b=1.000


In [None]:
# Visualize results: loss curve on the left, data and fitted line on the right.
# Relies on `losses`, `X`, `y`, `model`, `device`, `true_w`, `true_b`,
# `learned_w` and `learned_b` from the training cell.
plt.figure(figsize=(12, 4))

# Plot loss curve
plt.subplot(1, 2, 1)
plt.plot(losses)
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.grid(True)

# Plot data and learned line
plt.subplot(1, 2, 2)
# Bring the tensors back to the CPU before handing them to matplotlib.
X_cpu, y_cpu = X.cpu(), y.cpu()
plt.scatter(X_cpu.numpy(), y_cpu.numpy(), alpha=0.6, label='Data')

# Plot true and learned lines
# Pass plain Python floats as the linspace bounds rather than 0-dim tensors.
x_line = torch.linspace(X_cpu.min().item(), X_cpu.max().item(), 100).unsqueeze(1)
with torch.no_grad():  # inference only, no gradient tracking needed
    y_true = true_w * x_line + true_b
    y_pred = model(x_line.to(device)).cpu()

# Convert explicitly to NumPy, matching the scatter call above, instead of
# relying on matplotlib's implicit tensor-to-array conversion.
x_line_np = x_line.numpy()
plt.plot(x_line_np, y_true.numpy(), 'r-', label=f'True: y={true_w}x+{true_b}')
plt.plot(x_line_np, y_pred.numpy(), 'g--', label=f'Learned: y={learned_w:.2f}x+{learned_b:.2f}')
plt.xlabel('X')
plt.ylabel('y')
plt.title('Linear Regression Results')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

## 🎉 Congratulations!

You've learned the fundamental PyTorch concepts for chatbot development:

✅ **Tensors**: Multi-dimensional arrays for data representation  
✅ **Operations**: Mathematical operations and broadcasting  
✅ **Autograd**: Automatic differentiation for gradient computation  
✅ **Device Management**: CPU/GPU computation  
✅ **Neural Networks**: Building models with `nn.Module`  
✅ **Training Loop**: Complete learning process  

## 🚀 Next Steps

In the next notebook, we'll explore advanced tensor operations specifically for text processing, including:
- Text tokenization with tensors
- Batch processing techniques
- Memory-efficient operations

**Ready to continue?** Move on to [`02_tensor_operations.ipynb`](02_tensor_operations.ipynb)!