# Install PyTorch and Check Dependencies

In [None]:
!pip install torch torchvision torchaudio

Defaulting to user installation because normal site-packages is not writeable
Collecting torch
  Downloading torch-2.8.0-cp313-cp313-win_amd64.whl.metadata (30 kB)
Collecting torchvision
  Downloading torchvision-0.23.0-cp313-cp313-win_amd64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading torchaudio-2.8.0-cp313-cp313-win_amd64.whl.metadata (7.2 kB)
Collecting filelock (from torch)
  Downloading filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting typing-extensions>=4.10.0 (from torch)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.7.0-py3-none-any.whl.metadata (12 kB)
Collecting setuptools (from torch)


In [2]:
# Confirm the install: show the PyTorch build and whether a CUDA device is usable.
import torch
print(torch.__version__)
print("CUDA available:", torch.cuda.is_available())

2.8.0+cpu
CUDA available: False


# Tensor

Tensors are a specialised data structure that is very similar to arrays and matrices. In PyTorch, we use tensors to encode the inputs and outputs of a model, as well as the model's parameters.  
***A Tensor is a numerical container of arbitrary dimensions, and it is the core data structure that PyTorch operates on.***

In [3]:
import torch
import numpy as np

## Tensor Initialisation

In [5]:
# Nested Python list (2 rows x 2 columns) used as seed data for the tensors created below.
data = [
    [1, 2],
    [3, 4],
]


**Directly from data**:  
Tensors can be created directly from data. The data type is automatically inferred.

In [12]:
# Build a tensor straight from the nested list; the element dtype is inferred from the values.
x_data = torch.tensor(data)

# Show the container type before and after the conversion, with the tensor in between.
print(type(data))
print(x_data)
print(type(x_data))

<class 'list'>
tensor([[1, 2],
        [3, 4]])
<class 'torch.Tensor'>


**From a NumPy Array**  
Tensors can be created from NumPy arrays (and vice versa).

In [13]:
# Go list -> ndarray -> tensor.
np_array = np.array(data)
x_np = torch.from_numpy(np_array)

# Show the intermediate ndarray (type and values) and then the resulting tensor.
print(type(np_array))
print(np_array)
print(x_np)

<class 'numpy.ndarray'>
[[1 2]
 [3 4]]
tensor([[1, 2],
        [3, 4]])


**From Another Tensor**  
The new tensor retains the properties (shape, datatype) of the argument tensor, unless explicitly overridden.

In [None]:
# ones_like keeps x_data's shape and dtype and fills every element with 1.
x_ones = torch.ones_like(x_data)
# rand_like draws values in [0, 1); the dtype is overridden to float because
# x_data holds integers.
x_rand = torch.rand_like(x_data, dtype=torch.float)

print(f"Ones Tensors: {x_ones}")
print(f"Random Tensors: {x_rand}")

Ones Tensors: tensor([[1, 1],
        [1, 1]])
Random Tensors: tensor([[0.9510, 0.7545],
        [0.5291, 0.2837]])


**with Random or Constant Values**  
```shape``` is a tuple of tensor dimensions. In the functions below, it determines the dimensionality of the output tensor.

In [18]:
# Target dimensions: 2 rows x 3 columns.
shape = (2, 3)

# Same shape, three different fills: uniform random in [0, 1), all ones, all zeros.
rand_tensor = torch.rand(shape)
ones_tensor = torch.ones(shape)
zeros_tensor = torch.zeros(shape)

print(f"Random Tensor: {rand_tensor}")
print(f"Ones Tensor: {ones_tensor}")
print(f"Zeros Tensor: {zeros_tensor}")

Random Tensor: tensor([[0.3459, 0.3142, 0.5922],
        [0.7493, 0.2923, 0.6013]])
Ones Tensor: tensor([[1., 1., 1.],
        [1., 1., 1.]])
Zeros Tensor: tensor([[0., 0., 0.],
        [0., 0., 0.]])


## Tensor Attributes

Tensor attributes describe their shape, datatype, and the device on which they are stored.

In [48]:
# A fresh 2x3 random tensor to inspect.
tensor = torch.rand(2, 3)
print(tensor)

# The three descriptive attributes: dimensions, element type, and where the data lives.
print(f"Shape of Tensor: {tensor.shape}")
print(f"Datatype of Tensor: {tensor.dtype}")
print(f"Device tensor is stored on: {tensor.device}")

tensor([[0.4991, 0.1349, 0.9505],
        [0.4508, 0.6830, 0.3670]])
Shape of Tensor: torch.Size([2, 3])
Datatype of Tensor: torch.float32
Device tensor is stored on: cpu


## Tensor Operations

Each of these operations can be run on the GPU.

In [55]:
# Demonstrate moving a tensor between devices. When a CUDA GPU is present, the tensor is
# copied to it and its device is reported while it is there; it is then copied back so
# the tensor ends this cell on the CPU, exactly as it does when no GPU is available.
if torch.cuda.is_available():
    tensor = tensor.to("cuda")
    # Report the device before the copy back; otherwise the GPU placement is never shown.
    print(f"Device tensor is stored on: {tensor.device}")
    tensor = tensor.to("cpu")
print(f"Device tensor is stored on: {tensor.device}")

Device tensor is stored on: cpu


**Standard Numpy-like Indexing and Slicing**  

In [62]:
# Start from a 4x4 tensor of ones.
tensor = torch.ones(4, 4)
print(tensor)

# Slice assignment: every row, column index 1 -> 0 (modifies `tensor` in place).
tensor[:, 1] = 0
print(tensor)

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])
tensor([[1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.]])


**Joining Tensors**  
You can use torch.cat to concatenate a sequence of tensors along a given dimension.  
*NOTE*
1. The dimension you choose can have different lengths, because that is the one we are extending.
2. All the other dimensions must be the same; otherwise, it is like trying to stack Lego blocks of different sizes and they will not fit.  

**torch.cat() VS torch.stack()** (here `a = [[1,2,3]]` and `b = [[4,5,6]]`, each of shape (1,3))    
|Operation|Result|Shape|Characteristics|
|---------|------|-----|---------------|
|torch.cat([a,b], dim = 0)|[[1,2,3],[4,5,6]]|(2,3)|Concatenates along rows -> stacked vertically|
|torch.cat([a,b], dim=1)|[[1,2,3,4,5,6]]|(1,6)|Concatenates along columns -> stacked horizontally|
|torch.stack([a,b], dim=0)|[[[1,2,3]],[[4,5,6]]]|(2,1,3)|Creates a new dimension at the front -> 3D tensor|
|torch.stack([a,b], dim=1)|[[[1,2,3],[4,5,6]]]|(1,2,3)|Creates a new dimension in the middle -> 1 block of shape $(2 \times 3)$|

In [67]:
# Two tensors with the same number of rows (2) but different widths (3 and 4).
tensor1 = torch.ones(2, 3)
tensor2 = torch.zeros(2, 4)

# Joining along dim 1 places them side by side, giving a 2x7 result.
tensor_cat = torch.cat([tensor1, tensor2], dim=1)
print(tensor_cat)

tensor([[1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0.]])


**Multiplying Tensors**  
1. Element-wise Product: tensor1 * tensor2, tensor1.mul(tensor2)
2. Matrix multiplication: tensor1.matmul(tensor2), tensor1 @ tensor2

In [None]:
# Element-wise product: each entry is multiplied by the entry at the same position.
tensor1 = torch.tensor([[1, 2], [1, 2]])
tensor2 = torch.tensor([[1, 2], [3, 4]])

tensor3 = tensor1 * tensor2            # operator form
tensor4 = torch.mul(tensor1, tensor2)  # function form, same result

print(tensor3)
print(tensor4)


tensor([[1, 4],
        [3, 8]])
tensor([[1, 4],
        [3, 8]])


In [78]:
# Matrix multiplication: rows of the left operand combined with columns of the right.
tensor1 = torch.tensor([[1, 2], [1, 2]])
tensor2 = torch.tensor([[1, 2], [3, 4]])

tensor3 = tensor1 @ tensor2         # operator form
tensor4 = tensor1.matmul(tensor2)   # method form, same result

print(tensor3)
print(tensor4)

tensor([[ 7, 10],
        [ 7, 10]])
tensor([[ 7, 10],
        [ 7, 10]])


**In-place Operations**  
Operations that have a ```_``` suffix are in-place, e.g. ```x.copy_(y)```, ```x.t_()```, will change ```x```.  
*NOTE*  
In-place operations save some memory, but can be problematic when computing derivatives because of an immediate loss of history. Hence, their use is discouraged.

In [87]:
tensor = torch.rand(5, 5)
print(tensor)

# add_ updates `tensor` in place; tensor_add5 refers to that same updated data,
# it is not an independent copy.
tensor_add5 = tensor.add_(5)
print(tensor_add5)

tensor([[0.6919, 0.7100, 0.2123, 0.8367, 0.9037],
        [0.2508, 0.6395, 0.2129, 0.0154, 0.5361],
        [0.5801, 0.8836, 0.8760, 0.1594, 0.7700],
        [0.7479, 0.5176, 0.7220, 0.5189, 0.3494],
        [0.8948, 0.6106, 0.3327, 0.3970, 0.7814]])
tensor([[5.6919, 5.7100, 5.2123, 5.8367, 5.9037],
        [5.2508, 5.6395, 5.2129, 5.0154, 5.5361],
        [5.5801, 5.8836, 5.8760, 5.1594, 5.7700],
        [5.7479, 5.5176, 5.7220, 5.5189, 5.3494],
        [5.8948, 5.6106, 5.3327, 5.3970, 5.7814]])


In [None]:
# t_() transposes in place: `tensor` itself now holds the transposed values.
tensor.t_()
print(tensor)


tensor([[5.6919, 5.2508, 5.5801, 5.7479, 5.8948],
        [5.7100, 5.6395, 5.8836, 5.5176, 5.6106],
        [5.2123, 5.2129, 5.8760, 5.7220, 5.3327],
        [5.8367, 5.0154, 5.1594, 5.5189, 5.3970],
        [5.9037, 5.5361, 5.7700, 5.3494, 5.7814]])


In [94]:
tensor_new = torch.rand(5, 5)
# copy_ writes tensor_new's values into the existing `tensor` in place, so `tensor`
# keeps its own shape (both are 5x5 here).
tensor.copy_(tensor_new)

tensor([[0.0101, 0.9939, 0.5913, 0.6275, 0.1166],
        [0.1435, 0.4948, 0.6813, 0.6999, 0.7142],
        [0.2320, 0.6258, 0.4327, 0.4303, 0.9632],
        [0.8310, 0.0059, 0.4553, 0.0558, 0.1104],
        [0.5218, 0.7494, 0.9381, 0.4372, 0.4053]])

## Bridge with Numpy

Tensors on the CPU and NumPy arrays can share their underlying memory locations, and changing one will change the other.

### Tensor to Numpy

In [100]:
# A 1-D tensor of five ones ...
tensor1 = torch.ones(5)
print(tensor1)

# ... and its NumPy counterpart obtained with .numpy().
n_array = tensor1.numpy()
print(n_array)

tensor([1., 1., 1., 1., 1.])
[1. 1. 1. 1. 1.]


In [101]:
# add_ mutates tensor1 in place; n_array was obtained from tensor1.numpy(), and printing
# it shows the same updated values (see the recorded output).
tensor1.add_(1)
print(tensor1)
print(n_array)

tensor([2., 2., 2., 2., 2.])
[2. 2. 2. 2. 2.]


In [102]:
# Write the sum back into n_array itself (out=...) instead of creating a new array,
# then print both objects to show the tensor follows the change.
np.add(n_array, 1, out=n_array)
print(n_array)
print(tensor1)

[3. 3. 3. 3. 3.]
tensor([3., 3., 3., 3., 3.])


### Numpy Array to Tensor

In [103]:
# Start from a NumPy array of five ones (float64) ...
n_array = np.ones(5)
# ... and wrap it as a tensor; the float64 dtype carries over.
tensor1 = torch.from_numpy(n_array)

print(n_array)
print(tensor1)

[1. 1. 1. 1. 1.]
tensor([1., 1., 1., 1., 1.], dtype=torch.float64)


In [104]:
# In-place NumPy update (out=n_array); tensor1 was built from this array with
# torch.from_numpy, so it is printed as well to show it follows the change.
np.add(n_array, 1, out=n_array)
print(n_array)
print(tensor1)

[2. 2. 2. 2. 2.]
tensor([2., 2., 2., 2., 2.], dtype=torch.float64)


In [105]:
# Mutating the tensor in place is reflected in n_array as well: tensor1 was created
# from that array with torch.from_numpy.
tensor1.add_(2)
print(n_array)
print(tensor1)

[4. 4. 4. 4. 4.]
tensor([4., 4., 4., 4., 4.], dtype=torch.float64)


# Autograd

```torch.autograd``` is PyTorch's automatic differentiation engine that powers neural network training.

**Background**  
Neural Networks are a collection of nested functions that are executed on some input data. These functions are defined by parameters (consisting of weights and biases), which in PyTorch are stored in tensors.  
Training a neural network happens in two steps:
1. **Forward Propagation**: In forward propagation, the NN makes its best guess about the correct output. It runs the input data through each of its functions to make this guess.
2. **Backward Propagation**: In back propagation, the NN adjusts its parameters proportionate to the errors in its guess. It does this by traversing backwards from the output, collecting the derivatives of the error with respect to the parameters of functions (gradients), and optimising the parameters using gradient descent.

## Example

For example, we load a pretrained resnet18 model from ```torchvision```. We create a random data tensor to represent a single image with 3 channels, and height & width of 64, and its corresponding ```label``` initialised to some random values. Labels in pretrained models have shape (1,1000).

In [None]:
import torch
from torchvision.models import ResNet18_Weights, resnet18

# Pretrained ResNet-18; the weights are downloaded on first use.
model = resnet18(weights=ResNet18_Weights.DEFAULT)

# One random "image" (batch of 1, 3 channels, 64x64) and a random label of shape (1, 1000).
# NOTE: this rebinds `data`, which held a plain Python list earlier in the notebook.
data = torch.rand(1, 3, 64, 64)
labels = torch.rand(1, 1000)

3.6%

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to C:\Users\junqi.wu/.cache\torch\hub\checkpoints\resnet18-f37072fd.pth


100.0%


Next, we run the input data through the model, passing it through each of its layers to make a prediction. This is the forward pass.

In [None]:
# Forward pass: run the input through the model to obtain its prediction.
prediction = model(data)

Then, we use the model's prediction and the corresponding label to calculate the error (loss). The next step is to backpropagate this error through the network. Backward propagation is kicked off when we call ```.backward()``` on the error tensor. Autograd then calculates and stores the gradients for each model parameter in the parameter's ```.grad``` attribute.

In [109]:
# Scalar error: the summed difference between the prediction and the labels.
loss = (prediction - labels).sum()
# Backward pass: autograd stores each parameter's gradient in its .grad attribute.
loss.backward()

Next, we load an optimiser, in this case SGD with a learning rate of 0.01 and momentum of 0.9. We register all the parameters of the model in the optimiser.

In [110]:
# Register every model parameter with an SGD optimiser (learning rate 0.01, momentum 0.9).
optim = torch.optim.SGD(
    model.parameters(),
    lr=1e-2,
    momentum=0.9,
)

Finally, we call ```.step()``` to initiate gradient descent. The optimiser adjusts each parameter by its gradient stored in ```.grad```.

In [112]:
# Apply one gradient-descent update using the gradients stored in each parameter's .grad.
optim.step()

In [116]:
# Leaf tensors that autograd should track.
a = torch.tensor([2.0, 3.0], requires_grad=True)
b = torch.tensor([6.0, 4.0], requires_grad=True)

# Q = 3a^3 - b^2, evaluated element-wise.
Q = 3 * a**3 - b**2
print(Q)

# Q is a vector, so backward() is given an explicit gradient of the same shape.
external_grad = torch.tensor([1.0, 1.0])
Q.backward(gradient=external_grad)

print(a.grad)
print(b.grad)

# Compare with the analytic derivatives dQ/da = 9a^2 and dQ/db = -2b.
print(9 * a**2 == a.grad)
print(-2 * b == b.grad)

tensor([-12.,  65.], grad_fn=<SubBackward0>)
tensor([36., 81.])
tensor([-12.,  -8.])
tensor([True, True])
tensor([True, True])
