In [2]:
import torch
import time
# Import PyTorch's neural network module
import torch.nn as nn
import numpy as np
device= 'cuda' if torch.cuda.is_available() else 'cpu'
device


'cuda'

## GPU vs CPU parallel computing
#### Torch uses GPU, whereas numpy uses CPU. lets compare the speed for a 4 dimensional 

In [3]:
torch_rand = torch.rand(100, 100,100,100).to(device)
tord_rand_2 = torch.rand(100, 100,100,100).to(device)
np_rand = np.random.rand(100, 100,100,100)
np_rand_2 = np.random.rand(100, 100,100,100)

start_time = time.time()

rand = torch_rand @ tord_rand_2
end_time = time.time()
elapsed_time = end_time - start_time
print(f"torch time: {elapsed_time} seconds")

start_time = time.time()
rand=np.multiply(np_rand, np_rand_2)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"numpy time: {elapsed_time} seconds")










torch time: 0.03594350814819336 seconds
numpy time: 0.18724703788757324 seconds


In [4]:
%%time
start_time= time.time()
zeros=torch.zeros(1,1)
end_time =  time.time()
elapsed_time = end_time - start_time
print(f"torch time: {elapsed_time} seconds")    



torch time: 0.001001596450805664 seconds
CPU times: total: 0 ns
Wall time: 1 ms


## Creating tensor of randomly sampled indices based on probability

#### Eg: index 0 has a 10%, index 1 has 20%, etc.

In [5]:
# This line creates a PyTorch tensor containing probabilities that sum to 1.0
probabilities = torch.tensor([0.1, 0.2, 0.3, 0.4])
# torch.multinomial performs weighted random sampling from the input tensor (probabilities)
# - probabilities: input tensor containing probabilities for each element
# - num_samples: number of samples to draw (10 in this case)
# - replacement=True: allows sampling the same index multiple times
#
# For example, with probabilities [0.1, 0.2, 0.3, 0.4]:
# - Index 0 (0.1) has 10% chance of being selected
# - Index 1 (0.2) has 20% chance of being selected
# - Index 2 (0.3) has 30% chance of being selected  
# - Index 3 (0.4) has 40% chance of being selected

samples = torch.multinomial(probabilities, num_samples=10, replacement=True)
samples  # Returns tensor of 10 randomly sampled indices based on the probabilities


tensor([1, 1, 1, 0, 3, 0, 3, 2, 1, 2])

## Concatenating tensors

In [6]:
tensor = torch.tensor([1,2,3,4,5])
out = torch.cat((tensor, tensor), dim=0)
out


tensor([1, 2, 3, 4, 5, 1, 2, 3, 4, 5])

## Lower Upper triangular matrices

In [7]:
# Create a sample matrix
matrix = torch.tensor([[1, 2, 3], 
                      [4, 5, 6],
                      [7, 8, 9]])

# Get lower triangular part using tril()
lower_triangular = torch.tril(matrix)
print("Lower triangular matrix:")
print(lower_triangular)

# Get upper triangular part using triu()
upper_triangular = torch.triu(matrix)
print("Upper triangular matrix:")
print(upper_triangular)


Lower triangular matrix:
tensor([[1, 0, 0],
        [4, 5, 0],
        [7, 8, 9]])
Upper triangular matrix:
tensor([[1, 2, 3],
        [0, 5, 6],
        [0, 0, 9]])


## Attention masks used in attention mechanisms
- Here we create a boolean mask to fill  the upper triangular (including diagonal) with -infinity

- Lower keeps 0

In [8]:

# 1.create a 5x5 matrix of zeros
base_matrix = torch.zeros(5,5)

# 2. Create a 5x5 matrix of ones and get its lower triangular part
ones_matrix = torch.ones(5,5)
lower_tri_mask = torch.tril(ones_matrix)
# This creates a mask where:
# - Lower triangle (including diagonal) = 1
# - Upper triangle = 0

# 3. Create a boolean mask where True is for positions we want to fill
mask = (lower_tri_mask == 0)
# This makes the upper triangle True and lower triangle False

# 4. Fill masked positions (upper triangle) with negative infinity
out = base_matrix.masked_fill(mask, float('-inf'))
# This means:
# - Lower triangle keeps its zeros
# - Upper triangle becomes negative infinity

print(out)
# The result is a 5x5 matrix where:
# - Lower triangle (including diagonal) contains zeros
# - Upper triangle contains negative infinity
# This is commonly used in attention mechanisms to create attention masks


tensor([[0., -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0.]])


## Exponantials to the matrix

In [9]:
# Apply exponential function to the matrix
# This transforms:
# - zeros in lower triangle to ones (e^0 = 1)
# - negative infinity in upper triangle to zeros (e^-inf ≈ 0)
result = torch.exp(out)
print(result)



tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])


## Transposing

In [10]:
input = torch.zeros(2,3,4)
out=input.transpose(1,2)
print(out.shape)



torch.Size([2, 4, 3])


## Stacking tensors, stacking dimensions
- Appearantly tensors use 0-based indexing. 
- So this way the final tensor is a 2D tensor
- dim=0 stacks vertically, each tensor becomes a row. dim=1 stacks horizontally

In [11]:
# Create three 1D tensors
tensor1 = torch.tensor([1,2,3])      # Shape: (3,)
tensor2 = torch.tensor([4,5,6])      # Shape: (3,)
tensor3 = torch.tensor([7,8,9])      # Shape: (3,)

# Stack the tensors along dimension 0 (creating a new first dimension)
# This combines the tensors into a single 2D tensor
# Shape becomes: (3, 3) - 3 rows of 3 elements each
stacked_tensor = torch.stack((tensor1, tensor2, tensor3), dim=0)

# Display the resulting 2D tensor
# Will show:
# [[1, 2, 3],
#  [4, 5, 6], 
#  [7, 8, 9]]
stacked_tensor


tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])

## Linear combinations with random weights
- First create a linear transformation layer of 3 inputs and 3 outputs
- bias means no constant term
#### $\begin{bmatrix} 10 & 10 & 10 \end{bmatrix} \times \begin{bmatrix} w_{11} & w_{12} & w_{13} \\ w_{21} & w_{22} & w_{23} \\ w_{31} & w_{32} & w_{33} \end{bmatrix} = \begin{bmatrix} output_1 & output_2 & output_3 \end{bmatrix}$
##### Here each input is multiplied by a weight. All the products are then added together. The weights determine how much each input contributes to each output

- Fundamental component of neural networks
- Dimensions currently 3 but we can change to (3,5) to transform 3 features to 5
- Helps learn complex patterns through multiple layers
- Weights can be learned and optimise to minimise prediction errors


In [12]:


# Create a sample input tensor with 3 values of 10.0
sample = torch.tensor([10.,10.,10.])  # Input: [10, 10, 10]

            # Output: [w11*10 + w12*10 + w13*10,
            #          w21*10 + w22*10 + w23*10,
            #          w31*10 + w32*10 + w33*10]


# Create a linear layer that:
# - Takes 3 input features
# - Outputs 3 features 
linear = nn.Linear(3, 3, bias=False)

# Pass the sample through the linear layer
# This performs the operation: output = input * weights
# Where weights is a 3x3 matrix initialized randomly
out = linear(sample)

# Print the output
# This will show 3 values that are linear combinations 
# of the input [10,10,10] with the random weights
print(out)






tensor([-5.6242,  7.4097, -3.5206], grad_fn=<SqueezeBackward4>)


## Softmax Function
- Takes numbers and converts them to probabilities that sum to 1
- Makes larger values more significant
- Helps with gradient flow in nn
- e^x ensures exponential growth in values
- Smooth, differentiable

In [13]:
import torch.nn.functional as F

# Create a sample tensor of scores/logits
sample = torch.tensor([1., 2., 3., 4., 5.])

# Step 1: Calculate e^x for each value
exp_values = torch.exp(sample)
print("e^x values:")
print(exp_values)  # [e^1, e^2, e^3, e^4, e^5]

# Step 2: Calculate sum of all e^x values
sum_exp = exp_values.sum()
print("\nSum of e^x values:", sum_exp)

# Step 3: Divide each e^x by the sum to get probabilities
softmax = exp_values / sum_exp
print("\nSoftmax probabilities (e^x / sum(e^x)):")
print(softmax)

# Verify the calculation matches torch's implementation
torch_softmax = F.softmax(sample, dim=0)
print("\nTorch's softmax implementation:")
print(torch_softmax)

# Verify probabilities sum to 1
print("\nSum of probabilities:", softmax.sum().item())

# The larger the input value, the larger the output probability
# This shows how softmax amplifies differences between values
print("\nOriginal values:", sample)
print("Final probabilities:", softmax)


e^x values:
tensor([  2.7183,   7.3891,  20.0855,  54.5981, 148.4132])

Sum of e^x values: tensor(233.2042)

Softmax probabilities (e^x / sum(e^x)):
tensor([0.0117, 0.0317, 0.0861, 0.2341, 0.6364])

Torch's softmax implementation:
tensor([0.0117, 0.0317, 0.0861, 0.2341, 0.6364])

Sum of probabilities: 1.0

Original values: tensor([1., 2., 3., 4., 5.])
Final probabilities: tensor([0.0117, 0.0317, 0.0861, 0.2341, 0.6364])


## Embedding vectors

In [14]:
#initialise embedding layer
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3)

#create a sample tensor of indices
indices = torch.tensor([1,2,3,4])

#pass indices to embedding layer
embedded_output = embedding(indices)

#print embedded output
print(embedded_output)




tensor([[-0.8127,  0.3105, -0.7489],
        [ 0.1444,  0.7144,  0.2271],
        [-1.0443, -0.6564,  0.3397],
        [-0.4447, -0.2291,  0.4641]], grad_fn=<EmbeddingBackward0>)


## Matrix dot product

In [15]:
a = torch.tensor([[1,2],[3,4], [5,6]])
b = torch.tensor([[7,8,9], [10,11,12]])

res = torch.matmul(a,b)
res


tensor([[ 27,  30,  33],
        [ 61,  68,  75],
        [ 95, 106, 117]])

## Multiplying int and float don't work

In [16]:
int_64 = torch.randint(1,(3,2)).float()
float_32 = torch.rand(2,3)
result = torch.matmul(int_64, float_32)

print('hello')

hello


res = torch.