In [1]:
import torch
import torch.nn as nn

### Let's work out some torch tensor basics

First create some values that are easy to see in multiple dimensions

In [2]:
# 1-D tensor holding the integers 11 through 39 (the end value 40 is exclusive)
mk = torch.arange(start=11, end=40)

mk

tensor([11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
        29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39])

create a 2D stack whose rows are the 10's, 20's and 30's (we skip 20 and 30 so that every row has exactly 9 values)

In [3]:
# Stack three 9-element slices of mk as rows. Starting the slices at
# 0, 10 and 20 leaves out the values 20 and 30.
ex = torch.vstack([mk[first:first + 9] for first in (0, 10, 20)])

# Show the shape, the 2-D tensor, and the same data flattened through view
ex.shape, ex, ex.view(-1)

(torch.Size([3, 9]),
 tensor([[11, 12, 13, 14, 15, 16, 17, 18, 19],
         [21, 22, 23, 24, 25, 26, 27, 28, 29],
         [31, 32, 33, 34, 35, 36, 37, 38, 39]]),
 tensor([11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29,
         31, 32, 33, 34, 35, 36, 37, 38, 39]))

So we were able to stack splits of the original tensor and then flatten that out using view

What happens if we take the Transpose?

In [46]:
# .T swaps the two dimensions of the 2-D tensor ex
ex_t = ex.T
# One print call; sep="\n" puts the shape and the tensor on separate lines
print(ex_t.shape, ex_t, sep="\n")

torch.Size([9, 3])
tensor([[11, 21, 31],
        [12, 22, 32],
        [13, 23, 33],
        [14, 24, 34],
        [15, 25, 35],
        [16, 26, 36],
        [17, 27, 37],
        [18, 28, 38],
        [19, 29, 39]])


In [61]:
# Regroup the transposed tensor into three blocks of shape (3, 3)
tbt = ex_t.view(3, 3, 3)
first_block, second_block = tbt[0], tbt[1]
print(first_block)
print(second_block)

# Join the first two blocks row-wise (along dimension 0)
cat = torch.cat((first_block, second_block), dim=0)
print("torch.cat or those:", cat)
# Overwrite one element of the concatenated result, then compare with tbt
cat[0, 0] = 1
print("cat after changing element [0][0]:", cat)
print("the orginal tensors first row is unchanged", tbt[0, 0])

tensor([[11, 21, 31],
        [12, 22, 32],
        [13, 23, 33]])
tensor([[14, 24, 34],
        [15, 25, 35],
        [16, 26, 36]])
torch.cat or those: tensor([[11, 21, 31],
        [12, 22, 32],
        [13, 23, 33],
        [14, 24, 34],
        [15, 25, 35],
        [16, 26, 36]])
cat after changing element [0][0]: tensor([[ 1, 21, 31],
        [12, 22, 32],
        [13, 23, 33],
        [14, 24, 34],
        [15, 25, 35],
        [16, 26, 36]])
the orginal tensors first row is unchanged tensor([11, 21, 31])


### Limits on view

the tensor is created with an underlying representation of given size and stride

In [62]:
# Try to flatten the transposed tensor with view and show the error text.
# Only RuntimeError is caught (the error type torch raises for an
# incompatible view); anything else, such as a NameError from a stale
# kernel, should surface instead of being printed as the expected message.
try:
    ex.T.view(-1)
except RuntimeError as err:
    print(err)

view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.


What happened there?

It's not inherent to the shape of the tensor - if we create a [9, 3] tensor directly, we can view it fine

In [5]:
# Build a [9, 3] tensor directly: nine 3-element slices of mk, starting at
# 0, 3, 6, then 10, 13, 16, then 20, 23, 26.
ex_v = torch.vstack(
    [
        mk[base + offset:base + offset + 3]
        for base in (0, 10, 20)
        for offset in (0, 3, 6)
    ]
)

# Show the shape, the 2-D tensor, and the same data flattened through view
ex_v.shape, ex_v, ex_v.view(-1)

(torch.Size([9, 3]),
 tensor([[11, 12, 13],
         [14, 15, 16],
         [17, 18, 19],
         [21, 22, 23],
         [24, 25, 26],
         [27, 28, 29],
         [31, 32, 33],
         [34, 35, 36],
         [37, 38, 39]]),
 tensor([11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29,
         31, 32, 33, 34, 35, 36, 37, 38, 39]))

In [6]:
# Show the transpose of ex_v, then try to flatten it with view.
print(ex_v.T)
# Only RuntimeError is caught (the error type torch raises for an
# incompatible view); any other exception should surface rather than be
# printed as if it were the expected message.
try:
    ex_v.T.view(-1)
except RuntimeError as err:
    print(err)

tensor([[11, 14, 17, 21, 24, 27, 31, 34, 37],
        [12, 15, 18, 22, 25, 28, 32, 35, 38],
        [13, 16, 19, 23, 26, 29, 33, 36, 39]])
view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.


again something's going on underneath that doesn't relate to the tensor directly but to some underlying representation

(as an aside, using the torch.transpose operation gives the same result as using the .T attribute on the tensor)

In [7]:
# Swap dimensions 0 and 1 through the functional API
op_t = torch.transpose(ex_v, dim0=0, dim1=1)
print(op_t)
# Element-wise comparison against the .T attribute
print(torch.eq(op_t, ex_v.T))

tensor([[11, 14, 17, 21, 24, 27, 31, 34, 37],
        [12, 15, 18, 22, 25, 28, 32, 35, 38],
        [13, 16, 19, 23, 26, 29, 33, 36, 39]])
tensor([[True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True]])


In [8]:
# Shapes of the stacked tensor and of the transposed [9, 3] tensor
ex.size(), ex_v.T.size()

(torch.Size([3, 9]), torch.Size([3, 9]))

### We can reshape stuff

using reshape allows us to change the underlying representation 

this allows us to flatten the transposed tensor after all: the reshaped result can be passed to view again

In [9]:
print(op_t.shape)
# reshape succeeds where view on op_t fails
ex_v_re = torch.reshape(op_t, (9, 3))
print(ex_v_re)
# The reshaped tensor can be flattened with view
ex_v_re.view(-1)

torch.Size([3, 9])
tensor([[11, 14, 17],
        [21, 24, 27],
        [31, 34, 37],
        [12, 15, 18],
        [22, 25, 28],
        [32, 35, 38],
        [13, 16, 19],
        [23, 26, 29],
        [33, 36, 39]])


tensor([11, 14, 17, 21, 24, 27, 31, 34, 37, 12, 15, 18, 22, 25, 28, 32, 35, 38,
        13, 16, 19, 23, 26, 29, 33, 36, 39])

just to keep track here's what happens if we don't reshape

In [10]:
# Flattening op_t directly with view (no reshape first) fails.
# Only RuntimeError is caught (the error type torch raises for an
# incompatible view); any other exception should surface rather than be
# printed as if it were the expected message.
try:
    op_t.view(-1)
except RuntimeError as err:
    print(err)

view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.


### The underlying representation is only changed if necessary

reshape doesn't necessarily change the underlying representation...

(from the docs: *When possible, the returned tensor will be a view of input. Otherwise, it will be a copy*)

so be careful: reshape won't always create a copy, and you'll need to make one manually (e.g. with .clone()) if you don't want later changes to affect the original

In [11]:
# For the (3, 3, 3) target shape both reshape and view work on op_t
another_shape = torch.reshape(op_t, (3, 3, 3))
another_shape, op_t.view((3, 3, 3))

(tensor([[[11, 14, 17],
          [21, 24, 27],
          [31, 34, 37]],
 
         [[12, 15, 18],
          [22, 25, 28],
          [32, 35, 38]],
 
         [[13, 16, 19],
          [23, 26, 29],
          [33, 36, 39]]]),
 tensor([[[11, 14, 17],
          [21, 24, 27],
          [31, 34, 37]],
 
         [[12, 15, 18],
          [22, 25, 28],
          [32, 35, 38]],
 
         [[13, 16, 19],
          [23, 26, 29],
          [33, 36, 39]]]))

### linear algebra

ok enough on that for the moment

onto some basic linear algebra

In [12]:
# dtype=float is Python's built-in float here, which gives a float64 tensor
ones = torch.ones((3, 2), dtype=float)

# dtype=torch.float gives a float32 tensor: one row of 2s and one row of 3s
twothree = torch.tensor([[2] * 3, [3] * 3], dtype=torch.float)

ones, twothree

(tensor([[1., 1.],
         [1., 1.],
         [1., 1.]], dtype=torch.float64),
 tensor([[2., 2., 2.],
         [3., 3., 3.]]))

In [15]:
# Multiply the float32 and float64 tensors to show the dtype-mismatch error.
# Only RuntimeError is caught (the error type torch raises for this failure);
# any other exception should surface rather than be printed as if it were
# the expected message.
try:
    twothree @ ones
except RuntimeError as err:
    print(err)

expected m1 and m2 to have the same dtype, but got: float != double


another one of those slightly weird things to keep track of... dtype=torch.float (32-bit) != dtype=float (Python's float, which torch treats as 64-bit)

In [16]:
# Show the two dtypes side by side
print(f"{ones.dtype} {twothree.dtype}")

torch.float64 torch.float32


In [17]:
# Rebuild ones as float32 (torch.float is an alias of torch.float32)
ones = torch.ones((3, 2), dtype=torch.float32)

In [18]:
# (2, 3) @ (3, 2) gives a (2, 2) result
twothree.size(), ones.size(), twothree @ ones

(torch.Size([2, 3]),
 torch.Size([3, 2]),
 tensor([[6., 6.],
         [9., 9.]]))

In [19]:
# Same operation as `ones @ twothree`: (3, 2) times (2, 3) gives (3, 3)
torch.matmul(ones, twothree)

tensor([[5., 5., 5.],
        [5., 5., 5.],
        [5., 5., 5.]])

and just to review the basic operation: a dot b (or a @ b as it's written in torch here)

so twothree @ ones is:

take twothree's first row: twothree[0] = tensor([2., 2., 2.])

and the first column of ones: [1, 1, 1]

and 2*1 + 2*1 + 2*1 = 6

the same for first row, second column (again 6)

so the first row of the result is: [6, 6]

to form the second row we follow the same sequence to get: [9, 9]

### linear layers

dot product is the way we get our linear layer to work efficiently

a linear layer combines the input with weights to form an output - this is the neuron idea 

and just to be clear for a thing (an input) with 5 values, the neuron needs 5 weights 

for one neuron we get one output

In [21]:
# One input with 5 values (all ones) and one neuron with 5 random weights.
# NOTE: the name `input` shadows the Python builtin; it is kept because
# later cells refer to it.
input = torch.ones(5)
w = torch.randn(5)
# 1-D times 1-D gives a single scalar output
out = torch.matmul(input, w)

print(w)
# With an all-ones input the output equals the sum of the weights
out.shape, out, w.sum()

tensor([-0.0450,  0.6786,  1.4062,  0.3017,  0.6999])


(torch.Size([]), tensor(3.0414), tensor(3.0414))

so to create multiple neurons we increase the weights dimension

if we have 3 neurons we get 3 outputs

In [22]:
# One column of 5 weights per neuron: 3 neurons -> weight shape (5, 3)
w = torch.randn((5, 3))
out = torch.matmul(input, w)

out

tensor([0.5632, 4.3752, 3.5168])

### multidimension input

we can add a channel dimension to the input

say we have a bunch of points

In [23]:
# Five 2-D points, one per row
input = torch.tensor(
    [
        [0, 1],
        [0.5, 0.866],
        [0.707, 0.707],
        [1, 0],
        [3, 4],
    ],
    dtype=torch.float,
)
input.shape

torch.Size([5, 2])

### Let's give these dimensions some names

we have 5 things with 2 dimensions

let's call the 5 things the F dimension (for Feature maybe)

and the point dimensions C (for Channel maybe)

We can't just use our new input with the original weights

In [38]:
# Multiply the (5, 2) input by the earlier weights to show the shape error.
# Only RuntimeError is caught (the error type torch raises for this failure);
# any other exception should surface rather than be printed as if it were
# the expected message.
try:
    out = input @ w
except RuntimeError as err:
    print(err)

mat1 and mat2 shapes cannot be multiplied (5x2 and 5x3)


we might think we can define our weights like:

In [39]:
# Random weights of shape (5, 2, 3)
nw = torch.randn((5, 2, 3))
nw

tensor([[[ 2.2074,  0.2139,  0.3946],
         [ 2.0351, -0.4093, -1.0297]],

        [[ 0.3734,  0.5522,  1.3682],
         [ 0.1672, -0.4822, -0.4234]],

        [[-0.9352, -0.1425,  0.0659],
         [-0.5288,  0.2957, -0.5723]],

        [[-2.0087,  0.2470,  0.2548],
         [ 1.7518,  0.2243,  0.3648]],

        [[ 0.9314,  0.0761,  1.0594],
         [ 1.2530,  0.3352,  0.3342]]])

and we get a dot product ok

In [41]:
# Multiplying the 2-D input by the 3-D weights gives one result per
# leading-dimension slice of nw
out = torch.matmul(input, nw)
out.size(), out[0]

(torch.Size([5, 5, 3]),
 tensor([[ 2.0351, -0.4093, -1.0297],
         [ 2.8661, -0.2475, -0.6944],
         [ 2.9994, -0.1381, -0.4490],
         [ 2.2074,  0.2139,  0.3946],
         [14.7624, -0.9954, -2.9351]]))

but that's not really what we wanted is it?

we have 3 neurons so we wanted three activation outputs

To make the neuron thing obvious we'd prefer something like

In [29]:
# Random weights of shape (3, 5, 2): the neuron dimension comes first
nw_b = torch.randn((3, 5, 2))
nw_b, nw_b.size()

(tensor([[[ 0.9002,  0.3244],
          [-0.4088, -1.0205],
          [ 0.3538, -0.9612],
          [-0.0234,  0.7453],
          [-1.3945,  0.1509]],
 
         [[-0.2610, -0.6733],
          [-0.1594,  1.0097],
          [ 0.3821, -0.2656],
          [-1.6459, -0.2405],
          [ 0.7149, -0.6207]],
 
         [[-0.7647, -2.4796],
          [ 0.3835,  0.8685],
          [ 1.3706, -0.0041],
          [ 0.3463,  0.6508],
          [-0.0868,  1.8840]]]),
 torch.Size([3, 5, 2]))

In [30]:
# Viewing nw_b as (3, 2, 5) makes the shapes line up for `input @ ...`.
# view keeps the elements in their stored order; it is not a transpose.
nw_b.view((3, 2, 5))

tensor([[[ 0.9002,  0.3244, -0.4088, -1.0205,  0.3538],
         [-0.9612, -0.0234,  0.7453, -1.3945,  0.1509]],

        [[-0.2610, -0.6733, -0.1594,  1.0097,  0.3821],
         [-0.2656, -1.6459, -0.2405,  0.7149, -0.6207]],

        [[-0.7647, -2.4796,  0.3835,  0.8685,  1.3706],
         [-0.0041,  0.3463,  0.6508, -0.0868,  1.8840]]])

In [43]:
# Multiply the input by the re-viewed weights and look at the first slice
three_outputs = torch.matmul(input, nw_b.view((3, 2, 5)))
three_outputs[0]

tensor([[-0.9612, -0.0234,  0.7453, -1.3945,  0.1509],
        [-0.3823,  0.1420,  0.4410, -1.7179,  0.3076],
        [-0.0431,  0.2129,  0.2378, -1.7074,  0.3568],
        [ 0.9002,  0.3244, -0.4088, -1.0205,  0.3538],
        [-1.1440,  0.8798,  1.7545, -8.6395,  1.6649]])

### The wrong path

again not what we want at all:

actually what we do is simpler - looking at a torch linear layer, we're allowed to define the 1D in_features and the 1D out_features

we nearly had that above: 3 groups of 10 things (our 10 things were in 2 dimensions though)

we just make in_features = F * C, and the number of neurons is out_features

In [42]:
# Linear layer without bias: 5 * 2 flattened input features, 3 outputs
linl = nn.Linear(in_features=5 * 2, out_features=3, bias=False)
# The weight matrix has one row per output and one column per input feature
linl.weight.data

tensor([[ 0.2478,  0.0591, -0.0774,  0.0024, -0.1309,  0.0833,  0.0797,  0.2091,
          0.0818,  0.3126],
        [-0.1192, -0.0415, -0.0747, -0.2269, -0.2640,  0.1607, -0.2473, -0.2439,
          0.0712,  0.2550],
        [-0.0193,  0.1985, -0.2678, -0.2923,  0.2221, -0.0466, -0.0418,  0.0893,
          0.2043, -0.0953]])

In [45]:
# Flatten the input into one dimension holding all of its elements, then
# apply the linear layer
linl(input.view(input.numel()))

tensor([1.5643, 0.6378, 0.1255], grad_fn=<SqueezeBackward4>)

### Next time:

we'll start looking at specific manipulations for our gpt workout