In [0]:
import torch
from torch import tensor, manual_seed, rand
import math
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import plotly.graph_objects as go
from plotly import express as px

# Basic use of tensor library

Tensors are to the torch package what arrays are to numpy. If you are familiar with one you can easily handle the other.

# Create one-dimensional tensor

In [74]:
# The leading underscore makes "123" a valid Python identifier that spells out the values
_123 = tensor([1,2,3])
_123

tensor([1, 2, 3])

# ... multiply with a constant

In [75]:
# The scalar is applied to every element of the tensor
_123 * 7

tensor([ 7, 14, 21])

# ... dot product

In [0]:
# Second operand for the dot-product examples
_456 = tensor([4,5,6])

## Dot product by hand:
Let's calculate it "by hand":

1*4 + 2*5 + 3*6 = 32

## Dot product basic:
Calculate it using basic operators: Multiply the elements of the vectors, and sum up:

In [0]:
# Element-wise product first, then reduce the products with a sum
dot_basic = torch.sum(_123 * _456)
assert dot_basic == tensor(32), 'Should match dot_basic'

## Dot product elegant:
Now let's calculate the dot product with the matrix @ operator

In [0]:
# For two 1-D tensors the @ operator computes the dot product.
# (_123 is already defined above; it does not need to be rebuilt here.)
dot_elegant = _123 @ _456
assert tensor(32) == dot_elegant, 'Should match dot_elegant'

## Random Matrix

In [79]:
# Fix the global RNG state so the numbers below are reproducible
torch.manual_seed(314)

# 2 x 3 matrix of uniform samples from [0, 1)
torch.rand(2, 3)

tensor([[0.7196, 0.6295, 0.6667],
        [0.3385, 0.8522, 0.3126]])

# Random Indicator Matrix

In [80]:
# A particular use case might be an indicator matrix recording, for example,
# which words are in which documents.

# Let's generate a random matrix for such a use case
number_of_documents = 17
number_of_words = 3

# probability of a particular word being in a particular document
p = 0.2

manual_seed(123)
# An entry becomes 1 when its uniform draw falls below p. Compare against p
# itself (not a repeated literal) so that changing p actually takes effect.
indicator = (rand(size=[number_of_documents, number_of_words]) < p).int()
indicator

tensor([[0, 0, 0],
        [0, 1, 0],
        [1, 1, 1],
        [0, 0, 0],
        [1, 1, 0],
        [0, 1, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 1],
        [0, 0, 0],
        [0, 0, 0],
        [1, 0, 1],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]], dtype=torch.int32)

# Magic Differentiation
This part relies on the magic of [automatic differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation).
This magic is available in PyTorch via the package autograd. This magic underpins all the recent successes of neural networks, because with it you don't have to do the tedious calculation of derivatives by hand, which would otherwise be necessary. Derivatives, in turn, are of course necessary for Stochastic Gradient Descent, which is the workhorse optimisation technique of Deep Learning. Without automatic differentiation there would be no Deep Learning renaissance. So let's do it. Let's use the magic.

In [0]:
# Two scalar leaf tensors whose gradients autograd should track
x = torch.tensor(3.0, requires_grad=True)
y = torch.tensor(7.0, requires_grad=True)
z = x * y.pow(5)

# Back-propagate from z; this fills x.grad and y.grad
z.backward()

# We have 

- z = x*y**5

Therefore, using calculus, we know that the derivative of z:

- with respect to x is y**5
- with respect to y is x * 5 * y**4

Remark: In technical lingo these "derivatives with respect to ..." are known as partial derivatives.

Let's check whether we can calculate this using pytorch.

First we note some values

In [82]:
# y**5, 5*y**4 and x*5*y**4 evaluated at x = 3, y = 7
7**5, 5 * 7**4, 3 * 5 * 7**4

(16807, 12005, 36015)

## Let's check the derivative with respect to x

In [0]:
# dz/dx must equal y**5 at y = 7; the same value is checked in three spellings
assert x.grad.item() == 16807, 'Derivative of z with respect to x should match 16807'
assert x.grad.item() == 7**5, 'Derivative of z with respect to x should match 7^5'
assert x.grad.item() == (y**5).item(), 'Derivative of z with respect to x should match y^5'

## Let's check the derivative with respect to y

In [0]:
# dz/dy must equal x * 5 * y**4 at x = 3, y = 7; checked in three spellings.
# (A bare comparison whose result was discarded has been dropped; the
# assertions below perform the actual checks.)
assert y.grad == tensor(36015), 'Derivative of z with respect to y should match 36015'
assert y.grad == tensor(3 * 5 * 7**4), 'Derivative of z with respect to y should match 3 * 5 * 7**4'
assert y.grad == 3 * 5 * y**4, 'Derivative of z with respect to y should match 3 * 5 * y**4'

# Let's ramp it up a little, calculating the derivatives of a function

In [85]:
# `steps` has to be passed explicitly in current PyTorch releases; 100 matches
# the number of sample points in the recorded output.
x = torch.linspace(0, 2*math.pi, steps=100, requires_grad=True)
y = torch.sin(x)

# https://stackoverflow.com/questions/55749202/getting-gradient-of-vectorized-function-in-pytorch
# y is not a scalar, so backward() needs an explicit upstream gradient.
# A vector of ones yields d(sum(y))/dx, i.e. dy_i/dx_i for every sample point.
y.backward(torch.ones_like(x))
x.grad

tensor([ 1.0000,  0.9980,  0.9920,  0.9819,  0.9679,  0.9501,  0.9284,  0.9029,
         0.8738,  0.8413,  0.8053,  0.7660,  0.7237,  0.6785,  0.6306,  0.5801,
         0.5272,  0.4723,  0.4154,  0.3569,  0.2969,  0.2358,  0.1736,  0.1108,
         0.0476, -0.0159, -0.0793, -0.1423, -0.2048, -0.2665, -0.3271, -0.3863,
        -0.4441, -0.5000, -0.5539, -0.6056, -0.6549, -0.7015, -0.7453, -0.7861,
        -0.8237, -0.8580, -0.8888, -0.9161, -0.9397, -0.9595, -0.9754, -0.9874,
        -0.9955, -0.9995, -0.9995, -0.9955, -0.9874, -0.9754, -0.9595, -0.9397,
        -0.9161, -0.8888, -0.8580, -0.8237, -0.7861, -0.7453, -0.7015, -0.6549,
        -0.6056, -0.5539, -0.5000, -0.4441, -0.3863, -0.3271, -0.2665, -0.2048,
        -0.1423, -0.0793, -0.0159,  0.0476,  0.1108,  0.1736,  0.2358,  0.2969,
         0.3569,  0.4154,  0.4723,  0.5272,  0.5801,  0.6306,  0.6785,  0.7237,
         0.7660,  0.8053,  0.8413,  0.8738,  0.9029,  0.9284,  0.9501,  0.9679,
         0.9819,  0.9920,  0.9980,  1.00

In [0]:
# NumPy arrays for plotting; detach() drops the autograd tracking first
X, Y = x.detach().numpy(), y.detach().numpy()
dY_vs_dX = x.grad.detach().numpy()


In [106]:
# One trace per curve: the function, its autograd derivative, and the analytic derivative
trace0 = go.Scatter(x=X, y=Y, name='Sin')
trace1 = go.Scatter(x=X, y=dY_vs_dX, name='Aut. diff of Sin', mode='markers')
trace2 = go.Scatter(x=X, y=np.cos(X), name='Cos', mode='lines')
data = [trace0, trace1, trace2]

layout = go.Layout(title='Automatic Differentiation of function')

fig = go.Figure(data=data, layout=layout)
fig.show()

# As you can see by zooming in, the result of automatic differentiation faithfully matches the expected cos function. Hurray!

This is the magic in action!


# Let's dress it up as an optimisation with a loss function

We may also dress this calculation up in a fashion that is more akin to the way neural network optimisations are done, namely by using a loss function. However, to achieve the desired derivative, we need a loss function that is atypical.


In [0]:
# Special-purpose loss function.
# Attention: the differences are summed as-is (not squared, not averaged).
def myloss(y_hat, y):
    """Return the sum of the element-wise differences ``y_hat - y``.

    Because nothing is squared or averaged, the derivative of the result
    with respect to each element of ``y_hat`` is exactly 1.
    """
    residual = y_hat - y
    return residual.sum()

In [0]:
# `steps` has to be passed explicitly in current PyTorch releases; 100 keeps
# the same sampling as the first sin example.
x = torch.linspace(0, 2*math.pi, steps=100, requires_grad=True)
y = torch.sin(x)

Y_hat = y
# The target is a constant tensor of zeros. zeros_like keeps it out of the
# autograd graph, so the gradient flows only through Y_hat.
Y_true = torch.zeros_like(Y_hat)

In [0]:

# Scalar loss, so backward() needs no explicit gradient argument
loss = myloss(y_hat=Y_hat, y=Y_true)
loss.backward()

In [91]:
# plotly (go, px) is already imported in the first cell, so it is not re-imported here.

# The first three traces repeat the earlier figure; trace3 adds the gradient
# obtained through the loss function for comparison.
trace0 = go.Scatter(x=X, y=Y, name='Sin')
trace1 = go.Scatter(x=X, y=dY_vs_dX, name='Aut. diff of Sin', mode='markers')
trace2 = go.Scatter(x=X, y=np.cos(X), name='Cos', mode='lines')
trace3 = go.Scatter(x=X, y=x.grad.detach().numpy(), name='Aut. diff of Sin loss', mode='lines+markers')
data = [trace0, trace1, trace2, trace3]

layout = go.Layout(title='Automatic Differentiation of function')

fig = go.Figure(data=data, layout=layout)
fig.show()


# Again one can see that the result of the second automatic differentiation, using the loss function, faithfully matches the expected cos function. Hurray!

# Now we have the necessary pieces to do some interesting Matrix calculations
Some interesting references: http://bytepawn.com/pytorch-basics-solving-the-axb-matrix-equation-with-gradient-descent.html

https://arthought.com/comparison-of-a-very-simple-regression-in-tensorflow-and-keras/


https://github.com/hfwittmann/comparison-tensorflow-keras/blob/master/04%20-%20Custom_training_basics_standard_optimizer_pytorch.ipynb


# Matrix inverse

In [92]:
# Diagonal matrix with 1..4 on the diagonal (floats, so it can be inverted)
M1234 = tensor([1., 2., 3., 4.]).diag()
M1234

tensor([[1., 0., 0., 0.],
        [0., 2., 0., 0.],
        [0., 0., 3., 0.],
        [0., 0., 0., 4.]])

## Let's use the builtin method first

In [93]:
# Reference result from the library's own matrix inversion
inverse = M1234.inverse()
inverse

tensor([[1.0000, 0.0000, 0.0000, -0.0000],
        [0.0000, 0.5000, 0.0000, -0.0000],
        [0.0000, 0.0000, 0.3333, -0.0000],
        [0.0000, 0.0000, 0.0000, 0.2500]])

In [94]:
# Sanity check: inverse times original should give the identity matrix
torch.matmul(inverse, M1234)

tensor([[1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.]])

# Now let's use gradient descent to do the same

So what we want is a matrix that - multiplied by the original matrix M1234 - yields the identity matrix

In [95]:
# 4 x 4 float identity matrix: the target that inverse @ M1234 should reproduce
identity = torch.eye(4)
Y_true = identity
Y_true

tensor([[1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.]])

In [0]:
# Reproducible random starting guess for the inverse; a leaf tensor that autograd tracks
torch.manual_seed(314)
random_inverse = torch.rand(4, 4).requires_grad_()

In [97]:
# Product of the current guess with the original matrix
Y_hat = torch.matmul(random_inverse, M1234)
Y_hat

tensor([[0.7196, 1.2590, 2.0002, 1.3540],
        [0.8522, 0.6251, 1.5017, 1.8571],
        [0.0083, 0.8938, 2.4086, 2.2047],
        [0.3556, 0.7420, 2.0490, 1.2297]], grad_fn=<MmBackward>)

Of course random_inverse is not yet the inverse, therefore Y_hat is not the identity matrix.


---

A useful distance is the element-wise mse

In [0]:
def mse(y, y_hat):
    """Return the mean of the squared element-wise differences between ``y`` and ``y_hat``."""
    diff = y - y_hat
    return (diff**2).mean()

In [0]:
# Distance between the target (identity) and the current product
loss = mse(y=Y_true, y_hat=Y_hat)

In [0]:
# Back-propagate from the scalar loss; the gradient is accumulated in random_inverse.grad
loss.backward()

In [101]:
# Gradient of the mse loss with respect to each entry of random_inverse
random_inverse.grad

tensor([[-3.5046e-02,  3.1475e-01,  7.5007e-01,  6.7699e-01],
        [ 1.0653e-01, -9.3718e-02,  5.6315e-01,  9.2856e-01],
        [ 1.0338e-03,  2.2346e-01,  5.2824e-01,  1.1024e+00],
        [ 4.4448e-02,  1.8549e-01,  7.6836e-01,  1.1486e-01]])

In [0]:
# NumPy copy of the gradient for inspection.
# NOTE(review): this module-level G is not read again in the visible cells;
# update() assigns its own local G.
G=random_inverse.grad.detach().numpy()

Let us use gradient descent with respect to the random_inverse and mse as the loss function to approximate the inverse matrix.

In [0]:
# Constant learning-rate schedule, one entry per gradient-descent step.
# The previous expression np.log(np.logspace(0.2, 0.2, 100)) had identical
# start and stop exponents, so it evaluated to 100 copies of
# ln(10**0.2) = 0.2 * ln(10) ~= 0.4605; the same schedule is spelled out here.
N_STEPS = 100
LEARNING_RATE = 0.2 * np.log(10)
lrs = np.full(N_STEPS, LEARNING_RATE)

# Loss recorded at every step, filled by update()
losses = []

def update(lr, step=None):
  """Perform one gradient-descent step on the module-level ``random_inverse``.

  Computes ``random_inverse @ M1234``, measures its mse against ``Y_true``,
  appends that loss to ``losses``, and then moves ``random_inverse`` against
  its gradient, scaled by ``lr``. Every 20th step the current product is
  printed and shown as a heat map.

  Args:
    lr: learning rate for this step.
    step: index of the current step, used only to decide when to report
      progress. When omitted, the module-level loop variable ``t`` is read
      instead, which keeps existing ``update(lr)`` calls working.
  """
  if step is None:
    # Backward compatibility: fall back to the caller's global loop counter.
    step = t

  Y_hat = random_inverse @ M1234
  loss = mse(Y_true, Y_hat)
  loss.backward()
  # Store a plain Python float rather than a 0-d array
  losses.append(loss.item())

  if step % 20 == 0:
    print(step, ':', loss)
    # random_inverse has not changed since Y_hat was computed, so reuse it
    product = Y_hat.detach().numpy()
    fig = px.imshow(product)
    fig.update_yaxes(showticklabels=False)
    fig.show()
    print(product)

  # The parameter update itself must not be recorded by autograd
  with torch.no_grad():
    random_inverse.sub_(float(lr) * random_inverse.grad)
    # Gradients accumulate across backward() calls, so reset for the next step
    random_inverse.grad.zero_()


In [104]:
# Print NumPy floats with two decimals so the matrices stay readable
np.set_printoptions(formatter={'float': lambda x: "{0:0.2f}".format(x)})

# Start from an empty history so that re-running this cell does not append
# to the losses of a previous run.
losses.clear()

# Same seed as before, so training starts from the same random guess
manual_seed(314)
random_inverse = torch.nn.Parameter(torch.rand(size=[4,4]))

# update() reads the loop variable t to decide when to report progress
for t, lr in enumerate(lrs):
  update(lr)


0 : tensor(1.6650, grad_fn=<MeanBackward0>)


[[0.72 1.26 2.00 1.35]
 [0.85 0.63 1.50 1.86]
 [0.01 0.89 2.41 2.20]
 [0.36 0.74 2.05 1.23]]
20 : tensor(0.0054, grad_fn=<MeanBackward0>)


[[0.91 0.01 0.00 0.00]
 [0.26 1.00 0.00 0.00]
 [0.00 0.00 1.00 0.00]
 [0.11 0.00 0.00 1.00]]
40 : tensor(0.0005, grad_fn=<MeanBackward0>)


[[0.97 0.00 0.00 0.00]
 [0.08 1.00 0.00 0.00]
 [0.00 0.00 1.00 0.00]
 [0.03 0.00 0.00 1.00]]
60 : tensor(4.7337e-05, grad_fn=<MeanBackward0>)


[[0.99 0.00 0.00 0.00]
 [0.02 1.00 0.00 -0.00]
 [0.00 0.00 1.00 0.00]
 [0.01 0.00 0.00 1.00]]
80 : tensor(4.4184e-06, grad_fn=<MeanBackward0>)


[[1.00 0.00 0.00 0.00]
 [0.01 1.00 0.00 -0.00]
 [0.00 0.00 1.00 0.00]
 [0.00 0.00 0.00 1.00]]


In [105]:
# pandas (pd) and plotly express (px) are already imported in the first cell.

# One loss value per gradient-descent step; the log axis shows the decay across magnitudes
df = pd.DataFrame({'losses':np.array(losses).flatten()})
px.line(df, y='losses', log_y=True)