# FP16 summation silently drops small addends

## FP32 (single precision) summation on CPU

In [5]:
import torch

# Baseline: add a small value to 1 using default-dtype tensors on the CPU.
# x is built from the Python int 1 (it prints as tensor(1)); adding the float
# tensor y produces a floating-point sum that keeps the 0.0001 contribution.
x, y = torch.tensor(1), torch.tensor(1e-4)
print(x, y, torch.add(x, y))

tensor(1) tensor(1.0000e-04) tensor(1.0001)


## FP16 summation in CPU

In [2]:
# Repeat the summation with both operands cast to FP16 on the CPU.
x = torch.tensor(1).half()
y = torch.tensor(0.0001).half()
try:
    print(x, y, torch.add(x, y))
except RuntimeError as err:
    # The recorded run failed here because the installed PyTorch build had no
    # CPU kernel for Half addition. The failure is the point of this cell, so
    # report it in the usual "ErrorType: message" form instead of aborting the
    # notebook; on builds that do support the op, the sum is printed as usual.
    print(f"{type(err).__name__}: {err}")

RuntimeError: "add_cpu/sub_cpu" not implemented for 'Half'

## FP16 summation in GPU

In [3]:
# FP16 summation on the GPU: both operands are moved to the CUDA device and
# cast to half precision before being added.
if torch.cuda.is_available():
    x = torch.tensor(1).to('cuda').half()
    y = torch.tensor(0.0001).to('cuda').half()
    print(x, y, torch.add(x, y))
else:
    # Without a CUDA device the .to('cuda') calls would abort the notebook run.
    # Clear x and y so stale tensors from an earlier cell are not mistaken for
    # this cell's operands.
    x = y = None
    print("CUDA is not available; skipping the FP16 GPU summation.")

tensor(1., device='cuda:0', dtype=torch.float16) tensor(0.0001, device='cuda:0', dtype=torch.float16) tensor(1., device='cuda:0', dtype=torch.float16)


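Incorrect summation because of FP16: near 1.0, adjacent FP16 values are about 0.001 apart ($2^{-10}$), so adding 0.0001 rounds straight back to 1.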

## FP16 + FP32 (mixed precision) summation on GPU

In [6]:
# Mixed-precision summation on the GPU: the operands are stored in FP16, but x
# is cast back to FP32 for the addition, so the sum is computed in FP32.
if torch.cuda.is_available():
    x = torch.tensor(1).to('cuda').half()
    y = torch.tensor(0.0001).to('cuda').half()
    print(x, y, torch.add(x.float(), y))
else:
    # Without a CUDA device the .to('cuda') calls would abort the notebook run.
    # Clear x and y so stale tensors from an earlier cell are not mistaken for
    # this cell's operands.
    x = y = None
    print("CUDA is not available; skipping the mixed-precision GPU summation.")

tensor(1., device='cuda:0', dtype=torch.float16) tensor(0.0001, device='cuda:0', dtype=torch.float16) tensor(1.0001, device='cuda:0')


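Correct summation: casting `x` to FP32 makes the addition run in FP32, so the small FP16 addend is preserved in the result.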

# PyTorch clone and detach

In [13]:
# x takes part in autograd; y is a detached copy holding the same value.
x = torch.tensor(10.0, requires_grad=True)
y = x.detach().clone()

# Backpropagate through z = x^2 so that x.grad is populated.
z = x.pow(2)
z.backward()

# Show the two tensors, then their gradients: only x has one.
print(x, y)
print(x.grad, y.grad)

tensor(10., requires_grad=True) tensor(10.)
tensor(20.) None


A detached clone does not receive gradients: `y.grad` is still `None` after `backward()`. In the [FP16 video tutorial](https://youtu.be/9tpLJpqxdE8) the optimizer step was done on detached copies of the parameters. The optimizer skips any parameter whose `.grad` is `None`, hence the grads had to be synced to those copies before stepping.

    
https://pytorch.org/docs/stable/_modules/torch/optim/sgd.html#SGD

````python
     # Excerpt: visit each parameter listed under this group's 'params' key.
     for p in group['params']:
                # A parameter whose .grad is None is skipped for this iteration.
                if p.grad is None:
                    continue
````