In [1]:
#https://higher.readthedocs.io/en/latest/toplevel.html

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import higher
import numpy as np

In [3]:
np.random.seed(1)
torch.manual_seed(3)
N = 50
actual_multiplier = 5.1
meta_lr = 0.00001
adaptation_steps = 50 # how many iterations in the inner loop we want to do

In [4]:
x = torch.tensor(np.random.random((N,1)), dtype=torch.float64) # features for inner training loop
# print(x)
y = x * actual_multiplier # target for inner training loop

In [5]:
# Model
model = nn.Linear(1, 1, bias=False).double() # simplest possible model - multiple 
# input x by weight w without bias

In [6]:
meta_opt = optim.SGD(model.parameters(), lr=meta_lr, momentum=0.)

In [7]:
def run_inner_loop_once(model, verbose, copy_initial_weights):
    
    lr_tensor = torch.tensor([0.3], requires_grad=True)
    momentum_tensor = torch.tensor([0.5], requires_grad=True)
    opt = optim.SGD(model.parameters(), lr=0.3, momentum=0.5)
    with higher.innerloop_ctx(model, opt, copy_initial_weights=copy_initial_weights, override={'lr': lr_tensor, 'momentum': momentum_tensor}) as (fmodel, diffopt):
        for j in range(adaptation_steps):
            if verbose:
                print('Starting inner loop step j=={0}'.format(j))
                print('    Representation of fmodel.parameters(time={0}): {1}'.format(j, str(list(fmodel.parameters(time=j)))))
                print('    Notice that fmodel.parameters() is same as fmodel.parameters(time={0}): {1}'.format(j, (list(fmodel.parameters())[0] is list(fmodel.parameters(time=j))[0])))
            out = fmodel(x)
            if verbose:
                print('    Notice how `out` is `x` multiplied by the latest version of weight: {0:.4} * {1:.4} == {2:.4}'.format(x[0,0].item(), list(fmodel.parameters())[0].item(), out[0].item()))
            loss = ((out - y)**2).mean()
            diffopt.step(loss)

        if verbose:
            # after all inner training let's see all steps' parameter tensors
            print()
            print("Let's print all intermediate parameters versions after inner loop is done:")
            for j in range(adaptation_steps+1):
                print('    For j=={0} parameter is: {1}'.format(j, str(list(fmodel.parameters(time=j)))))
            print()

        # let's imagine now that our meta-learning optimization is trying to check how far we got in the end from the actual_multiplier
        weight_learned_after_full_inner_loop = list(fmodel.parameters())[0]
        meta_loss = (weight_learned_after_full_inner_loop - actual_multiplier)**2
        print('  Final meta-loss: {0}'.format(meta_loss.item()))
        meta_loss.backward() # will only propagate gradient to original model parameter's `grad` if copy_initial_weight=False
        if verbose:
            print('  Gradient of final loss we got for lr and momentum: {0} and {1}'.format(lr_tensor.grad, momentum_tensor.grad))
            print('  If you change number of iterations "loops" to much larger number final loss will be stable and the values above will be smaller')
        return meta_loss.item()

In [8]:
print("*" * 100)
print('=================== Run Inner Loop First Time (copy_initial_weights=True) =================\n')
meta_loss_val1 = run_inner_loop_once(model, verbose=True, copy_initial_weights=True)
print("\nLet's see if we got any gradient for initial model parameters: {0}\n".format(
    list(model.parameters())[0].grad))
print("#" * 100)
print('=================== Run Inner Loop Second Time (copy_initial_weights=False) =================\n')
meta_loss_val2 = run_inner_loop_once(model, verbose=True, copy_initial_weights=False)
print("\nLet's see if we got any gradient for initial model parameters: {0}\n".format(
    list(model.parameters())[0].grad))
print("*" * 100)

print('=================== Run Inner Loop Third Time (copy_initial_weights=False) =================\n')
final_meta_gradient = list(model.parameters())[0].grad.item()
# Now let's double-check `higher` library is actually doing what it promised to do, not just giving us
# a bunch of hand-wavy statements and difficult to read code.
# We will do a simple SGD step using meta_opt changing initial weight for the training and see how meta loss changed
meta_opt.step()
meta_opt.zero_grad()
meta_step = - meta_lr * final_meta_gradient # how much meta_opt actually shifted inital weight value
meta_loss_val3 = run_inner_loop_once(model, verbose=False, copy_initial_weights=False)

meta_loss_gradient_approximation = (meta_loss_val3 - meta_loss_val2) / meta_step

print()
print('Side-by-side meta_loss_gradient_approximation and gradient computed by `higher` lib: {0:.4} VS {1:.4}'.format(
    meta_loss_gradient_approximation, final_meta_gradient))

****************************************************************************************************

Starting inner loop step j==0
    Representation of fmodel.parameters(time=0): [tensor([[-0.9915]], dtype=torch.float64, requires_grad=True)]
    Notice that fmodel.parameters() is same as fmodel.parameters(time=0): True
    Notice how `out` is `x` multiplied by the latest version of weight: 0.417 * -0.9915 == -0.4135
Starting inner loop step j==1
    Representation of fmodel.parameters(time=1): [tensor([[0.1462]], dtype=torch.float64, grad_fn=<AddBackward0>)]
    Notice that fmodel.parameters() is same as fmodel.parameters(time=1): True
    Notice how `out` is `x` multiplied by the latest version of weight: 0.417 * 0.1462 == 0.06095
Starting inner loop step j==2
    Representation of fmodel.parameters(time=2): [tensor([[1.6401]], dtype=torch.float64, grad_fn=<AddBackward0>)]
    Notice that fmodel.parameters() is same as fmodel.parameters(time=2): True
    Notice how `out` is `x` mult

    Representation of fmodel.parameters(time=19): [tensor([[5.0910]], dtype=torch.float64, grad_fn=<AddBackward0>)]
    Notice that fmodel.parameters() is same as fmodel.parameters(time=19): True
    Notice how `out` is `x` multiplied by the latest version of weight: 0.417 * 5.091 == 2.123
Starting inner loop step j==20
    Representation of fmodel.parameters(time=20): [tensor([[5.0951]], dtype=torch.float64, grad_fn=<AddBackward0>)]
    Notice that fmodel.parameters() is same as fmodel.parameters(time=20): True
    Notice how `out` is `x` multiplied by the latest version of weight: 0.417 * 5.095 == 2.125
Starting inner loop step j==21
    Representation of fmodel.parameters(time=21): [tensor([[5.0981]], dtype=torch.float64, grad_fn=<AddBackward0>)]
    Notice that fmodel.parameters() is same as fmodel.parameters(time=21): True
    Notice how `out` is `x` multiplied by the latest version of weight: 0.417 * 5.098 == 2.126
Starting inner loop step j==22
    Representation of fmodel.param

In [8]:
print("#" * 100)
print('=================== Run Inner Loop Second Time (copy_initial_weights=False) =================\n')
meta_loss_val2 = run_inner_loop_once(model, verbose=True, copy_initial_weights=False)
print("\nLet's see if we got any gradient for initial model parameters: {0}\n".format(
    list(model.parameters())[0].grad))
print("*" * 100)

print('=================== Run Inner Loop Third Time (copy_initial_weights=False) =================\n')
final_meta_gradient = list(model.parameters())[0].grad.item()
# Now let's double-check `higher` library is actually doing what it promised to do, not just giving us
# a bunch of hand-wavy statements and difficult to read code.
# We will do a simple SGD step using meta_opt changing initial weight for the training and see how meta loss changed
meta_opt.step()
meta_opt.zero_grad()
meta_step = - meta_lr * final_meta_gradient # how much meta_opt actually shifted inital weight value
meta_loss_val3 = run_inner_loop_once(model, verbose=False, copy_initial_weights=False)

meta_loss_gradient_approximation = (meta_loss_val3 - meta_loss_val2) / meta_step

print()
print('Side-by-side meta_loss_gradient_approximation and gradient computed by `higher` lib: {0:.4} VS {1:.4}'.format(
    meta_loss_gradient_approximation, final_meta_gradient))

####################################################################################################

Starting inner loop step j==0
    Representation of fmodel.parameters(time=0): [tensor([[-0.9915]], dtype=torch.float64, grad_fn=<CloneBackward>)]
    Notice that fmodel.parameters() is same as fmodel.parameters(time=0): True
    Notice how `out` is `x` multiplied by the latest version of weight: 0.417 * -0.9915 == -0.4135
Starting inner loop step j==1
    Representation of fmodel.parameters(time=1): [tensor([[0.1462]], dtype=torch.float64, grad_fn=<AddBackward0>)]
    Notice that fmodel.parameters() is same as fmodel.parameters(time=1): True
    Notice how `out` is `x` multiplied by the latest version of weight: 0.417 * 0.1462 == 0.06095
Starting inner loop step j==2
    Representation of fmodel.parameters(time=2): [tensor([[1.6401]], dtype=torch.float64, grad_fn=<AddBackward0>)]
    Notice that fmodel.parameters() is same as fmodel.parameters(time=2): True
    Notice how `out` is `x`


Side-by-side meta_loss_gradient_approximation and gradient computed by `higher` lib: 0.0 VS -2.533e-14


In [9]:
def accuracy(predictions, targets):
    predictions = predictions.argmax(dim=1).view(targets.shape)
    return (predictions == targets).sum().float() / targets.size(0)

In [10]:
import torch

In [12]:
print(accuracy(torch.tensor([1,1,1,1,1]), torch.tensor([1,1,1,1,1])))

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)