# An Episodic Training Example
## Dataloaders setup

In [1]:
from data.unified_emotion.unified_emotion import unified_emotion
from transformers import AutoTokenizer

In [2]:
# Build the unified emotion dataset wrapper from the local JSONL file.
unified = unified_emotion("./data/datasets/unified-dataset.jsonl")

# Run the dataset's preparation step.
# NOTE(review): presumably required before `label_map` / `get_dataloader`
# are used further down - confirm against the dataset class.
unified.prep()

## Model setup

In [3]:
from transformers import BertConfig

from modules.mlp_clf import SF_CLF


# Name of the pre-trained Huggingface checkpoint; reused for the config
# and for the tokenizer below
bert_name = 'bert-base-uncased'
# Matching pre-trained configuration from Huggingface
bert_config = BertConfig.from_pretrained(bert_name)
# Highest BERT layer to keep frozen: 11 keeps the model frozen,
# -1 makes BERT totally trainable
nu = 11

# Widths of the MLP hidden layers
hidden_dims = [512, 256, 128]
# Which activation to use. Currently either tanh or ReLU
act_fn = 'ReLU'

# Bundle the settings above into the single dict handed to the model
config = dict(
    bert_name=bert_name,
    bert_config=bert_config,
    nu=nu,
    hidden_dims=hidden_dims,
    act_fn=act_fn,
)

tokenizer = AutoTokenizer.from_pretrained(bert_name)


In [4]:
from models.meta_bert import MetaBert

# Instantiate the meta-learning model from the `config` dict.
model = MetaBert(config)

# Print the module tree so the architecture can be inspected.
print(model)

MetaBert(
  (encoder): BertSequence(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                

# MAML

In [5]:
import higher
import torch.nn as nn
import torch.optim as optim

from data.utils.sampling import dataset_sampler

# Episode hyper-parameters
k = 4  # Number of shots
n_inner = 5  # Number of inner loop updates
n_outer = 10  # Number of episodes run by the outer loop

# Loss shared by the inner (support) and outer (query) steps
lossfn = nn.CrossEntropyLoss()

# Both optimizers are built over the same `model.parameters()`:
# AdamW performs the meta (outer) update, SGD the task (inner) update.
meta_optimizer = optim.AdamW(model.parameters(), lr=1e-3)
task_optimizer = optim.SGD(model.parameters(), lr=1e-1)

In [6]:
for i in range(n_outer):

    # A single episode
    # Clear the meta-gradients left over from the previous episode
    meta_optimizer.zero_grad()

    # Sample a task
    source_name = dataset_sampler(unified, sampling_method='sqrt')

    # Get task dataloaders
    trainloader, testloader = unified.get_dataloader(source_name, k=k, tokenizer=tokenizer, shuffle=True)

    # Re-initialize the softmax layer
    # Can be informed (e.g. ProtoMAML, LEOPARD, etc.)
    # NOTE(review): clf_layer is not registered with any optimizer, so it keeps
    # its random initialisation for the whole episode - confirm this is intended.
    n_classes = len(unified.label_map[source_name].keys())
    clf_layer = SF_CLF(n_classes=n_classes, hidden_dims=hidden_dims)

    print(f"Enter the innerloop for {source_name}, N={n_classes}")
    with higher.innerloop_ctx(model, task_optimizer, copy_initial_weights=False, track_higher_grads=False) as (fmodel, diffopt):

        # MAML support sets
        for ii in range(n_inner):
            # Sample batch
            batch = next(trainloader)
            labels, text, attn_mask = batch

            # Forward through the functional copy `fmodel`: diffopt.step() only
            # updates fmodel's fast weights, so calling `model` here would make
            # every inner update a no-op.
            y = fmodel(text, attn_mask)
            logits = clf_layer(y)
            inner_loss = lossfn(logits, labels)
            diffopt.step(inner_loss)

            print(f"\tInner {ii} | Loss={inner_loss.detach().tolist()}", flush=True)

        # MAML query set
        # NOTE(review): the query batch is drawn from `trainloader`; `testloader`
        # is never used - confirm which loader should provide the query set.
        batch = next(trainloader)
        labels, text, attn_mask = batch

        # Evaluate the adapted weights, not the initial ones
        y = fmodel(text, attn_mask)
        logits = clf_layer(y)
        outer_loss = lossfn(logits, labels)
        outer_loss.backward()

        # First-order meta-gradient: with track_higher_grads=False the fast
        # weights are detached from the initial weights, so the query gradient
        # lands on the fast weights and has to be copied onto `model`.
        for param, fast_param in zip(model.parameters(), fmodel.parameters()):
            # Skip frozen parameters, unused parameters, and the n_inner == 0
            # case where the fast weight still *is* the original parameter.
            if fast_param is param or fast_param.grad is None or not param.requires_grad:
                continue
            if param.grad is None:
                param.grad = fast_param.grad.detach().clone()
            else:
                param.grad.add_(fast_param.grad)

        print(f"\tOuter   | Loss={outer_loss.detach().tolist()}", flush=True)

    meta_optimizer.step()


Enter the innerloop for grounded_emotions, N=2
	Inner 0 | Loss=0.6940103769302368
	Inner 1 | Loss=0.6929729580879211
	Inner 2 | Loss=0.6936208605766296
	Inner 3 | Loss=0.6938116550445557
	Inner 4 | Loss=0.6934021711349487
	Outer   | Loss=0.6941459774971008
Enter the innerloop for ssec, N=7
	Inner 0 | Loss=1.9461575746536255
	Inner 1 | Loss=1.9454056024551392
	Inner 2 | Loss=1.948883056640625
	Inner 3 | Loss=1.9445468187332153
	Inner 4 | Loss=1.947567343711853
	Outer   | Loss=1.9467065334320068
Enter the innerloop for tec, N=6
	Inner 0 | Loss=1.7907174825668335
	Inner 1 | Loss=1.7933412790298462
	Inner 2 | Loss=1.7930980920791626
	Inner 3 | Loss=1.7915347814559937
	Inner 4 | Loss=1.793237566947937
	Outer   | Loss=1.7948771715164185
Enter the innerloop for dailydialog, N=7
	Inner 0 | Loss=1.9495872259140015
	Inner 1 | Loss=1.9476367235183716
	Inner 2 | Loss=1.9457828998565674
	Inner 3 | Loss=1.9458239078521729
	Inner 4 | Loss=1.9495996236801147
	Outer   | Loss=1.9501999616622925
Enter th

# ProtoFoMAML Mock-up

In [6]:
import torch.nn as nn
import torch.optim as optim

from data.utils.sampling import dataset_sampler

# Episode hyper-parameters for the ProtoFoMAML mock-up.
# NOTE(review): in the cells visible below only `k` is read; the number of
# inner updates is set separately through `G` - confirm whether `n_inner` /
# `n_outer` are still needed here.
k = 4 # Number of shots
n_inner = 5 # Number of inner loop updates
n_outer = 10 # Number of outer loop updates before meta update

In [7]:
# Step 1
# Set-up model + training
# A fresh MetaBert replaces the `model` used in the MAML section, so any
# optimizer built over the previous instance no longer points at this one.

model = MetaBert(config)

# Meta (outer-loop) optimizer over the initial parameters M_init
meta_optimizer = optim.AdamW(model.parameters(), lr=1e-3)

# Loss used for both the support and the query batches
lossfn = nn.CrossEntropyLoss()


In [8]:
# Load the data and embed
# This is with M_init

# Sample a task (dataset name) with the 'sqrt' sampling method
source_name = dataset_sampler(unified, sampling_method='sqrt')

# Get task dataloaders
# `trainloader` is consumed with next() in the following cells;
# `testloader` is not used in the visible cells.
trainloader, testloader = unified.get_dataloader(source_name, k=k, tokenizer=tokenizer, shuffle=True)


In [9]:
# Step 2
# Generate separate model
from copy import deepcopy 

# Copy M_init to task-specific model
# deepcopy gives the episode model its own parameter tensors, so updating it
# leaves `model` (M_init) untouched.
model_episode = deepcopy(model)
model_episode.zero_grad()

# Embed with init_model
# The support batch goes through `model` (not the copy) so that the
# prototypes derived from `y_init` stay connected to M_init's graph.
batch = next(trainloader)
labels, text, attn_mask = batch

y_init = model(text, attn_mask)

In [10]:
# Step 3
# Generate prototypes and weight/biases from these
import torch

# One prototype per class: the mean init-model embedding of the support
# examples carrying that label. Class ids are the positions 0..N-1 of the
# label_map keys.
# NOTE: a class with no example in the batch yields a NaN prototype
# (mean over an empty selection).
prototypes = torch.stack([torch.mean(y_init[labels == i], dim=0) for i, _ in enumerate(unified.label_map[source_name].keys())])

# Squared-Euclidean prototype classifier written as a linear layer:
#   -||x - c_k||^2 = 2 c_k . x - ||c_k||^2 - ||x||^2
# The ||x||^2 term is identical for every class and drops out of the softmax,
# leaving W_k = 2 c_k and b_k = -||c_k||^2 (squared norm, not the plain norm).
W = 2 * prototypes
b = -prototypes.pow(2).sum(dim=1)

In [11]:
# Step 4
# Set the weights and biases
# detach() alone returns a tensor sharing storage with W / b, so the in-place
# SGD updates on the output parameters would silently overwrite W and b as
# well. Cloning gives the trainable leaves their own memory.
W_output = W.detach().clone().requires_grad_(True)
b_output = b.detach().clone().requires_grad_(True)

# Optimizer for the task-specific output layer only
output_optimizer = optim.SGD([W_output, b_output], lr=1e-1)

In [12]:
# Step 5
# Episodic training
# Repeat G times
import torch.nn.functional as F

G = 5

# Trainable parameters of the episode copy, and an optimizer that actually
# owns them. The `task_optimizer` from the MAML section was built over the
# parameters of an earlier model instance, so stepping it never updated
# `model_episode`.
updateable_episode_params = [param for param in model_episode.parameters() if param.requires_grad]
episode_optimizer = optim.SGD(updateable_episode_params, lr=1e-1)

for _ in range(G):
    # Load data
    batch = next(trainloader)
    labels, text, attn_mask = batch

    # Embed, encode, classify and compute loss
    y = model_episode(text, attn_mask)
    logits = F.linear(y, W_output, b_output)
    loss = lossfn(logits, labels)

    # Backprop the output parameters
    # Retain graph for shared parameters
    W_output.grad, b_output.grad = torch.autograd.grad(loss, [W_output, b_output], retain_graph=True)

    # Backprop on shared parameters here
    episode_grads = torch.autograd.grad(loss, updateable_episode_params)

    for param, grad in zip(updateable_episode_params, episode_grads):
        param.grad = grad

    # Update the parameters
    output_optimizer.step()
    episode_optimizer.step()

    output_optimizer.zero_grad()
    episode_optimizer.zero_grad()

In [13]:
# Step 6
# Unify the graphs
# The adapted output layer is rewritten as "prototype-derived value + constant
# offset": the value equals the adapted parameters, while gradients flow back
# through W and b only.
W_delta = (W_output - W).detach()
b_delta = (b_output - b).detach()

W_output = W + W_delta
b_output = b + b_delta

In [14]:
# Step 7
# Evaluate on query
batch = next(trainloader)
labels, text, attn_mask = batch

y = model_episode(text, attn_mask)
logits = F.linear(y, W_output, b_output)
loss = lossfn(logits, labels)

# Trainable parameters of the task-specific copy and of the initial model
updateable_episode_params = list(filter(lambda p: p.requires_grad, model_episode.parameters()))
updateable_model_params = list(filter(lambda p: p.requires_grad, model.parameters()))

# Backprop on task-specific parameters here
# (graph is kept alive for the second pass below)
episode_grads = torch.autograd.grad(loss, updateable_episode_params, retain_graph=True)

# Backprop on initial parameters here
model_grads = torch.autograd.grad(loss, updateable_model_params)


In [15]:
# Step 8
# Sum the gradients
# Each initial parameter receives the gradient from the episode copy plus the
# gradient that reached it through the prototypes; both are accumulated on
# top of whatever earlier tasks already left in .grad.
for param, g_episode, g_init in zip(updateable_model_params, episode_grads, model_grads):
    total_grad = g_episode + g_init
    # Identity check: `== None` on a tensor is a comparison, not a None test
    if param.grad is None:
        param.grad = total_grad
    else:
        param.grad += total_grad

In [16]:
# Outerloop update
# Update after a number of tasks
# Applies the gradients accumulated on the initial model's parameters, then
# clears them so the next batch of tasks starts from zero.
meta_optimizer.step()
meta_optimizer.zero_grad()

## Same but with nn.Linear
**TODO: ask Phillip how to initialise an `nn.Linear` layer from the prototypes while keeping it part of the compute graph.**

In [17]:
# Copy to clf layer
# First attempt, kept as a demonstration: an in-place copy into parameters
# that require grad is rejected by autograd when done outside torch.no_grad().
# The error is caught and displayed so the notebook still runs top to bottom.
clf = nn.Linear(model.out_dim, len(unified.label_map[source_name].keys()))
try:
    clf.weight.copy_(W.detach())
    clf.bias.copy_(b.detach())
except RuntimeError as err:
    print(f"{type(err).__name__}: {err}")

RuntimeError: a leaf Variable that requires grad is being used in an in-place operation.

In [None]:
# Fresh linear head with one output per class of the sampled task
n_labels = len(unified.label_map[source_name].keys())
clf = nn.Linear(model.out_dim, n_labels)

# Load the prototype-derived values in place; no_grad keeps autograd from
# rejecting the in-place write into leaf parameters.
with torch.no_grad():
    for target, source in ((clf.weight, W), (clf.bias, b)):
        target.copy_(source)

In [66]:
# Second attempt, kept as a demonstration: nn.Module only accepts an
# nn.Parameter (or None) for a registered parameter name, so a graph-connected
# tensor cannot be assigned to `clf.weight`. Keeping the head differentiable
# w.r.t. the prototypes needs a functional call such as F.linear instead.
# NOTE(review): W is already 2 * prototypes, so `2 * W` doubles it again -
# confirm whether plain `W` was meant.
try:
    clf.weight = 2 * W + (clf.weight - 2 * W).detach()
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: cannot assign 'torch.FloatTensor' as parameter 'weight' (torch.nn.Parameter or None expected)

In [97]:
# Logits to predictions
query_logits = F.linear(y, W_output, b_output)
probs = F.softmax(query_logits, dim=-1)
preds = probs.argmax(dim=1)

# Accuracy
acc = preds.eq(labels).float().mean()

# Confusion matrix (rows: predicted class, columns: true class)
n_cls = probs.size(-1)
conf_mat = torch.zeros(n_cls, n_cls)
for pred_idx, true_idx in zip(preds, labels):
    conf_mat[pred_idx, true_idx] += 1

# Precision / Recall
# Row sums count predictions per class, column sums count true examples per
# class; 0/0 entries are mapped to 0 by nan_to_num.
true_pos = conf_mat.diagonal()
precision = torch.nan_to_num(true_pos / conf_mat.sum(dim=1))
recall = torch.nan_to_num(true_pos / conf_mat.sum(dim=0))

# F1
F1 = torch.nan_to_num(2 * (precision * recall) / (precision + recall))

# Macro agg
F1_macro = F1.mean()
F1_std = F1.std()

# Micro agg
