In [1]:
from copy import deepcopy 

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, get_constant_schedule_with_warmup

from models.seqtransformer import SeqTransformer
from data.unified_emotion import unified_emotion
from data.utils.sampling import dataset_sampler
from data.utils.tokenizer import manual_tokenizer
from utils.metrics import logging_metrics

###############################################
### HANDLE WITH ARGPARSER #####################
###############################################

###
### Model definition hyperparameters 
###
# Pre-trained model name (from Huggingface)
#bert_name = 'bert-base-uncased'
encoder_name = 'vinai/bertweet-base'
# Max to layer keep frozen. 11 keeps model frozen, -1 makes BERT totally trainable
nu = 10

# MLP hidden layers
hidden_dims = [512, 256]
# Which activation to use. Currently either tanh or ReLU
act_fn = 'tanh'

# Emulate config file
config = {'encoder_name': encoder_name, 
'nu': nu,
'hidden_dims': hidden_dims,
'act_fn': act_fn
}

###
### Meta-training definition hyper parameters 
###
k = 4 # Number of shots
n_inner = 7 # Number of inner loop updates
n_outer = 1 # Number of outer loop updates before meta update
n_episodes = 100

###
### Optimizer hyper parameters 
###
meta_lr = 1e-3
inner_lr = 1e-3
output_lr = 1e-3
warm_up_steps = 25

In [2]:
# Training only on grounded_emotions
excluded_tasks = ['dailydialog', 'crowdflower', 'tales-emotion', 'tec', 'emoint', 'fb-valence-arousal-anon', 'emobank', 'affectivetext', 'emotion-cause', 'electoraltweets', 'ssec', 'tales-emotions']

dataset = unified_emotion("./data/datasets/unified-dataset.jsonl", exclude=excluded_tasks)
dataset.prep(text_tokenizer=manual_tokenizer)

# Initialization and all that jazz

model = SeqTransformer(config)
tokenizer = AutoTokenizer.from_pretrained(encoder_name)

shared_optimizer = optim.SGD(model.parameters(), lr=meta_lr)
shared_lr_schedule = get_constant_schedule_with_warmup(shared_optimizer, warm_up_steps)

lossfn = nn.CrossEntropyLoss()


Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [3]:
print(model)

SeqTransformer(
  (encoder): TransformerEncoder(
    (model): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(64001, 768, padding_idx=1)
        (position_embeddings): Embedding(130, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense): Linear(in_features=76

In [19]:
source_name = dataset_sampler(dataset, sampling_method='sqrt')
support_loader, query_loader = dataset.get_dataloader(source_name, k=k, tokenizer=tokenizer, shuffle=True)
batch = next(support_loader)
labels, text, attn_mask = batch

query_batch = next(query_loader)
q_labels, q_text, q_attn_mask = query_batch

for i in range(n_episodes):
    
    for ii in range(n_outer):
        
        ##################
        # Step 1         #
        # Sample episode #
        ################## 
        # Sample a dataset
        source_name = dataset_sampler(dataset, sampling_method='sqrt')
        support_loader, query_loader = dataset.get_dataloader(source_name, k=k, tokenizer=tokenizer, shuffle=True)

        # Sample support set
        batch = next(query_loader)
        labels, text, attn_mask = batch

        ########################
        # Step 2               #
        # Copy into task-model #
        ######################## 
        # Copy shared model to task-specific model
        model_task = deepcopy(model)

        # Inner loop optimizer
        task_optimizer = optim.SGD(model_task.parameters(), lr=inner_lr)    

        #####################
        # Step 3            #
        # Prototype weights #
        #####################        
        # Embed with init_model
        y = model(text, attn_mask)

        # Generate initial classification weights and biases
        prototypes = torch.stack([torch.mean(y[labels == i], dim=0) for i in dataset.label_map[source_name].values()])

        W = 2 * prototypes
        b = -torch.norm(prototypes, p=2, dim=1)

        ######################
        # Step 4             #
        # Initialize out clf #
        ###################### 
        # Separate graph
        W_task, b_task = W.detach().clone(), b.detach().clone()
        W_task.requires_grad, b_task.requires_grad = True, True

        # Classifier optimizer
        output_optimizer = optim.SGD([W_task, b_task], lr=output_lr)

        ##############
        # Step 5     #
        # Inner loop #
        ############## 

        model.train()
        model_task.train()

        for iii in range(n_inner):
            # Load data
            #batch = next(support_loader)
            #labels, text, attn_mask = batch

            # Embed, encode, classify and compute loss
            y = model_task(text, attn_mask)
            logits = F.linear(y, W_task, b_task)
            loss = lossfn(logits, labels)

            # Backprop the output parameters
            # Retrain graph for shared parameters
            W_task.grad, b_task.grad = torch.autograd.grad(loss, [W_task, b_task], retain_graph=True)

            # Calculate the gradients on shared parameters here    
            updateable_task_params = [param for param in model_task.parameters() if param.requires_grad]
            task_grads = torch.autograd.grad(loss, updateable_task_params)

            # Store task-specific gradients
            for param, grad in zip(updateable_task_params, task_grads):
                param.grad = grad
            updateable_task_params = None

            # Update the parameters
            output_optimizer.step()
            task_optimizer.step()

            output_optimizer.zero_grad()
            task_optimizer.zero_grad()

            print("Episode {} | Task {}/{}, inner {} | Loss {:.4E}".format(i, ii+1, n_outer, iii, loss.detach().item()))
        
        #############
        # Step 6    #
        # Re-attach #
        ############# 
        W_task = 2 * prototypes + (W_task - 2 * prototypes).detach()
        b_task = -torch.norm(prototypes, p=2, dim=1) + (b_task + torch.norm(prototypes, p=2, dim=1)).detach()

        ########################
        # Step 7               #
        # Outer loop gradients #
        ########################
        model.eval()
        model_task.eval()

        # Load Query
        #query_batch = next(query_loader)
        #q_labels, q_text, q_attn_mask = query_batch

        # Push data through task-specific model
        y = model_task(q_text, q_attn_mask)
        logits = F.linear(y, W_task, b_task)
        loss = lossfn(logits, q_labels)

        # Calculate gradients for task-specific parameters    
        updateable_task_params = [param for param in model_task.parameters() if param.requires_grad]
        task_grads = torch.autograd.grad(loss, updateable_task_params, retain_graph=True)
        updateable_task_params = None

        # Calculate gradients for shared model parameters    
        updateable_model_params = [param for param in model.parameters() if param.requires_grad]
        model_grads = torch.autograd.grad(loss, updateable_model_params)

        ########################
        # Step 8               #
        # Accumulate gradients #
        ######################## 
        for param, g_task, g_init in zip(updateable_model_params, task_grads, model_grads):
            if param.grad == None:
                param.grad = g_task + g_init
            else:
                param.grad += g_task + g_init
        updateable_model_params = None

        # TODO: add logging for outer loop updates here
        n_classes = len(dataset.label_map[source_name].keys())
        mets = logging_metrics(logits.detach().cpu(), labels.detach().cpu())
        print("Episode {} | Task {}/{}: {:<20s}, N={} | Loss {:.4E}, Acc {:5.2f}, F1 {:5.2f}".format(i, ii+1, n_outer, source_name, n_classes, loss.detach().item(), mets['acc']*100, mets['f1']*100))

    print("Episode {} finished.\n".format(i))

    #####################
    # Step 9            #
    # Outer loop update #
    ##################### 

    shared_optimizer.step()
    shared_lr_schedule.step()
    
    shared_optimizer.zero_grad()


 , N=2 | Loss 6.9099E-01, Acc 75.00, F1 73.33
Episode 55 finished.

Episode 56 | Task 1/1, inner 0 | Loss 6.7893E-01
Episode 56 | Task 1/1, inner 1 | Loss 6.8359E-01
Episode 56 | Task 1/1, inner 2 | Loss 6.8054E-01
Episode 56 | Task 1/1, inner 3 | Loss 6.8167E-01
Episode 56 | Task 1/1, inner 4 | Loss 6.8424E-01
Episode 56 | Task 1/1, inner 5 | Loss 6.8238E-01
Episode 56 | Task 1/1, inner 6 | Loss 6.8242E-01
Episode 56 | Task 1/1: grounded_emotions   , N=2 | Loss 6.9180E-01, Acc 62.50, F1 56.36
Episode 56 finished.

Episode 57 | Task 1/1, inner 0 | Loss 6.8721E-01
Episode 57 | Task 1/1, inner 1 | Loss 6.8468E-01
Episode 57 | Task 1/1, inner 2 | Loss 6.8599E-01
Episode 57 | Task 1/1, inner 3 | Loss 6.8688E-01
Episode 57 | Task 1/1, inner 4 | Loss 6.8759E-01
Episode 57 | Task 1/1, inner 5 | Loss 6.8682E-01
Episode 57 | Task 1/1, inner 6 | Loss 6.8621E-01
Episode 57 | Task 1/1: grounded_emotions   , N=2 | Loss 6.9288E-01, Acc 62.50, F1 61.90
Episode 57 finished.

Episode 58 | Task 1/1, inn

In [9]:
for iii in range(n_inner):
    # Load data
    #batch = next(support_loader)
    #labels, text, attn_mask = batch

    # Embed, encode, classify and compute loss
    y = model_task(text, attn_mask)
    logits = F.linear(y, W_task, b_task)
    loss = lossfn(logits, labels)

    # Backprop the output parameters
    # Retrain graph for shared parameters
    W_task.grad, b_task.grad = torch.autograd.grad(loss, [W_task, b_task], retain_graph=True)

    # Calculate the gradients on shared parameters here    
    updateable_task_params = [param for param in model_task.parameters() if param.requires_grad]
    task_grads = torch.autograd.grad(loss, updateable_task_params)

KeyboardInterrupt: 

In [18]:
y[labels == torch.unique(labels).unsqueeze(1)]

IndexError: The shape of the mask [2, 8] at index 0 does not match the shape of the indexed tensor [8, 1, 256] at index 0

# Class definitions

In [1]:
from copy import deepcopy 
from collections import defaultdict

import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from transformers import AutoTokenizer, get_constant_schedule_with_warmup

from models.protomaml_seqtransformer import ProtoMAMLSeqTransformer
from data.unified_emotion import unified_emotion
from data.utils.sampling import dataset_sampler
from data.utils.tokenizer import manual_tokenizer
from utils.metrics import logging_metrics

def meta_evaluate(model, dataset, tokenizer, config, k=16):
    """
    Check model performance on all datasets.
    DO NOT CALL with torch.no_grad(). THIS IS HANDLED INSIDE.

    Args:
        model: model currently being trained
        dataset: current dataset
        tokenizer (AutoTokenizer): Huggingface's tokenizer to match the model
        config (dict): training config dictionary
        k (int, optional): size of the k-shot. Defaults to 16.

    Returns:
        dict: dictionary with metrics per task

    """
    
    model.eval()
    
    task_vals = defaultdict(dict)
    
    for task in dataset.lens.keys():
        
        task_loss, task_acc, task_f1 = [], [], []
        for i in range(config['n_eval_per_task']):
            
            sample_loss, sample_acc, sample_f1 = [], [], []

            support_loader, query_loader = dataset.get_dataloader(task, k=k, tokenizer=tokenizer, shuffle=True)

            # Inner loop
            # Support set
            batch = next(support_loader)
            labels, text, attn_mask = batch

            #model.train()
            model.adapt(labels, text, attn_mask, task_name=task)

            with torch.no_grad():
                for ii in range(config['n_eval_per_support']):
                    query_batch = next(query_loader)
                    q_labels, q_text, q_attn_mask = query_batch

                    logits = model(q_text, q_attn_mask)
                    loss = model.lossfn(logits, q_labels)

                    mets = logging_metrics(logits.detach().cpu(), q_labels.detach().cpu())

                    sample_loss.append(loss.item())
                    sample_acc.append(mets['acc'] * 100)
                    sample_f1.append(mets['f1'] * 100)

            task_loss.append(np.mean(sample_loss))
            task_acc.append(np.mean(sample_acc))
            task_f1.append(np.mean(sample_f1))
            #print('Task {:}: {:}/{:} | Loss {:.4E}, Acc {:5.2f}, F1 {:5.2f}'.format(task, i+1, config['n_eval_per_task'], \
            #    task_loss[-1], task_acc[-1], task_f1[-1]))

        print(u"Eval | Task {:} | Loss {:.2E} \u00B1 {:.2E}, Acc {:5.2f} \u00B1 {:4.2f}, F1 {:5.2f} \u00B1 {:4.2f}".format(task, \
            np.mean(task_loss), np.std(task_loss), np.mean(task_acc), np.std(task_acc), np.mean(task_f1), np.std(task_f1)))
        
        task_vals['loss'][task] = np.mean(task_loss)
        task_vals['acc'][task] = np.mean(task_acc)
        task_vals['f1'][task] = np.mean(task_f1)

    return task_vals


In [2]:
###############################################
### HANDLE WITH ARGPARSER #####################
###############################################

# Emulate config file
config = {'encoder_name': 'bert-base-uncased', 
'nu': 5,
'hidden_dims': [256, 128],
'act_fn': 'Tanh',
'n_inner': 7,
'n_outer': 1,
'max_episodes': 2500,
'min_episodes': 250,
'patience': 2,
'inner_lr': 1e-3,
'output_lr': 1e-3,
'meta_lr': 1e-3,
'warm_up_steps': 250,
'n_eval_per_task': 10,
'n_eval_per_support': 1,
'checkpoint_path': './checkpoints/ProtoMAML',
'version': 'grounded_emotions_test',
'include': ['grounded_emotions'],
'k': 16,
'eval_every_n': 50
}

config['lossfn'] = nn.CrossEntropyLoss

In [3]:
# Make sure the right directory structure exists
log_dir = os.path.join(config['checkpoint_path'], config['version'])
os.makedirs(log_dir, exist_ok=True)
os.makedirs(os.path.join(log_dir, 'tensorboard'), exist_ok=True)
os.makedirs(os.path.join(log_dir, 'checkpoint'), exist_ok=True)
print(f"Saving models and logs to {log_dir}")

# Build the tensorboard writer
writer = SummaryWriter(os.path.join(log_dir, 'tensorboard'))

# Load in the data
dataset = unified_emotion("./data/datasets/unified-dataset.jsonl", include=config['include'])
dataset.prep(text_tokenizer=manual_tokenizer)

# Initialization of model
model = ProtoMAMLSeqTransformer(config)

#checkpoint_name = "ProtoMAML_episode-{:}_macrof1-{:5.2f}".format(9, 33.50)
#torch.save(model.state_dict(), './checkpoints/ProtoMAML/unified_test/checkpoint/' + checkpoint_name )
#model.load_state_dict(torch.load('./checkpoints/ProtoMAML/unified_test/checkpoint/' + checkpoint_name))

# Huggingface tokenizer
tokenizer = AutoTokenizer.from_pretrained(config['encoder_name'])

# Meta optimizers
shared_optimizer = optim.SGD(model.model_shared.parameters(), lr=config['meta_lr'])
shared_lr_schedule = get_constant_schedule_with_warmup(shared_optimizer, config['warm_up_steps'])


Saving models and logs to ./checkpoints/ProtoMAML\grounded_emotions_test


In [5]:
# Meta-evaluate prior to training for decent baseline
meta_eval = meta_evaluate(model, dataset, tokenizer, config, k=config['k'])

macro_f1 = np.mean(list(meta_eval['f1'].values()))

writer.add_scalars('Loss/MetaEval', meta_eval['loss'], 0)
writer.add_scalars('Accuracy/MetaEval', meta_eval['acc'], 0)
writer.add_scalars('F1/MetaEval', meta_eval['f1'], 0)
writer.add_scalar('MacroF1/MetaEval', 0)

best_macro_f1 = macro_f1

curr_patience = config['patience']

for episode in range(1, config['max_episodes']+1):
    ############
    # Training #
    ############
    for ii in range(config['n_outer']):

        source_name = dataset_sampler(dataset, sampling_method='sqrt')
        support_loader, query_loader = dataset.get_dataloader(source_name, k=config['k'], tokenizer=tokenizer, shuffle=True)

        # Inner loop
        # Support set
        batch = next(query_loader)
        labels, text, attn_mask = batch

        model.train()
        model.adapt(labels, text, attn_mask, task_name=source_name)

        # Outer loop
        # Query set
        query_batch = next(query_loader)
        q_labels, q_text, q_attn_mask = query_batch

        model.eval()
        logits = model(q_text, q_attn_mask)
        loss = model.lossfn(logits, q_labels)

        model.backward(loss)

        # Logging
        n_classes = len(dataset.label_map[source_name].keys())
        with torch.no_grad():
            mets = logging_metrics(logits.detach().cpu(), q_labels.detach().cpu())
        print("Train | Episode {} | Task {}/{}: {:<20s}, N={} | Loss {:.4E}, Acc {:5.2f}, F1 {:5.2f}".format(episode, ii+1, \
            config['n_outer'], source_name, n_classes, loss.detach().item(), mets['acc']*100, mets['f1']*100))

        writer.add_scalars('Loss/Train', {source_name: loss.detach().item()}, episode)
        writer.add_scalars('Accuracy/Train', {source_name: mets['acc']*100}, episode)
        writer.add_scalars('F1/Train', {source_name: mets['f1']*100}, episode)

    shared_optimizer.step()
    shared_lr_schedule.step()
    shared_optimizer.zero_grad()
    
    ##############
    # Evaluation #
    ##############
    if (episode % config['eval_every_n'])== 0:

        meta_eval = meta_evaluate(model, dataset, tokenizer, config, k=config['k'])

        macro_f1 = np.mean(list(meta_eval['f1'].values()))

        writer.add_scalars('Loss/MetaEval', meta_eval['loss'], episode)
        writer.add_scalars('Accuracy/MetaEval', meta_eval['acc'], episode)
        writer.add_scalars('F1/MetaEval', meta_eval['f1'], episode)
        writer.add_scalar('MacroF1/MetaEval', episode+1)

        if macro_f1 > best_macro_f1:
            save_name = f"episode-{episode+1}_macrof1-{macro_f1}"
            torch.save(model.state_dict(), os.path.join(log_dir, 'checkpoint', save_name))
            
            print(f"Saving model as {save_name}\nNew best macrof1={best_macro_f1}")
            best_macro_f1 = macro_f1
            curr_patience = config['patience']
        
        else:
            print(f"Model did not improve with macrof1={macro_f1}")
            if episode > config['min_episodes']:
                curr_patience -= 1

        print('')

        if curr_patience < 0:
            print("Stopping early.")
            break 

6.67
Train | Episode 13 | Task 1/1: grounded_emotions   , N=2 | Loss 6.9448E-01, Acc 50.00, F1 33.33
Train | Episode 14 | Task 1/1: grounded_emotions   , N=2 | Loss 6.8656E-01, Acc 53.12, F1 43.86
Train | Episode 15 | Task 1/1: grounded_emotions   , N=2 | Loss 7.0271E-01, Acc 50.00, F1 33.33
Train | Episode 16 | Task 1/1: grounded_emotions   , N=2 | Loss 7.0381E-01, Acc 50.00, F1 38.16
Train | Episode 17 | Task 1/1: grounded_emotions   , N=2 | Loss 6.9342E-01, Acc 50.00, F1 38.16
Train | Episode 18 | Task 1/1: grounded_emotions   , N=2 | Loss 6.9851E-01, Acc 50.00, F1 38.16
Train | Episode 19 | Task 1/1: grounded_emotions   , N=2 | Loss 7.0027E-01, Acc 46.88, F1 36.37
Train | Episode 20 | Task 1/1: grounded_emotions   , N=2 | Loss 6.9176E-01, Acc 53.12, F1 51.95
Train | Episode 21 | Task 1/1: grounded_emotions   , N=2 | Loss 6.9138E-01, Acc 59.38, F1 51.35
Train | Episode 22 | Task 1/1: grounded_emotions   , N=2 | Loss 6.8932E-01, Acc 53.12, F1 49.10
Train | Episode 23 | Task 1/1: grou

KeyboardInterrupt: 

In [7]:
source_name = dataset_sampler(dataset, sampling_method='sqrt')
_, query_loader = dataset.get_dataloader(source_name, k=k, tokenizer=tokenizer, shuffle=True)
query_batch = next(query_loader)
q_labels, q_text, q_attn_mask = query_batch

In [12]:
model.adapt(q_labels, q_text, q_attn_mask, verbose=True)

	Inner 0 | Loss 6.1790E-01
	Inner 1 | Loss 5.9264E-01
	Inner 2 | Loss 5.5335E-01
	Inner 3 | Loss 4.8858E-01
	Inner 4 | Loss 4.4266E-01
	Inner 5 | Loss 4.2068E-01
	Inner 6 | Loss 3.9739E-01


Task grounded_emotions: 0/1 | Loss 9.1800E-01, Acc 50.00, F1 33.33
Task grounded_emotions | Loss 9.18E-01 ± 0.00E+00, Acc 50.00 ± 0.00, F1 33.33 ± 0.00


defaultdict(dict,
            {'loss': {'grounded_emotions': 0.9180004954338074},
             'acc': {'grounded_emotions': 50.0},
             'f1': {'grounded_emotions': 33.33333432674408}})

In [5]:
source_name = dataset_sampler(dataset, sampling_method='sqrt')
_, query_loader = dataset.get_dataloader(source_name, k=k, tokenizer=tokenizer, shuffle=True)
query_batch = next(query_loader)
q_labels, q_text, q_attn_mask = query_batch

for episode in range(n_episodes):
    for ii in range(n_outer):

        source_name = dataset_sampler(dataset, sampling_method='sqrt')
        support_loader, query_loader = dataset.get_dataloader(source_name, k=k, tokenizer=tokenizer, shuffle=True)

        # Inner loop
        # Support set
        batch = next(query_loader)
        labels, text, attn_mask = batch

        #model.train()
        model.eval()
        model.adapt(labels, text, attn_mask, task_name=source_name, verbose=True)

        # Outer loop
        # Query set
        query_batch = next(query_loader)
        q_labels, q_text, q_attn_mask = query_batch

        model.eval()
        logits = model(q_text, q_attn_mask)
        loss = model.lossfn(logits, q_labels)

        model.backward(loss)

        # Logging
        n_classes = len(dataset.label_map[source_name].keys())
        with torch.no_grad():
            mets = logging_metrics(logits.detach().cpu(), q_labels.detach().cpu())
        print("Episode {} | Task {}/{}: {:<20s}, N={} | Loss {:.4E}, Acc {:5.2f}, F1 {:5.2f}".format(episode, ii+1, \
            n_outer,source_name, n_classes, loss.detach().item(), mets['acc']*100, mets['f1']*100))

        shared_optimizer.step()
        shared_lr_schedule.step()
        shared_optimizer.zero_grad()

ss 1.6555E-01
	Inner 6 | Loss 1.1413E-01
Episode 178 | Task 1/1: grounded_emotions   , N=2 | Loss 1.4232E-02, Acc 100.00, F1 100.00
	Inner 0 | Loss 1.1620E+00
	Inner 1 | Loss 9.6685E-01
	Inner 2 | Loss 9.2296E-01
	Inner 3 | Loss 5.0415E-01
	Inner 4 | Loss 4.3550E-01
	Inner 5 | Loss 3.9371E-01
	Inner 6 | Loss 3.5230E-01
Episode 179 | Task 1/1: grounded_emotions   , N=2 | Loss 3.3417E-02, Acc 100.00, F1 100.00
	Inner 0 | Loss 1.9479E+00
	Inner 1 | Loss 1.9334E+00
	Inner 2 | Loss 6.9830E+00
	Inner 3 | Loss 5.2546E+00
	Inner 4 | Loss 3.2245E+00
	Inner 5 | Loss 6.7004E-01
	Inner 6 | Loss 4.0514E-01
Episode 180 | Task 1/1: grounded_emotions   , N=2 | Loss 6.4351E-02, Acc 96.88, F1 96.87
	Inner 0 | Loss 1.9669E+00
	Inner 1 | Loss 3.3779E+00
	Inner 2 | Loss 2.3926E+00
	Inner 3 | Loss 8.3337E-01
	Inner 4 | Loss 6.4106E-01
	Inner 5 | Loss 5.0155E-01
	Inner 6 | Loss 4.2214E-01
Episode 181 | Task 1/1: grounded_emotions   , N=2 | Loss 1.4522E-02, Acc 100.00, F1 100.00
	Inner 0 | Loss 1.3844E+00
	In

In [36]:
meta_eval = meta_evaluate(model, dataset, tokenizer, config, k=16)

Task grounded_emotions: 0/10 | Loss 7.7724E-01, Acc 66.88, F1 66.34
Task grounded_emotions: 1/10 | Loss 6.2712E-01, Acc 64.38, F1 64.09
Task grounded_emotions: 2/10 | Loss 7.2218E-01, Acc 60.62, F1 58.37
Task grounded_emotions: 3/10 | Loss 6.9894E-01, Acc 64.38, F1 63.07
Task grounded_emotions: 4/10 | Loss 5.9582E-01, Acc 67.50, F1 67.05
Task grounded_emotions: 5/10 | Loss 5.8342E-01, Acc 70.62, F1 70.38
Task grounded_emotions: 6/10 | Loss 6.7413E-01, Acc 66.25, F1 66.04
Task grounded_emotions: 7/10 | Loss 6.7213E-01, Acc 66.88, F1 66.77
Task grounded_emotions: 8/10 | Loss 5.9813E-01, Acc 72.50, F1 72.30
Task grounded_emotions: 9/10 | Loss 5.9930E-01, Acc 68.12, F1 67.95
Task grounded_emotions | Loss 6.55E-01 ± 6.15E-02, Acc 66.81 ± 3.16, F1 66.24 ± 3.67


In [56]:
import os
import argparse
from collections import defaultdict

import numpy as np


# Train

checkpoint_path = './checkpoints/ProtoMAML/unified_test'

# Create the correct directory structure
os.makedirs(checkpoint_path, exist_ok=True)
os.makedirs(os.join.path(checkpoint_path, 'tensorboard'), exist_ok=True)
os.makedirs(os.join.path(checkpoint_path, 'checkpoint'), exist_ok=True)

# Start the tensorboard logger
writer = SummaryWriter(os.join.path(checkpoint_path, 'tensorboard'))

macro_f1 = np.mean(list(meta_eval['f1'].values()))

writer.add_scalars('Loss/MetaEval', meta_eval['loss'], i)
writer.add_scalars('Accuracy/MetaEval', meta_eval['acc'], i)
writer.add_scalars('F1/MetaEval', meta_eval['f1'], i)
writer.add_scalar('MacroF1/MetaEval', , i)

writer.close()

33.50312037229538

<All keys matched successfully>

In [None]:
model

In [82]:
import re
nu = 10

for p in model.model_shared.named_parameters():
    transformer_layer = re.search("(?:encoder\.layer\.)([0-9]+)", p[0])
    if transformer_layer and (int(transformer_layer.group(1)) > nu):
        print(p[0], p[1].requires_grad)
    elif 'pooler' in p[0]:
        print(p[0], p[1].requires_grad)

encoder.model.encoder.layer.11.attention.self.query.weight False
encoder.model.encoder.layer.11.attention.self.query.bias False
encoder.model.encoder.layer.11.attention.self.key.weight False
encoder.model.encoder.layer.11.attention.self.key.bias False
encoder.model.encoder.layer.11.attention.self.value.weight False
encoder.model.encoder.layer.11.attention.self.value.bias False
encoder.model.encoder.layer.11.attention.output.dense.weight False
encoder.model.encoder.layer.11.attention.output.dense.bias False
encoder.model.encoder.layer.11.attention.output.LayerNorm.weight False
encoder.model.encoder.layer.11.attention.output.LayerNorm.bias False
encoder.model.encoder.layer.11.intermediate.dense.weight False
encoder.model.encoder.layer.11.intermediate.dense.bias False
encoder.model.encoder.layer.11.output.dense.weight False
encoder.model.encoder.layer.11.output.dense.bias False
encoder.model.encoder.layer.11.output.LayerNorm.weight False
encoder.model.encoder.layer.11.output.LayerNorm.bia

In [5]:
for p in model.model_shared.named_parameters():
    print(p[0], p[1].requires_grad)

encoder.model.embeddings.word_embeddings.weight False
encoder.model.embeddings.position_embeddings.weight False
encoder.model.embeddings.token_type_embeddings.weight False
encoder.model.embeddings.LayerNorm.weight False
encoder.model.embeddings.LayerNorm.bias False
encoder.model.encoder.layer.0.attention.self.query.weight False
encoder.model.encoder.layer.0.attention.self.query.bias False
encoder.model.encoder.layer.0.attention.self.key.weight False
encoder.model.encoder.layer.0.attention.self.key.bias False
encoder.model.encoder.layer.0.attention.self.value.weight False
encoder.model.encoder.layer.0.attention.self.value.bias False
encoder.model.encoder.layer.0.attention.output.dense.weight False
encoder.model.encoder.layer.0.attention.output.dense.bias False
encoder.model.encoder.layer.0.attention.output.LayerNorm.weight False
encoder.model.encoder.layer.0.attention.output.LayerNorm.bias False
encoder.model.encoder.layer.0.intermediate.dense.weight False
encoder.model.encoder.layer.0.

In [21]:
protomaml_model = ProtoMAMLSeqTransformer(config)

In [22]:
y = protomaml_model(text, attn_mask)

In [23]:
[torch.mean(y[labels == i], dim=0) for i in dataset.label_map[source_name].values()]

[tensor([-0.0880, -0.0837, -0.0354,  0.0504, -0.0133, -0.0463,  0.0563,  0.0804,
         -0.0108, -0.0040,  0.1059,  0.0320, -0.0608, -0.0685, -0.0003,  0.0337,
          0.0723,  0.0244, -0.0407,  0.0020, -0.0197,  0.0062, -0.0267, -0.0110,
          0.0636, -0.0142, -0.0067, -0.0275, -0.0701,  0.0087,  0.0288,  0.0049,
          0.0486, -0.0030,  0.0411, -0.0301, -0.0305, -0.0353, -0.0668, -0.0309,
         -0.0090,  0.0304,  0.0208, -0.0100, -0.0608,  0.0575, -0.0426,  0.0170,
         -0.0272,  0.0239,  0.0743, -0.0018, -0.0092,  0.0623,  0.0986,  0.0686,
         -0.0170, -0.0061, -0.0131,  0.0352,  0.0503, -0.0456,  0.0461, -0.0134,
          0.0292,  0.0660,  0.0106,  0.0080,  0.0227, -0.0028,  0.0255, -0.0472,
          0.0935,  0.0673, -0.1418,  0.0963,  0.0099,  0.0062,  0.0724,  0.0183,
          0.0222, -0.1021, -0.0392,  0.0616,  0.0392,  0.0497, -0.0222,  0.0508,
         -0.0213, -0.0316, -0.0310, -0.0713,  0.0098, -0.0403,  0.0167, -0.0409,
          0.0767, -0.0660,  