## INITIALIZE LOGGER

In [6]:
import os
import wandb

# SECURITY: never hardcode the W&B API key in the notebook. The key that used to
# be committed here must be treated as leaked and revoked in the W&B settings.
# The key is now read from the WANDB_API_KEY environment variable; when it is
# unset, `key=None` lets wandb fall back to its own credential lookup.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## IMPORTS

In [5]:
import os
import json
import math
from collections import OrderedDict
import torch
from torch import nn, Tensor
from typing import Union, Tuple, List, Iterable, Dict
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.optim import AdamW
from torch.utils.data import DataLoader
import numpy as np
import gzip, csv
import pandas as pd
from tqdm.auto import tqdm
import torch.nn.init as init
from sklearn.metrics import f1_score
import random
# Seed every RNG this notebook draws from so runs are repeatable.
# Python's `random` is seeded too: the model's forward pass calls
# random.random() to decide when to blank a sample's context.
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import torch
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from torch import nn, Tensor
import torch.nn.functional as F
from torch.utils.data import Dataset
from avalanche.benchmarks.utils import AvalancheDataset
from avalanche.benchmarks.generators import nc_benchmark
from transformers import BertTokenizer
from datasets import load_dataset
import re
from avalanche.benchmarks.classic import SplitMNIST
from avalanche.models import MTSimpleMLP
from avalanche.training.supervised import EWC
from avalanche.evaluation.metrics import forgetting_metrics, accuracy_metrics
from avalanche.logging import InteractiveLogger
from avalanche.training.plugins import EvaluationPlugin
from avalanche.evaluation.metrics import forgetting_metrics, accuracy_metrics, \
    loss_metrics, timing_metrics, cpu_usage_metrics, confusion_matrix_metrics, disk_usage_metrics
from avalanche.training.templates import SupervisedTemplate
from avalanche.training.plugins import ReplayPlugin, EWCPlugin
from avalanche.logging import WandBLogger
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

ModuleNotFoundError: No module named 'avalanche'

## ACTIVATION FUNCTION

In [9]:
def gelu(x):
    """Exact (erf-based) GELU activation: x * Phi(x), with Phi the standard normal CDF.

    Args:
        x (torch.Tensor): Input tensor of any shape.

    Returns:
        torch.Tensor: Tensor of the same shape as ``x``.
    """
    half_x = x * 0.5
    erf_term = torch.erf(x / math.sqrt(2.0))
    return half_x * (1.0 + erf_term)


## POSITIONAL ENCODING LAYER

In [10]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to embeddings, then applies dropout.

    The encoding table is precomputed for ``max_len`` positions and stored as the
    non-trainable buffer ``pe`` of shape ``(1, max_len, embed_dim)``. Even feature
    indices hold sines and odd feature indices hold cosines.
    """

    def __init__(self, embed_dim: int, drop_rate=0.1, max_len=5000):
        """
        Args:
            embed_dim: Size of the embedding (last) dimension. May be odd.
            drop_rate: Dropout probability applied after adding the encodings.
            max_len: Number of positions to precompute.
        """
        super().__init__()
        self.dropout = nn.Dropout(p=drop_rate)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim))
        pe = torch.zeros(1, max_len, embed_dim)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        # An odd embed_dim has one fewer odd-indexed (cosine) column than
        # even-indexed (sine) columns, so trim the frequencies to fit. For an
        # even embed_dim the slice is a no-op.
        pe[0, :, 1::2] = torch.cos(position * div_term[: embed_dim // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [batch_size, seq_len, embedding_dim]

        Returns:
            torch.Tensor: ``x`` plus the encodings of the first ``seq_len``
                positions (broadcast over the batch), passed through dropout.
                Same shape as ``x``.
        """
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

## ATTENTION MECHANISM

In [11]:
def scaled_dot_product(q, k, v, attn_drop_rate=0.1, training=True):
    """Scaled dot-product attention with dropout on the attention weights.

    Args:
      q: query, shape: (batch, # heads, query len, head dimension)
      k: keys, shape: (batch, # heads, key len, head dimension)
      v: value, shape: (batch, # heads, key len, head dimension)
      attn_drop_rate: probability of an attention weight being zeroed.
      training: whether dropout is active. Defaults to True, which matches the
        previous behaviour (dropout always applied). Pass the calling module's
        ``self.training`` to disable dropout at evaluation time.

    Returns:
        torch.Tensor: Attention output,
           shape: (batch, # heads, query len, head dimension).

    Note:
        No masking is applied: every query attends to every key, including any
        padding positions.
    """
    d_k = q.shape[-1]
    attn_logits = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
    attention = F.softmax(attn_logits, dim=-1)
    # F.dropout defaults to training=True, so the flag must be forwarded explicitly.
    attention = F.dropout(attention, p=attn_drop_rate, training=training)
    return torch.matmul(attention, v)

### MULTI HEAD SELF ATTENTION

In [12]:
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention: queries, keys and values all come from one input."""

    def __init__(self, embed_dim, n_heads, attn_drop_rate):
        """
        Args:
            embed_dim: Size of the embedding dimension; must be divisible by ``n_heads``.
            n_heads: Number of attention heads.
            attn_drop_rate: Dropout probability applied to the attention weights.

        Raises:
            ValueError: If ``embed_dim`` is not divisible by ``n_heads``.
        """
        super().__init__()
        # Fail early: with a remainder, n_heads * head_dim != embed_dim and
        # merge_heads could never reshape back to embed_dim.
        if embed_dim % n_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by n_heads ({n_heads})"
            )
        self.embed_dim = embed_dim
        self.n_heads = n_heads
        self.head_dim = embed_dim // n_heads
        self.attn_drop_rate = attn_drop_rate
        self.query = nn.Linear(self.embed_dim, self.n_heads * self.head_dim)
        self.key = nn.Linear(self.embed_dim, self.n_heads * self.head_dim)
        self.value = nn.Linear(self.embed_dim, self.n_heads * self.head_dim)
        self.o_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim)
        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-uniform initialise every projection weight and zero every bias."""
        for layer in (self.query, self.key, self.value, self.o_proj):
            nn.init.xavier_uniform_(layer.weight)
            layer.bias.data.fill_(0)

    def split_heads(self, tensor):
        """Reshape (batch, seq, embed_dim) into (batch, n_heads, seq, head_dim)."""
        new_shape = tensor.size()[:-1] + (self.n_heads, self.head_dim)
        tensor = tensor.view(*new_shape)
        return tensor.permute(0, 2, 1, 3).contiguous()

    def merge_heads(self, tensor, batch_size, seq_length):
        """Inverse of ``split_heads``: back to (batch_size, seq_length, embed_dim)."""
        return tensor.transpose(1, 2).contiguous().view(batch_size, seq_length, self.embed_dim)

    def forward(self, embedding):
        """
        Args:
            embedding (torch.Tensor):
                Input embeddings of shape (batch_size, seq_length, embed_dim).

        Returns:
            torch.Tensor:
                Attended embeddings of shape (batch_size, seq_length, embed_dim).
        """
        batch_size, seq_length, _ = embedding.size()
        q = self.split_heads(self.query(embedding))
        k = self.split_heads(self.key(embedding))
        v = self.split_heads(self.value(embedding))
        # scaled_dot_product applies dropout unconditionally, so pass a zero
        # rate outside training to keep evaluation deterministic.
        drop_rate = self.attn_drop_rate if self.training else 0.0
        values = scaled_dot_product(q, k, v, drop_rate)
        values = self.merge_heads(values, batch_size, seq_length)
        return self.o_proj(values)

### MULTI HEAD CROSS ATTENTION

In [13]:
class MultiHeadCrossAttention(nn.Module):
    """Multi-head cross-attention: queries come from ``x``, keys/values from ``catx``."""

    def __init__(self, embed_dim, n_heads, attn_drop_rate):
        """
        Args:
            embed_dim: Size of the embedding dimension; must be divisible by ``n_heads``.
            n_heads: Number of attention heads.
            attn_drop_rate: Dropout probability applied to the attention weights.

        Raises:
            ValueError: If ``embed_dim`` is not divisible by ``n_heads``.
        """
        super().__init__()
        if embed_dim % n_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by n_heads ({n_heads})"
            )
        self.embed_dim = embed_dim
        self.n_heads = n_heads
        self.head_dim = embed_dim // n_heads
        self.key = nn.Linear(self.embed_dim, self.n_heads * self.head_dim)
        self.value = nn.Linear(self.embed_dim, self.n_heads * self.head_dim)
        self.query = nn.Linear(self.embed_dim, embed_dim)
        self.linear_layer = nn.Linear(embed_dim, embed_dim)
        self.attn_drop_rate = attn_drop_rate

    def _split_heads(self, tensor):
        """Reshape (batch, seq, embed_dim) into (batch, n_heads, seq, head_dim)."""
        batch_size, seq_length, _ = tensor.size()
        tensor = tensor.reshape(batch_size, seq_length, self.n_heads, self.head_dim)
        return tensor.permute(0, 2, 1, 3)

    def _merge_heads(self, tensor):
        """Inverse of ``_split_heads``: (batch, n_heads, seq, head_dim) -> (batch, seq, embed_dim)."""
        batch_size, _, seq_length, _ = tensor.size()
        # The head axis must be moved back next to head_dim before flattening.
        # Reshaping (batch, heads, seq, head_dim) directly would interleave
        # sequence positions and heads.
        tensor = tensor.permute(0, 2, 1, 3)
        return tensor.reshape(batch_size, seq_length, self.n_heads * self.head_dim)

    def forward(self, x, catx):
        """
        Args:
            x (torch.Tensor):
                Embeddings of the original sequence (used for the queries),
                shape (batch_size, query_length, embed_dim).
            catx (torch.Tensor):
                Embeddings of the original sequence with additional context
                (used for the keys and values),
                shape (batch_size, context_length, embed_dim).

        Returns:
            torch.Tensor:
                Attended embeddings of shape (batch_size, query_length, embed_dim).
        """
        q = self._split_heads(self.query(x))
        k = self._split_heads(self.key(catx))
        v = self._split_heads(self.value(catx))
        # scaled_dot_product applies dropout unconditionally, so pass a zero
        # rate outside training to keep evaluation deterministic.
        drop_rate = self.attn_drop_rate if self.training else 0.0
        values = scaled_dot_product(q, k, v, drop_rate)
        return self.linear_layer(self._merge_heads(values))

## LAYER NORMALIZATION LAYER

In [14]:
class LayerNormalization(nn.Module):
    """Layer normalisation over the trailing ``len(parameters_shape)`` dimensions.

    ``gamma`` (initialised to ones) and ``beta`` (initialised to zeros) are
    learnable and have shape ``parameters_shape``.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        """
        Args:
            inputs (Tensor): Tensor whose trailing dimensions match ``parameters_shape``.

        Returns:
            torch.Tensor: ``gamma * normalised + beta`` with the same shape as
                ``inputs``. The mean and the (biased) variance are computed
                over the normalised dimensions, with ``eps`` added to the
                variance before the square root.
        """
        # Trailing axes to reduce over: -1, -2, ... one per normalised dimension.
        reduce_dims = list(range(-1, -len(self.parameters_shape) - 1, -1))
        mean = inputs.mean(dim=reduce_dims, keepdim=True)
        centered = inputs - mean
        variance = centered.pow(2).mean(dim=reduce_dims, keepdim=True)
        normalised = centered / torch.sqrt(variance + self.eps)
        return self.gamma * normalised + self.beta

## FEEDFORWARD LAYER

In [15]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward block: expand to 4x width, GELU, dropout, project back."""

    def __init__(self, embed_dim, drop_prob=0.1):
        super().__init__()
        hidden_dim = 4 * embed_dim
        self.linear1 = nn.Linear(embed_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, embed_dim)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): Input of shape `(batch_size, sequence_length, embed_dim)`.

        Returns:
            torch.Tensor: Output with the same shape as the input.
        """
        hidden = self.dropout(self.gelu(self.linear1(x)))
        return self.linear2(hidden)

## CLASSIFIER

In [16]:
class Classifier(nn.Module):
    """Single linear layer that maps pooled features to class logits."""

    def __init__(self, input_dim, numclasses, dropout_rate=0.1):
        # `dropout_rate` is accepted but not used by this head.
        super().__init__()
        self.linear = nn.Linear(input_dim, numclasses)

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): Input features whose last dimension is `input_dim`,
                e.g. `(batch_size, input_dim)`.

        Returns:
            torch.Tensor: Logits with the last dimension replaced by `numclasses`,
                e.g. `(batch_size, numclasses)`.
        """
        return self.linear(x)

## ENCODER LAYER

In [17]:
class EncoderLayer(nn.Module):
    """Post-norm encoder block whose attention step is either cross- or self-attention.

    With ``is_first=True`` the block runs cross-attention (queries from ``x``,
    keys/values from ``catx``). Otherwise it runs self-attention over ``catx``.
    In both cases the attention output goes through dropout, a residual add and
    layer norm, followed by the feed-forward sub-layer with the same
    dropout / residual / norm pattern.
    """

    def __init__(self, embed_dim, n_heads, attn_drop_rate, layer_drop_rate):
        super().__init__()
        self.embed_dim = embed_dim
        self.n_heads = n_heads
        self.crossattention = MultiHeadCrossAttention(embed_dim, n_heads, attn_drop_rate)
        self.attention = MultiHeadAttention(embed_dim, n_heads, attn_drop_rate)
        self.norm1 = LayerNormalization(parameters_shape=[embed_dim])
        self.dropout1 = nn.Dropout(p=layer_drop_rate)
        self.ffn = PositionwiseFeedForward(embed_dim, layer_drop_rate)
        self.norm2 = LayerNormalization(parameters_shape=[embed_dim])
        self.dropout2 = nn.Dropout(p=layer_drop_rate)

    def forward(self, catx, x=None, is_first=False):
        """
        Args:
            catx (torch.Tensor): Input of shape `(batch_size, seq_length, embed_dim)`.
                Self-attention input when ``is_first`` is False; key/value
                source when ``is_first`` is True.
            x (torch.Tensor): Query-side input of shape
                `(batch_size, seq_length, embed_dim)`. Only read when
                ``is_first`` is True.
            is_first (bool): Selects cross-attention (True) or self-attention (False).

        Returns:
            torch.Tensor: Encoded representation with the same shape as the
                residual input (``x`` when ``is_first`` is True, ``catx`` otherwise).
        """
        if is_first:
            skip = x
            attended = self.crossattention(x, catx)
        else:
            skip = catx
            attended = self.attention(catx)

        # Attention sub-layer: dropout -> residual add -> layer norm.
        hidden = self.norm1(self.dropout1(attended) + skip)
        # Feed-forward sub-layer with the same dropout / residual / norm pattern.
        return self.norm2(self.dropout2(self.ffn(hidden)) + hidden)
 


## META TRANSFORMER LAYER

In [18]:
class rememBERT(nn.Module):
    """Transformer classifier that reads each sample together with a neighbouring sample.

    Every sample in a batch is prefixed with the previous sample of the same
    batch (its "context"; the first sample wraps around to the last one). The
    first encoder layer cross-attends from the sample's own tokens to the
    concatenated [context, sample] sequence; the remaining layers are
    self-attention layers. The first output position is pooled and classified.
    """

    # Probability of replacing a sample's context with zeros during training.
    CONTEXT_DROP_PROB = 0.3

    def __init__(self, n_layers, vocab_size, embed_dim, n_heads, num_classes, attn_drop_rate, layer_drop_rate, seq_len):
        """
        Args:
            n_layers: Total number of encoder layers (one cross-attention layer
                plus ``n_layers - 1`` self-attention layers).
            vocab_size: Vocabulary size; the embedding table has ``vocab_size + 1`` rows.
            embed_dim: Embedding dimension.
            n_heads: Number of attention heads per layer.
            num_classes: Number of output classes.
            attn_drop_rate: Dropout probability on attention weights.
            layer_drop_rate: Dropout probability used inside the layers and
                by the positional encoding.
            seq_len: Token length of a single sample; the sample's own tokens
                are taken from position ``seq_len`` onwards of the paired sequence.
        """
        super().__init__()
        self.embed = nn.Embedding(vocab_size+1, embed_dim)
        self.position = PositionalEncoding(embed_dim, layer_drop_rate)
        self.first_layer = EncoderLayer(embed_dim, n_heads, attn_drop_rate, layer_drop_rate)
        self.net = nn.Sequential(*[
            EncoderLayer(embed_dim, n_heads, attn_drop_rate, layer_drop_rate) for _ in range(n_layers-1)
        ])
        self.pooler = nn.Sequential(OrderedDict([
            ('dense', nn.Linear(embed_dim, embed_dim)),
            ('activation', nn.Tanh()),
        ]))
        self.classifier = Classifier(embed_dim, num_classes)
        self.saved_sample = None
        self.seq_len = seq_len

    @staticmethod
    def _pair_with_context(token_ids, context_drop_prob=0.0):
        """Prefix every row with the previous row of the batch (cyclically).

        Args:
            token_ids (torch.Tensor): Token ids of shape (batch_size, seq_len).
            context_drop_prob (float): Probability of zeroing the context of a
                row. The last row of the batch always keeps its context. When
                0, the Python RNG is not consulted.

        Returns:
            torch.Tensor: Tensor of shape (batch_size, 2 * seq_len) whose row
                ``i`` is ``[token_ids[i - 1], token_ids[i]]``.
        """
        # torch.roll returns a new tensor, so zeroing rows below never touches the input.
        context = torch.roll(token_ids, shifts=1, dims=0)
        if context_drop_prob > 0:
            # One random draw per row except the last, in batch order.
            for row in range(token_ids.size(0) - 1):
                if random.random() < context_drop_prob:
                    context[row] = 0
        return torch.cat((context, token_ids), dim=1)

    def forward(self, batch_text):
        """
        Args:
            batch_text (torch.Tensor): Batch of token ids. A singleton
                dimension 1 is squeezed away, so both `(batch_size, 1, seq_length)`
                and `(batch_size, seq_length)` are accepted.

        Returns:
            torch.Tensor: Predicted logits of shape `(batch_size, num_classes)`.

        Note:
            At evaluation time each sample is still paired with another sample
            of the same batch, so predictions depend on batch composition.
        """
        if self.training and self.saved_sample is None:
            # Keep the first sample ever seen in training (stored before squeezing).
            self.saved_sample = batch_text[:1]

        token_ids = batch_text.squeeze(1)
        drop_prob = self.CONTEXT_DROP_PROB if self.training else 0.0
        paired = self._pair_with_context(token_ids, drop_prob)
        # Follow the model's own device instead of a notebook-level global.
        paired = paired.to(device=self.embed.weight.device, dtype=torch.int64)

        embedding = self.position(self.embed(paired))
        # Second half of the paired sequence = the sample's own tokens (the queries).
        own_tokens = embedding[:, self.seq_len:, :]
        embedding = self.first_layer(embedding, own_tokens, True)

        new_embedding = self.net(embedding)
        o = self.pooler(new_embedding[:, 0])
        preds = self.classifier(o)
        return preds


## DATALOADER & BENCHMARK

### CHOOSE DATASET & DEFINE SEQUENCE LENGTH

In [None]:
# Dataset selector read by the loading and config cells: "asc", "dsc" or "news".
# (Despite the name, this is a string, not a DataFrame.)
df = "asc"
# Token length every text is padded/truncated to by the tokenizer, and the
# per-sample length the model uses to split context from sample.
seq_len = 256

In [35]:
import os
import json


def _load_domain_split(domain_dirs, split_file):
    """Load one split ('train.json' / 'test.json') from a list of domain folders.

    Each file is a JSON object whose values carry a "sentence" field. The label
    of every sentence is the index of its folder within ``domain_dirs``.

    Returns:
        tuple[list[str], list[int]]: Sentences and their integer labels.
    """
    texts, labels = [], []
    for label, domain_dir in enumerate(domain_dirs):
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(os.path.join(domain_dir, split_file), 'r', encoding='utf-8') as file:
            records = json.load(file)
        sentences = [record["sentence"] for record in records.values()]
        texts += sentences
        labels += [label] * len(sentences)
    return texts, labels


if df == 'dsc':
    directory = '/kaggle/input/dsc-dataset/dat/dsc/'
    categories = ['Kindle_Store', 'Movies_and_TV', 'Musical_Instruments', 'Office_Products', 'Patio_Lawn_and_Garden', 'Pet_Supplies', 'Sports_and_Outdoors', 'Tools_and_Home_Improvement', 'Toys_and_Games', 'Video_Games']

    domain_dirs = [os.path.join(directory, category) for category in categories]
    train_text, train_labels = _load_domain_split(domain_dirs, 'train.json')
    test_text, test_labels = _load_domain_split(domain_dirs, 'test.json')
elif df == "news":
    dataset = load_dataset("setfit/20_newsgroups")
    train_text = dataset['train']['text']
    train_labels = dataset['train']['label']
    test_text = dataset['test']['text']
    test_labels = dataset['test']['label']
elif df == "asc":
    directory3 = "/kaggle/input/dsc-dataset/dat/absa/Bing3Domains/asc"
    directory5 = "/kaggle/input/dsc-dataset/dat/absa/Bing5Domains/asc"
    directory9 = "/kaggle/input/dsc-dataset/dat/absa/Bing9Domains/asc"
    directory2 = "/kaggle/input/dsc-dataset/dat/absa/XuSemEval/asc/14/"

    categories3 = ["Computer", "Router", "Speaker"]
    categories5 = ["ApexAD2600Progressive", "CanonG3", "CreativeLabsNomadJukeboxZenXtra40GB", "NikonCoolpix4300", "Nokia6610"]
    categories9 = ["CanonPowerShotSD500", "CanonS100", "DiaperChamp", "HitachiRouter", "LinksysRouter", "MicroMP3", "Nokia6600", "Norton", "ipod"]
    categories2 = ["laptop"]

    directories = [directory3, directory5, directory9, directory2]
    categories = [categories3, categories5, categories9, categories2]

    # Flatten (directory, category) pairs in order so labels run 0..N-1 across
    # all four source collections.
    domain_dirs = [
        os.path.join(base_dir, category)
        for base_dir, group in zip(directories, categories)
        for category in group
    ]
    train_text, train_labels = _load_domain_split(domain_dirs, 'train.json')
    test_text, test_labels = _load_domain_split(domain_dirs, 'test.json')
else:
    # Without this branch an unknown selector would silently leave
    # train_text / test_text undefined (or stale from an earlier run).
    raise ValueError(f"Unknown dataset {df!r}; expected 'dsc', 'news' or 'asc'")

10241
19


In [32]:
class TextDataset(Dataset):
    """Map-style dataset that cleans, tokenizes and pads raw text on access."""

    def __init__(self, texts, labels, max_length=None):
        """
        Args:
            texts (list of str): List of text samples.
            labels (list of int): List of labels corresponding to the text samples.
            max_length (int, optional): Length every sample is padded/truncated
                to. When None, the notebook-level ``seq_len`` is used at
                access time (the previous behaviour).
        """
        self.texts = texts
        self.labels = labels
        self.max_length = max_length
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def preprocess_text(text):
        """Lowercase ``text`` and replace URLs, e-mail addresses and special characters with spaces."""
        # Lowercase the text
        text = text.lower()
        # Remove URLs (the dot after "www" is escaped so that ordinary words
        # starting with "www" are left alone)
        text = re.sub(r'http\S+|www\.\S+', ' ', text)
        # Remove emails
        text = re.sub(r'\S*@\S*\s?', ' ', text)
        # Remove special characters (keeping letters, numbers, and basic punctuation)
        text = re.sub(r'[^a-z0-9,.!? ]', ' ', text)
        return text

    def __getitem__(self, idx):
        """Return ``(input_ids, label)`` for sample ``idx``.

        ``input_ids`` is the tokenizer's ``"input_ids"`` tensor produced with
        ``return_tensors='pt'``, padded/truncated to the configured length.
        """
        text = TextDataset.preprocess_text(self.texts[idx])
        label = self.labels[idx]

        max_length = self.max_length if self.max_length is not None else seq_len
        encoded_text = self.tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=max_length)
        return encoded_text["input_ids"], label

# Number of continual-learning experiences the classes are split into.
nexp = 6 if df == "asc" else 5

# Wrap the raw texts/labels as torch datasets, then as Avalanche datasets.
train_data = TextDataset(train_text, train_labels)
test_data = TextDataset(test_text, test_labels)

avl_train_data = AvalancheDataset(train_data)
avl_test_data = AvalancheDataset(test_data)

# Attach the label lists explicitly as `targets` on the Avalanche wrappers.
avl_train_data.targets = train_labels
avl_test_data.targets = test_labels

# New-classes benchmark without task labels.
# NOTE(review): no seed / class order is passed here, so the assignment of
# classes to experiences may differ between runs; confirm whether that is intended.
benchmark = nc_benchmark(
    train_dataset=avl_train_data,
    test_dataset=avl_test_data,
    n_experiences=nexp,
    task_labels=False,
)

train_stream, test_stream = benchmark.train_stream, benchmark.test_stream

# Peek at the first training experience.
experience = train_stream[0]
t_label, dataset = experience.task_label, experience.dataset

  avl_train_data = AvalancheDataset(train_data)
  avl_test_data = AvalancheDataset(test_data)


## CONFIG

In [21]:
# Model hyper-parameters.
embed_dim = 768
n_heads = 4
n_layers = 3
vocab_size = 30522
attn_drop_rate = 0.2
layer_drop_rate = 0.2

# Training hyper-parameters.
batch_size = 32
num_epochs = 5

# Number of target classes for each supported dataset selector.
_NUM_CLASSES_BY_DATASET = {"asc": 18, "dsc": 10, "news": 20}
if df not in _NUM_CLASSES_BY_DATASET:
    # ValueError is still an Exception, but unlike a bare `raise Exception`
    # it tells the reader what went wrong.
    raise ValueError(
        f"Unknown dataset {df!r}; expected one of {sorted(_NUM_CLASSES_BY_DATASET)}"
    )
num_classes = _NUM_CLASSES_BY_DATASET[df]

model = rememBERT(n_layers, vocab_size, embed_dim, n_heads, num_classes, attn_drop_rate, layer_drop_rate, seq_len)
model = model.to(device)

def calculate_accuracy(outputs, labels):
    """Compute accuracy and macro-F1 for one batch of logits.

    Args:
        outputs (torch.Tensor): Scores of shape (batch, num_classes); the
            prediction is the index of the maximum along dim 1.
        labels (torch.Tensor): Ground-truth class indices of shape (batch,).

    Returns:
        tuple: ``(accuracy, f1macro, predicted, labels)`` where ``accuracy`` is
            the fraction of correct predictions, ``f1macro`` is the
            macro-averaged F1 score, and ``predicted`` / ``labels`` are numpy
            arrays.
    """
    predicted = torch.max(outputs, dim=1).indices
    accuracy = (predicted == labels).sum().item() / labels.size(0)
    predicted_np = predicted.detach().cpu().numpy()
    labels_np = labels.detach().cpu().numpy()
    f1macro = f1_score(labels_np, predicted_np, average='macro')
    return accuracy, f1macro, predicted_np, labels_np



optimizer = AdamW(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()
# Keep a handle on the scheduler: previously the object was created and
# discarded, so nothing could ever call `scheduler.step()`.
# NOTE(review): the scheduler is still not stepped anywhere in this notebook.
# It has to be wired into the training strategy (for example through an
# Avalanche LR-scheduler plugin; TODO confirm) before it affects the
# learning rate.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)

## TRAINING TESTING LOOP

In [21]:
# Loggers that receive the evaluation metrics.
interactive_logger = InteractiveLogger()
loggers = [InteractiveLogger()]
#loggers.append(WandBLogger(project_name="sequential_meta_classifier", run_name="40e_0.0001lr_512dim_4h_3l_0.2adr_0.2ldr_AVL_COS"))

# Metrics: accuracy (per epoch / experience / stream), a stream-level
# confusion matrix, and forgetting (per experience / stream).
eval_plugin = EvaluationPlugin(
    accuracy_metrics(minibatch=False, epoch=True, experience=True, stream=True),
    confusion_matrix_metrics(
        num_classes=benchmark.n_classes, save_image=False, stream=True
    ),
    forgetting_metrics(experience=True, stream=True),
    loggers=loggers,
)

# Continual-learning plugins. Only `replay` is passed to the strategy below;
# `ewc` is created but not used.
replay = ReplayPlugin(mem_size=1000)
ewc = EWCPlugin(ewc_lambda=1)

cl_strategy = SupervisedTemplate(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_mb_size=batch_size,
    train_epochs=num_epochs,
    eval_mb_size=batch_size,
    evaluator=eval_plugin,
    device=device,
    plugins=[replay],
)



# strategy = EWC(
#     model=model,
#     optimizer=optimizer,
#     criterion=criterion,
#     train_mb_size=batch_size,
#     train_epochs=num_epochs,
#     eval_mb_size=batch_size,
#     device=device,
#     evaluator=eval_plugin,
#     ewc_lambda=0.4,
# )

NameError: name 'model' is not defined

In [None]:
# Continual-learning loop: train on the experiences of the training stream one
# after another, in stream order.
for train_task in train_stream:
    cl_strategy.train(train_task)
    # After each experience, evaluate on the whole test stream.
    cl_strategy.eval(test_stream)