# Package Preparation

In [None]:
%pip install transformers==4.28.0
%pip install datasets evaluate
%pip install -U accelerate
%pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m98.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m117.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transfor

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import shutil
from timeit import default_timer
import random
from transformers import BertModel, BertTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
import tensorflow_hub as hub
import tensorflow as tf

# Drive Mount

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper module).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Change to your specific path.
# %cd "drive/Shareddrives/CS 685 Final Project"
%cd "/content/drive/MyDrive/Workspace/tagrec"

/content/drive/MyDrive/Workspace/tagrec


# Helper Functions

In [None]:
def recall_at_k(topk_indices, test_indices, verbose=False):  # topk_indices shape: m x k, test_indices: list len m.
    """Count the instances whose true label appears among their top-k predictions.

    Parameters
    ----------
    topk_indices:
        indexable of length m; entry i holds the k predicted label ids of
        instance i (k is implicitly the 2nd dimension)
    test_indices:
        indexable of length m holding the true label id of each instance
    verbose: bool
        when True, print the hit count and the number of instances

    Returns
    -------
    (int, float)
        the number of hits, and the hits divided by the number of instances
        (0 and 0.0 when there are no instances)
    """
    num_instances = len(test_indices)
    correctly_pred = sum(
        1 for i in range(num_instances) if test_indices[i] in topk_indices[i]
    )
    if verbose:
        print('correctly pred: ', correctly_pred, ', total # of data instances: ', num_instances)
    if num_instances == 0:
        # Nothing to evaluate: report zero recall instead of dividing by zero.
        return 0, 0.0
    return correctly_pred, correctly_pred / num_instances

In [None]:
def precision_at_k(topk_indices, test_indices, verbose=False):  # topk_indices shape: m x k, test_indices: list len m.
    """Fraction of all emitted predictions (m instances times k each) that are hits.

    Parameters
    ----------
    topk_indices:
        array exposing `.shape`; k is read from its 2nd dimension
    test_indices:
        indexable of length m holding the true label id of each instance
    verbose: bool
        when True, print the hit count and the total number of predictions

    Returns
    -------
    float
        hits divided by (number of instances * k)
    """
    # The hit count is the same one recall_at_k computes; only the denominator differs.
    hits = recall_at_k(topk_indices, test_indices)[0]
    k = topk_indices.shape[1]
    total_predictions = len(test_indices) * k
    if verbose:
        print('correctly pred:', hits, ', total # of preds: ', total_predictions)
    return hits / total_predictions

In [None]:
def get_cleaned_taxonomy(taxonomy, delimiter):
    """Lower-case every taxonomy string and replace its delimiters with single spaces.

    Parameters
    ----------
    taxonomy:
        iterable of label strings
    delimiter: str
        separator between the parts of a label

    Returns
    -------
    list of str
        one cleaned string per input label, in the input order
    """
    return [' '.join(entry.lower().split(delimiter)) for entry in taxonomy]

# Random Setup

In [None]:
# Single seed shared by every random number generator used in this notebook.
random_seed = 48

# Seed NumPy, PyTorch (CPU and CUDA generators) and Python's `random` module.
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
random.seed(random_seed)

# Model Definition

## Original TagRec Model

In [None]:
class TagRecModel:
    def __init__(self, train_set_path=None, val_set_path=None, test_set_path=None, test_data_path=None, label_col_name=None):
        """
        Parameters
        ----------
        train_set_path, val_set_path, test_set_path:
            optional paths handed to torch.load; a split whose path is None is left as None
            (presumably files written by tokenize_and_encode_mpnet - confirm)
        test_data_path:
            optional path of a CSV file containing the test labels
        label_col_name:
            name of the label column inside that CSV file
        """
        # Load each split only when a path was supplied.
        self.train_set = None if train_set_path is None else torch.load(train_set_path)
        self.val_set = None if val_set_path is None else torch.load(val_set_path)
        self.test_set = None if test_set_path is None else torch.load(test_set_path)

        # Label encoder mapping labels to integers between 0 and n_classes-1.
        self.LE = LabelEncoder()
        # All unique test labels; stays None unless both the CSV path and the column are given.
        self.test_unique_labels = None
        labels_available = test_data_path is not None and label_col_name is not None
        if labels_available:
            self.test_unique_labels = pd.read_csv(test_data_path)[label_col_name].unique()
            self.LE.fit_transform(self.test_unique_labels)

        self.training_stats = []

        # Use the GPU if possible.
        gpu_present = torch.cuda.is_available()
        self.device = torch.device("cuda" if gpu_present else "cpu")
        if gpu_present:
            print('There are %d GPU(s) available.' % torch.cuda.device_count())
            print('Using GPU:', torch.cuda.get_device_name(0))
        else:
            print('No GPU available, using the CPU instead.')
    
    # ------------------------------- Data Preprocessing -------------------------------
    
    def get_cleaned_taxonomy(self, taxonomy, delimiter):
        cleaned_taxonomy = []
        for value in taxonomy:
            value = ' '.join(value.lower().split(delimiter))
            cleaned_taxonomy.append( value )
        return cleaned_taxonomy
    
    def tokenize_and_format(self, input_data, tokenizer):
        input_ids = []
        attention_masks = []

        for text in input_data:
            encoded_dict = tokenizer.encode_plus(
                text,  # Sentence to encode.
                add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
                max_length=128,  # Pad & truncate all sentences.
                padding='max_length',
                truncation=True,
                return_attention_mask=True,  # Construct attn. masks.
                return_tensors='pt',  # Return pytorch tensors.
            )

            # Add the encoded sentence to the list.    
            input_ids.append(encoded_dict['input_ids'])

            # And its attention mask (simply differentiates padding from non-padding).
            attention_masks.append(encoded_dict['attention_mask'])

        # Convert the lists into tensors.
        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)

        return input_ids, attention_masks
    
    def tokenize_and_encode_mpnet(self, input_filename, feature_col_name, label_col_name,
                                  save_filename, train_val_test, tokenizer, label_encoder):
        # input_ids, attns_masks
        data = pd.read_csv(input_filename)
        features = data[feature_col_name]
        input_ids, attention_masks = self.tokenize_and_format(features, tokenizer)

        # encode labels
        labels = data[label_col_name]
        clean_labels = self.get_cleaned_taxonomy(labels, delimiter="_")
        taxonomy_vectors = label_encoder.encode(clean_labels)
        taxonomy_tensors = torch.Tensor(taxonomy_vectors)

        # encode unique labels
        unique_labels = labels.unique()
        clean_unique_labels = self.get_cleaned_taxonomy(unique_labels, delimiter="_")
        unique_taxonomy_vectors = label_encoder.encode(clean_unique_labels)
        unique_taxonomy_tensors = torch.Tensor(unique_taxonomy_vectors)

        # dataset is a tuple:
        # dataset[0] is list of input_ids, attention_masks, true label encoding
        # dataset[1] is all label tensors encoding
        if train_val_test == "train":
            dataset = [(input_ids[i], attention_masks[i], taxonomy_tensors[i]) for i in range(len(input_ids))], unique_taxonomy_tensors
            self.train_set = dataset
        elif train_val_test == "val":
            dataset = [(input_ids[i], attention_masks[i], taxonomy_tensors[i]) for i in range(len(input_ids))], unique_taxonomy_tensors
            self.val_set = dataset
        else:
            self.test_unique_labels = unique_labels
            self.LE.fit_transform(unique_labels)  # tell LabelEncoder this is the labels we want to enocde
            test_labels = self.LE.transform(labels)  # encode all labels
            test_labels = torch.Tensor(test_labels)
            dataset = [(input_ids[i], attention_masks[i], test_labels[i]) for i in range(len(input_ids))], unique_taxonomy_tensors
            self.test_set = dataset
        
        torch.save(dataset, save_filename)
        return dataset
    
    def process_data(self, train_filename, val_filename, test_filename,
                     feature_col_name, label_col_name,
                     train_save_filename, val_save_filename, test_save_filename,
                     tokenizer, label_encoder):
        self.train_set = self.tokenize_and_encode_mpnet(input_filename=train_filename,
                                                        feature_col_name=feature_col_name,
                                                        label_col_name=label_col_name,
                                                        save_filename=train_save_filename,
                                                        train_val_test="train",
                                                        tokenizer=tokenizer,
                                                        label_encoder=label_encoder)
        
        self.val_set = self.tokenize_and_encode_mpnet(input_filename=val_filename,
                                                      feature_col_name=feature_col_name,
                                                      label_col_name=label_col_name,
                                                      save_filename=val_save_filename,
                                                      train_val_test="val",
                                                      tokenizer=tokenizer,
                                                      label_encoder=label_encoder)

        self.test_set = self.tokenize_and_encode_mpnet(input_filename=test_filename,
                                                       feature_col_name=feature_col_name,
                                                       label_col_name=label_col_name,
                                                       save_filename=test_save_filename,
                                                       train_val_test="test",
                                                       tokenizer=tokenizer,
                                                       label_encoder=label_encoder)
    
    # ------------------------------- Training -------------------------------
    
    def train(self, model, criterion, optimizer, early_stopper,
              epochs, batch_size, random_seed, output_dir, model_name):
        """
        Parameters
        ----------
        model: PyTorch Model
            Input text encoder
        criterion: 
            loss function
        optimizer: 
            optimizer used to train the model
        early_stopper:
            early stopper, required to have a 'early_stop' method to check whether needs to early stops the training
        epochs: int
            number of training epochs
        batch_size: int
            batch size
        random_seed: int
            random seed
        output_dir: string
            output directory
        model_name: string
            name of the model
        
        Returns
        -------
        """
        # Create the output path directory.
        os.makedirs(output_dir, exist_ok=True)

        # Set random seeds.
        np.random.seed(random_seed)
        torch.manual_seed(random_seed)
        torch.cuda.manual_seed(random_seed)
        random.seed(random_seed)
        
        # Tell pytorch to run this model on the GPU.
        model.cuda()
        
        summary_stats = []

        all_labels = self.train_set[1]
        all_labels = all_labels.to(self.device)

        train_set = self.train_set[0]

        for epoch_i in range(0, epochs):
            print("")
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
            start_time = default_timer()

            # Reset the total loss for this epoch.
            total_train_loss = 0

            # Put the model into training mode.
            model.train()

            # For each batch of training data...
            num_batches = int(len(train_set)/batch_size) + 1

            for i in range(num_batches):
                if i % 40 == 0 and i > 0:
                    print('Batch {} of {}.'.format(i, num_batches))
                end_index = min(batch_size * (i+1), len(train_set))

                batch = train_set[i*batch_size:end_index]

                if len(batch) == 0: continue

                input_id_tensors = torch.stack([data[0] for data in batch])
                input_mask_tensors = torch.stack([data[1] for data in batch])
                label_tensors = torch.stack([data[2] for data in batch])

                # Move tensors to the GPU
                b_input_ids = input_id_tensors.to(self.device)
                b_input_mask = input_mask_tensors.to(self.device)
                b_labels = label_tensors.to(self.device)

                # Clear the previously calculated gradient
                model.zero_grad()

                # Forward pass, calculate logit predictions.
                b_outputs = model(b_input_ids, b_input_mask)
                loss = criterion.forward(b_outputs, b_labels, all_labels)
                total_train_loss += loss.item()

                # Perform a backward pass to calculate the gradients.
                loss.backward()

                # Clip the norm of the gradients to 1.0.
                # This is to help prevent the "exploding gradients" problem.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                # Update parameters and take a step using the computed gradient.
                optimizer.step()

            avg_train_loss = total_train_loss / len(train_set)
            end_time = default_timer()
            elapsed_time = end_time - start_time
            print('Average train loss: {}'.format(avg_train_loss))

            _, avg_val_loss = self.test(model, criterion, self.val_set, batch_size)
            print('Average validation loss: {}'.format(avg_val_loss))

            summary_stats.append({
                'Epoch': epoch_i + 1,
                'Average train loss': avg_train_loss,
                'Validation loss': avg_val_loss,
                'Training time': elapsed_time
            })

            if early_stopper.early_stop(model, avg_val_loss, epoch_i, output_dir, model_name):
                print("Early stopping at epoch {}".format(epoch_i + 1))
                break

        self.training_stats = summary_stats
    
    # Run the model over a dataset and return its outputs together with the averaged loss.
    def test(self, model, criterion, test_set, batch_size):
        
        # Put the model in evaluation mode - no drop out during eval
        model.eval()

        all_labels = test_set[1]
        all_labels = all_labels.to(self.device)

        test_set = test_set[0]

        # Tracking variables 
        total_test_loss = 0
        all_outputs = torch.Tensor().to(self.device)

        num_batches = int(len(test_set)/batch_size) + 1


        for i in range(num_batches):

          end_index = min(batch_size * (i+1), len(test_set))

          batch = test_set[i*batch_size:end_index]

          if len(batch) == 0: continue

          input_id_tensors = torch.stack([data[0] for data in batch])
          input_mask_tensors = torch.stack([data[1] for data in batch])
          label_tensors = torch.stack([data[2] for data in batch])

          # Move tensors to the GPU
          b_input_ids = input_id_tensors.to(self.device)
          b_input_mask = input_mask_tensors.to(self.device)
          b_labels = label_tensors.to(self.device)

          # Tell pytorch not to bother with constructing the compute graph during
          # the forward pass, since this is only needed for backprop (training).
          with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            b_outputs = model(b_input_ids, b_input_mask)
            loss = criterion.forward(b_outputs, b_labels, all_labels)

            # Append current outputs to all outputs
            all_outputs = torch.cat((all_outputs, b_outputs), dim=0)

            # Accumulate the validation loss.
            total_test_loss += loss.item()

        # Calculate the average loss over all of the batches.
        avg_test_loss = total_test_loss / len(test_set)

        return all_outputs, avg_test_loss
    
    def plot_train_val_loss(self, figure_path):
        """Plot the training and validation loss of every epoch, save and show the figure.

        Parameters
        ----------
        figure_path:
            path the figure is saved to

        Raises
        ------
        ValueError
            if self.training_stats is empty (train() has not produced any statistics)
        """
        if not self.training_stats:
            raise ValueError("No training statistics to plot; run train() first.")

        # transform to dataframe
        data = pd.DataFrame(data=self.training_stats)

        # Size the figure explicitly instead of changing the global rcParams, and plot
        # against the recorded epoch numbers so the x axis matches its "Epoch" label.
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(data['Epoch'], data['Average train loss'], '-o', label="Training Loss")
        ax.plot(data['Epoch'], data['Validation loss'], '-o', label="Validation Loss")
        ax.set_xticks(data['Epoch'])

        ax.set_title("Training & Validation Loss")
        ax.set_xlabel("Epoch")
        ax.set_ylabel("Loss")
        ax.legend()
        fig.savefig(figure_path)
        plt.show()
        
    # ------------------------------- Evaluation -------------------------------

    def eval(self, model, criterion, k, random_seed):
        """
        Parameters
        ----------
        model:
            Input text encoder
        criterion:
            loss function
        k: 
            positive integer that will be used to calculate the precision/recall
        random_seed:
            random seed

        Returns
        -------
        """
        # set random seed
        np.random.seed(random_seed)
        torch.manual_seed(random_seed)
        torch.cuda.manual_seed(random_seed)
        random.seed(random_seed)

        # Tell pytorch to run this model on the GPU.
        model.cuda()
        
        # Put the model in evaluation mode - no drop out during eval
        model.eval()
        
        all_labels = self.test_set[1]  # all encoded unique labels 
        all_labels = all_labels.to(self.device)
        test_set = self.test_set[0]  # (input_id, attention_mask, true_label) tuple

        cos_sim = nn.CosineSimilarity(dim=2, eps=1e-6)
        
        y_true = []
        y_pred = []  # record predictions of the model
        
        for input_id, attention_mask, true_label in test_set:
            input_id = input_id.to(self.device)
            attention_mask = attention_mask.to(self.device)
            
            with torch.no_grad():
                outputs = model(input_id.reshape(1,-1), attention_mask.reshape(1,-1))
            
            # Find k most similar labels
            distances = cos_sim(outputs[:,None], all_labels)
            distances, indices = torch.topk(distances, k, largest=True)
            # use indices to find correspodning labels, and encode them to value between 0 and n_classes-1
            y_pred.append(self.LE.transform(self.test_unique_labels[indices.cpu().numpy().flatten()]))
            print('true_label', true_label)
            if torch.numel(true_label) > 1:
                y_true.append
            y_true.append(int(true_label))

        y_true = np.array(y_true)
        y_pred = np.array(y_pred)

        return y_pred, y_true

In [None]:
# Model for creating input embedding
class InputEncoder(nn.Module):
    """Pre-trained BERT followed by a two-layer projection head with dropout."""

    def __init__(self, pretrained_model="bert-base-uncased", output_dim=1024):
        """
        Parameters
        ----------
        pretrained_model: String
            the model id of a pretrained model hosted inside a model repo on huggingface
            the list of available pretrained models can be seen here:
            https://huggingface.co/transformers/v3.3.1/pretrained_models.html
            The width of the first linear layer follows the model's hidden size
            (768 for the 'base' models used in the paper).
        output_dim: int
            dimension of the produced embedding (default 1024)
        """
        super(InputEncoder, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)  # load pretrained bert
        self.dropout = nn.Dropout(0.1)  # use p=0.1 to align with the dropout rate in pretrained bert
        # Read the encoder width from its config instead of hard-coding 768, so
        # non-'base' checkpoints work too.
        hidden_size = self.bert.config.hidden_size
        self.linear1 = nn.Linear(hidden_size, 384)
        self.linear2 = nn.Linear(384, output_dim)

    def forward(self, input_ids, attention_mask):
        """Return the projected embedding of every sequence in the batch.

        Parameters
        ----------
        input_ids, attention_mask:
            tensors forwarded to BERT as `input_ids` and `attention_mask`

        Returns
        -------
        tensor whose last dimension is `output_dim`
        """
        x = self.bert(input_ids=input_ids, attention_mask=attention_mask)[1]  # we need pooler_output instead of last_hidden_state
        x = self.dropout(x)
        x = self.dropout(self.linear1(x))
        x = self.linear2(x)
        return x

In [None]:
class TagRecWDCModel:
    """Evaluation wrapper for test sets that store the true labels as a third element."""

    def __init__(self, train_set_path=None, val_set_path=None, test_set_path=None, test_data_path=None, label_col_name=None):
        """
        Parameters
        ----------
        train_set_path, val_set_path, test_set_path:
            optional paths handed to torch.load; a split whose path is None is left as None
        test_data_path:
            optional path of a CSV file containing the test labels
        label_col_name:
            name of the label column inside that CSV file
        """
        # training set
        if train_set_path is not None:
            self.train_set = torch.load(train_set_path)
        else:
            self.train_set = None

        # validation set
        if val_set_path is not None:
            self.val_set = torch.load(val_set_path)
        else:
            self.val_set = None

        # testing set
        if test_set_path is not None:
            self.test_set = torch.load(test_set_path)
        else:
            self.test_set = None

        self.LE = LabelEncoder()  # label encoder that encodes labels to value between 0 and n_classes-1
        self.test_unique_labels = None  # all unique labels
        if test_data_path is not None and label_col_name is not None:
            test_data = pd.read_csv(test_data_path)
            test_labels = test_data[label_col_name]
            self.test_unique_labels = test_labels.unique()
            self.LE.fit_transform(self.test_unique_labels)

        self.training_stats = []

        # Use GPU if possible
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
            print('There are %d GPU(s) available.' % torch.cuda.device_count())
            print('Using GPU:', torch.cuda.get_device_name(0))
        else:
            self.device = torch.device("cpu")
            print('No GPU available, using the CPU instead.')

    # ------------------------------- Evaluation -------------------------------

    def eval(self, model, criterion, k, random_seed):
        """
        Predict the k most similar labels for every sample of self.test_set.

        Parameters
        ----------
        model:
            Input text encoder
        criterion:
            loss function (accepted for interface compatibility; not used here)
        k:
            positive integer that will be used to calculate the precision/recall
        random_seed:
            random seed

        Returns
        -------
        (y_pred, y_true)
            y_pred: array with one row per sample holding the indices (rows of
            self.test_set[1]) of the k label vectors with the highest cosine similarity;
            y_true: array holding int(true_label) for every entry of self.test_set[2]
        """
        # set random seed
        np.random.seed(random_seed)
        torch.manual_seed(random_seed)
        torch.cuda.manual_seed(random_seed)
        random.seed(random_seed)

        # Run the model on the device chosen in __init__ (GPU when available, else CPU).
        model.to(self.device)

        # Put the model in evaluation mode - no drop out during eval
        model.eval()

        all_labels = self.test_set[1].to(self.device)  # all encoded unique labels
        test_set = self.test_set[0]  # (input_id, attention_mask, _) tuples
        true_labels = self.test_set[2]

        cos_sim = nn.CosineSimilarity(dim=2, eps=1e-6)

        y_true = []
        y_pred = []  # record predictions of the model

        for test_set_data, true_label in zip(test_set, true_labels):
            input_id, attention_mask, _ = test_set_data
            input_id = input_id.to(self.device)
            attention_mask = attention_mask.to(self.device)

            with torch.no_grad():
                outputs = model(input_id.reshape(1, -1), attention_mask.reshape(1, -1))

            # Find the k most similar labels; their indices are used directly as predictions.
            similarities = cos_sim(outputs[:, None], all_labels)
            _, indices = torch.topk(similarities, k, largest=True)
            y_pred.append(indices.cpu().numpy().flatten())
            y_true.append(int(true_label))

        y_true = np.array(y_true)
        y_pred = np.array(y_pred)

        return y_pred, y_true

In [None]:
class WDCInputEncoder(nn.Module):
    """Pre-trained BERT followed by two stacked linear layers (no dropout in the head)."""

    def __init__(self, bert_model_name, output_dim=1024):
        """
        Parameters
        ----------
        bert_model_name: String
            model id or path passed to BertModel.from_pretrained
        output_dim: int
            dimension of the produced embedding (default 1024)
        """
        super(WDCInputEncoder, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name, output_hidden_states=False, output_attentions=False)
        # Read the encoder width from its config instead of hard-coding 768, so
        # non-'base' checkpoints work too.
        hidden_size = self.bert.config.hidden_size
        self.fc1 = nn.Linear(hidden_size, 384)
        self.fc2 = nn.Linear(384, output_dim)

    def forward(self, tokens, attn_masks):
        """Return the projected embedding of every sequence in the batch.

        Parameters
        ----------
        tokens, attn_masks:
            tensors forwarded to BERT as the input ids and the attention mask

        Returns
        -------
        tensor whose last dimension is `output_dim`
        """
        out0 = self.bert(tokens, attention_mask=attn_masks)
        out1 = out0[1]  # pooler_output
        out2 = self.fc1(out1)
        out3 = self.fc2(out2)
        return out3

In [None]:
class USEEmbedding():
    """Thin wrapper around a TensorFlow Hub module that returns PyTorch tensors."""

    def __init__(self, module_url="https://tfhub.dev/google/universal-sentence-encoder-large/5"):
        """Load the TensorFlow Hub module found at `module_url`."""
        self.model = hub.load(module_url)

    def encode(self, input):
        """Run the loaded module on `input` and return its output as a torch tensor."""
        embeddings = self.model(input).numpy()
        return torch.tensor(embeddings)  # convert to torch tensor

In [None]:
# Our Loss Function
class HingeRankLoss(nn.Module):
    """Hinge rank loss built on cosine similarity between outputs and label vectors."""

    def __init__(self, margin):
        """Store the hinge `margin` and create the cosine-similarity module."""
        super().__init__()
        self.margin = margin
        self.cos_sim = nn.CosineSimilarity(dim=2, eps=1e-6)

    def forward(self, output, corr_labels, all_labels):
        """Return the hinge rank loss of a batch.

        Parameters
        ----------
        output:
            model outputs of the batch
        corr_labels:
            vector of the correct label of every output
        all_labels:
            vectors of all candidate labels

        Returns
        -------
        scalar tensor
            (sum of relu(margin - sim(output, correct) + sim(output, candidate)) over all
            output/candidate pairs, minus max(margin, 0) per output) divided by
            len(output) * (len(all_labels) - 1)
        """
        anchors = output[:, None]
        sim_to_correct = self.cos_sim(anchors, corr_labels[:, None])
        sim_to_all = self.cos_sim(anchors, all_labels)
        hinge = torch.relu(self.margin - sim_to_correct + sim_to_all)  # one row per output, one column per candidate

        batch_len = len(output)
        # One margin per output is removed again - presumably the term contributed by the
        # correct label when it is compared with itself (assumes it is part of all_labels).
        self_term = torch.tensor(max(self.margin, 0.) * batch_len, dtype=torch.float32)
        pair_count = batch_len * (len(all_labels) - 1)
        return (torch.sum(hinge) - self_term) / pair_count

In [None]:
def save_checkpoint(state_dict, is_best, epoch, output_dir, model_name, checkpoints_dir='./checkpoints'):
    """Save a checkpoint for `epoch` and optionally copy it as the best model.

    Parameters
    ----------
    state_dict:
        object written with torch.save
    is_best: bool
        when True the checkpoint is also copied to '<output_dir>/<model_name>_best.pth.tar'
    epoch:
        value embedded in the checkpoint file name
    output_dir: string
        directory receiving the best model
    model_name: string
        prefix of the written files
    checkpoints_dir: string
        directory receiving the per-epoch checkpoints (default './checkpoints')
    """
    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(checkpoints_dir, exist_ok=True)
    checkpoint_file = os.path.join(checkpoints_dir, f'{model_name}_checkpoint_{epoch}.pth.tar')
    torch.save(state_dict, checkpoint_file)
    if is_best:
        shutil.copyfile(checkpoint_file, os.path.join(output_dir, f'{model_name}_best.pth.tar'))


class EarlyStopper:
    """Stops training once the validation loss has not improved for `patience` epochs."""

    def __init__(self, patience=5, min_delta=0):
        """
        Parameters
        ----------
        patience: int
            number of counted (non-improving) epochs after which training stops
        min_delta: float
            tolerance above the best loss inside which an epoch is neither an
            improvement nor counted against the patience
        """
        self.patience = patience
        self.counter = 0
        self.min_delta = min_delta
        self.min_val_loss = np.inf

    def early_stop(self, model, val_loss, epoch, output_dir, model_name):
        """Save a checkpoint for this epoch and tell whether training should stop.

        Returns
        -------
        bool
            True once `patience` epochs have been counted since the last improvement
        """
        if val_loss < self.min_val_loss:
            # Only a loss strictly below the best one may replace the best model;
            # a loss that is merely within min_delta of it must not overwrite it.
            print(f'=> Model at epoch {epoch + 1} is the best according to validation loss')
            save_checkpoint(state_dict=model.state_dict(), is_best=True, epoch=epoch, output_dir=output_dir, model_name=model_name)
            self.min_val_loss = val_loss
            self.counter = 0
            return False

        save_checkpoint(state_dict=model.state_dict(), is_best=False, epoch=epoch, output_dir=output_dir, model_name=model_name)
        if val_loss < self.min_val_loss + self.min_delta:
            # Within the tolerance: no improvement, but not counted either.
            return False

        # Clearly worse than the best loss (or NaN): count it against the patience.
        self.counter += 1
        return self.counter >= self.patience

## Get TagRec Predictions

In [None]:
def get_tagrec_predictions(test_set_path, test_data_path, label_col_name, model_checkpoint, use_wdc=False, k=10):
    """Load a trained encoder and return its top-k label predictions on a test set.

    Parameters
    ----------
    test_set_path:
        path of the saved test set, loaded by the model wrapper
    test_data_path:
        path of the CSV file containing the test labels
    label_col_name:
        name of the label column inside that CSV file
    model_checkpoint:
        path of the saved state dict of the encoder
    use_wdc: bool
        when True use TagRecWDCModel / WDCInputEncoder, otherwise TagRecModel / InputEncoder
    k: int
        number of labels predicted per sample (default 10)

    Returns
    -------
    (y_pred, y_true)
        as returned by the wrapper's `eval` method
    """
    # Pick the wrapper and the encoder together so they can never get out of sync.
    if use_wdc:
        wrapper_cls, encoder_cls = TagRecWDCModel, WDCInputEncoder
    else:
        wrapper_cls, encoder_cls = TagRecModel, InputEncoder

    tagrec_model = wrapper_cls(
        test_set_path=test_set_path,
        test_data_path=test_data_path,
        label_col_name=label_col_name,
    )

    eval_model = encoder_cls("bert-base-uncased")
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine;
    # the wrapper's `eval` moves the model to its device afterwards.
    eval_model.load_state_dict(torch.load(model_checkpoint, map_location="cpu"))

    criterion = HingeRankLoss(0.1)

    y_pred, y_true = tagrec_model.eval(
        model=eval_model,
        criterion=criterion,
        k=k,
        random_seed=random_seed,
    )

    return y_pred, y_true

# Baseline Definition

## Baseline: Pre-trained SentBERT

In [None]:
# Main function used to get baseline predictions.
def get_pretrained_sent_bert_baseline(test_data_path, feature_col_name, label_col_name, delimiter='_'):
    """Prepare the pre-trained SentBERT baseline and return its prediction function.

    Parameters
    ----------
    test_data_path:
        path of the CSV file holding the test data
    feature_col_name, label_col_name:
        names of the text column and of the label column
    delimiter: str
        separator between the parts of a label; used for the true labels as well
        as for the unique labels

    Returns
    -------
    callable
        get_predictions_at_k(k) -> (y_pred, y_true), where y_pred holds for every test
        row the encoded ids of the k labels most similar to its text and y_true holds
        the encoded id of its true label
    """
    pretrained_sent_bert_encoder = SentenceTransformer('bert-large-nli-stsb-mean-tokens')

    # The file is read once; features and labels both come from the same frame.
    data = pd.read_csv(test_data_path)

    # The same delimiter must be used for the true labels and for the unique labels,
    # otherwise the label encoder meets true labels it has never seen.
    test_true_labels = get_cleaned_taxonomy(taxonomy=data[label_col_name], delimiter=delimiter)

    # Embeddings taken directly from the pre-trained SentBERT model.
    test_features_embs = torch.tensor(pretrained_sent_bert_encoder.encode(data[feature_col_name]))

    test_unique_labels = pd.Series(test_true_labels).unique()
    label_encoder = LabelEncoder()
    label_encoder.fit(test_unique_labels)
    test_unique_labels_tensor = torch.tensor(pretrained_sent_bert_encoder.encode(test_unique_labels))

    def get_predictions_at_k(k):
        """Return (y_pred, y_true) using the k most similar labels of every test row."""
        cos_sim = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
        predictions = []
        for test_feature_emb in test_features_embs:
            similarities = cos_sim(test_feature_emb[None, :], test_unique_labels_tensor)
            _, indices = torch.topk(input=similarities, k=k, largest=True)
            predictions.append(test_unique_labels[indices.cpu().numpy().flatten()])

        y_pred = np.array([label_encoder.transform(prediction) for prediction in predictions])
        y_true = np.array(label_encoder.transform(test_true_labels))

        return y_pred, y_true

    return get_predictions_at_k

## Baseline: Fine-tuned BERT

In [None]:
# Hyperparameters defined for the fine-tuned BERT baseline section
# (presumably read by its training loop - not visible here; confirm).
num_epochs = 5
batch_size = 32

In [None]:
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_scheduler
from torch.optim import AdamW
from tqdm.auto import tqdm

def get_bert_classification_baseline(train_data_path,
                                     val_data_path,
                                     test_data_path,
                                     feature_col_name,
                                     label_col_name,
                                     delimiter='_'):
    """Fine-tune `bert-base-cased` as a flat label classifier and return a top-k evaluator.

    The three CSV splits are read, their label column is cleaned with
    `get_cleaned_taxonomy`, and a single `LabelEncoder` is fitted on the union of
    the labels of all three splits. The model is then fine-tuned on the training
    split; the validation loss is printed after every epoch and drives early
    stopping.

    Training length and batch size are taken from the notebook-level `num_epochs`
    and `batch_size` variables, which must exist before this function is called.

    Args:
        train_data_path: CSV file with the training split.
        val_data_path: CSV file with the validation split.
        test_data_path: CSV file with the test split.
        feature_col_name: Name of the CSV column holding the input text.
        label_col_name: Name of the CSV column holding the label.
        delimiter: Forwarded to `get_cleaned_taxonomy` when cleaning labels.

    Returns:
        A function `get_eval(k)` that runs the fine-tuned model over the test
        split and returns `(y_pred, y_true)`. `y_pred` is a list holding, per
        test example, a tensor with the ids of the `k` highest-scoring classes;
        `y_true` is a list with the matching encoded gold labels. The tensors
        stay on the device the model was trained on.
    """
    def get_cleaned_data(csv_filepath):
        """Read one split and return (feature column, cleaned labels)."""
        data = pd.read_csv(csv_filepath)
        features = data[feature_col_name]
        labels = data[label_col_name]
        cleaned_labels = get_cleaned_taxonomy(taxonomy=labels, delimiter=delimiter)
        return features, cleaned_labels

    train_features, train_labels = get_cleaned_data(csv_filepath=train_data_path)
    val_features, val_labels = get_cleaned_data(csv_filepath=val_data_path)
    test_features, test_labels = get_cleaned_data(csv_filepath=test_data_path)

    # The encoder sees every split so that validation/test labels absent from the
    # training split can still be transformed.
    # NOTE(review): `+` must concatenate here, i.e. get_cleaned_taxonomy
    # presumably returns plain lists — confirm.
    all_unique_labels = pd.Series(train_labels + val_labels + test_labels).unique()
    label_encoder = LabelEncoder()
    label_encoder.fit(all_unique_labels)

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

    def get_transformers_dataset(features, labels, split):
        """Build a tokenized, torch-formatted `datasets.Dataset` for one split."""
        converted_dataset = pd.DataFrame()
        converted_dataset['text'] = features
        converted_dataset['label'] = label_encoder.transform(labels)
        converted_dataset = Dataset.from_pandas(converted_dataset, split=split)

        def tokenize_function(data_entry):
            return tokenizer(data_entry["text"], padding="max_length", truncation=True)

        tokenized_datasets = converted_dataset.map(tokenize_function, batched=True)
        tokenized_datasets = tokenized_datasets.remove_columns(["text"])
        # The model's forward() expects the target column to be called "labels".
        tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
        tokenized_datasets.set_format("torch")
        return tokenized_datasets

    train_dataset = get_transformers_dataset(train_features, train_labels, split='train')
    val_dataset = get_transformers_dataset(val_features, val_labels, split='val')
    test_dataset = get_transformers_dataset(test_features, test_labels, split='test')

    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-cased",
        num_labels=len(all_unique_labels),
        output_hidden_states=True,
    )

    optimizer = AdamW(model.parameters(), lr=5e-5)

    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )

    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('Using GPU:', torch.cuda.get_device_name(0))
    else:
        device = torch.device('cpu')
        print('No GPU available, using the CPU instead.')

    model.to(device)

    def get_val_loss(model):
        """Return the mean per-example loss over the validation split as a float."""
        model.eval()  # Put model in evaluation mode.
        total_val_loss = 0.0
        num_val_examples = 0
        with torch.no_grad():
            for batch in val_dataloader:
                batch = {name: tensor.to(device) for name, tensor in batch.items()}
                outputs = model(**batch)
                # outputs.loss is already averaged over the batch, so weight it by
                # the batch size before summing; dividing the plain sum of batch
                # means by the number of examples (as was done before) shrinks the
                # result by roughly a factor of batch_size.
                current_batch_size = batch['labels'].size(0)
                total_val_loss += outputs.loss.item() * current_batch_size
                num_val_examples += current_batch_size
        averaged_val_loss = total_val_loss / num_val_examples
        print('val_loss:', averaged_val_loss)
        return averaged_val_loss

    progress_bar = tqdm(range(num_training_steps))

    # Early stopping: stop once the validation loss has failed to improve by more
    # than 0.001 for `patience` consecutive epochs. With patience >= num_epochs
    # the stop can never trigger and all epochs are run.
    best_val_loss = None
    best_val_loss_count = 0
    patience = 5

    for epoch in range(num_epochs):
        model.train()  # get_val_loss() leaves the model in eval mode.

        for batch in train_dataloader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        val_loss = get_val_loss(model)
        if best_val_loss is None or val_loss < (best_val_loss - 0.001):
            best_val_loss = val_loss
            best_val_loss_count = 0
        else:
            best_val_loss_count += 1

        if best_val_loss_count >= patience:
            print('Early stopped')
            break

    def get_eval(k):
        """Return (top-k predicted class ids, gold class ids) for the test split."""
        model.eval()

        y_pred = []
        y_true = []
        for batch in test_dataloader:
            # Distinct loop names keep the comprehension from visually shadowing `k`.
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)

            cls_logits = outputs.logits
            values, indices = torch.topk(input=cls_logits, k=k, largest=True)
            # extend() iterates the first dimension: one entry per test example.
            y_pred.extend(indices)
            y_true.extend(batch['labels'])

        return y_pred, y_true

    return get_eval

## Model Cleanup

In [None]:
# Ask PyTorch to release the GPU memory it is caching but no longer using.
torch.cuda.empty_cache()

# Export Definition

In [None]:
def get_case_data(csv_filepath, feature_col_name, label_col_name):
    """Load a CSV file and return its feature and label columns.

    Args:
        csv_filepath: Path of the CSV file to read.
        feature_col_name: Name of the column returned as the features.
        label_col_name: Name of the column returned as the labels.

    Returns:
        A `(features, labels)` pair of pandas Series taken unmodified from the file.
    """
    frame = pd.read_csv(csv_filepath)
    return frame[feature_col_name], frame[label_col_name]

In [None]:
def get_error_cases(features, labels, y_pred, y_true, tagrec_y_pred, tagrec_y_true):
    """Split the test cases by which of the two models got them wrong.

    A model counts as correct on a case when that case's true label is contained
    in that model's predictions for the case. Cases that both models get right
    are not reported.

    Args:
        features: Per-case input texts.
        labels: Per-case labels, stored alongside the feature in each reported case.
        y_pred: Per-case collection of the baseline's predicted labels.
        y_true: Per-case true label, in the encoding used by `y_pred`.
        tagrec_y_pred: Per-case collection of TagRec's predicted labels.
        tagrec_y_true: Per-case true label, in the encoding used by `tagrec_y_pred`.

    Returns:
        A tuple `(baseline_error_cases, tagrec_error_cases, both_error_cases)` of
        lists of `(feature, label)` tuples: cases only the baseline misses, cases
        only TagRec misses, and cases both miss.

    Raises:
        AssertionError: If the six inputs do not all have the same length.
    """
    assert (len(features) == len(labels) == len(y_pred) == len(tagrec_y_pred)
            == len(y_true) == len(tagrec_y_true))

    baseline_error_cases = []
    tagrec_error_cases = []
    both_error_cases = []

    # Iterate positionally: indexing a pandas Series with `series[i]` looks the
    # value up by index *label*, which breaks for any index other than 0..n-1.
    rows = zip(features, labels, y_pred, y_true, tagrec_y_pred, tagrec_y_true)
    for curr_feature, curr_label, pred, truth, tagrec_pred, tagrec_truth in rows:
        curr_case = (curr_feature, curr_label)

        is_baseline_correct = truth in pred
        is_tagrec_correct = tagrec_truth in tagrec_pred

        if is_baseline_correct and not is_tagrec_correct:
            tagrec_error_cases.append(curr_case)
        elif not is_baseline_correct and is_tagrec_correct:
            baseline_error_cases.append(curr_case)
        elif not is_baseline_correct and not is_tagrec_correct:
            both_error_cases.append(curr_case)

    return baseline_error_cases, tagrec_error_cases, both_error_cases

In [None]:
import csv

def write_error_cases_to_csv(error_cases, filename):
    """Write error cases to a two-column CSV file with a `feature,label` header.

    Args:
        error_cases: Iterable of `(feature, label)` rows.
        filename: Path of the CSV file to create; an existing file is overwritten.
    """
    # Fix the encoding so free-text features with non-ASCII characters are written
    # the same way on every platform; newline='' lets the csv module control line
    # endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['feature', 'label'])
        writer.writerows(error_cases)

def analyze_error_cases(test_set_path,
                        test_data_path,
                        model_checkpoint,
                        y_pred,
                        y_true,
                        feature_col_name,
                        label_col_name,
                        output_dir=None,
                        use_wdc=False):
    """Compare a baseline's predictions with TagRec's and collect the disagreements.

    TagRec predictions are obtained from `get_tagrec_predictions`, the raw test
    cases are read from `test_data_path`, and `get_error_cases` sorts the cases
    into three groups. When `output_dir` is given, each group is additionally
    written to its own CSV file inside that directory (created if missing).

    Args:
        test_set_path: Forwarded to `get_tagrec_predictions`.
        test_data_path: Test CSV; used for TagRec and to read the raw cases.
        model_checkpoint: Forwarded to `get_tagrec_predictions`.
        y_pred: Baseline predictions, one collection per test case.
        y_true: True labels matching `y_pred`.
        feature_col_name: CSV column holding the case text.
        label_col_name: CSV column holding the case label.
        output_dir: Directory for the three CSV files, or None to skip writing.
        use_wdc: Forwarded to `get_tagrec_predictions`.

    Returns:
        `(baseline_error_cases, tagrec_error_cases, both_error_cases)` as produced
        by `get_error_cases`.
    """
    tagrec_y_pred, tagrec_y_true = get_tagrec_predictions(
        test_set_path=test_set_path,
        test_data_path=test_data_path,
        label_col_name=label_col_name,
        model_checkpoint=model_checkpoint,
        use_wdc=use_wdc,
    )
    case_features, case_labels = get_case_data(
        csv_filepath=test_data_path,
        feature_col_name=feature_col_name,
        label_col_name=label_col_name,
    )
    baseline_error_cases, tagrec_error_cases, both_error_cases = get_error_cases(
        case_features, case_labels, y_pred, y_true, tagrec_y_pred, tagrec_y_true,
    )

    if output_dir is None:
        return baseline_error_cases, tagrec_error_cases, both_error_cases

    os.makedirs(output_dir, exist_ok=True)
    csv_outputs = (
        ('baseline_error_cases.csv', baseline_error_cases),
        ('tagrec_error_cases.csv', tagrec_error_cases),
        ('both_error_cases.csv', both_error_cases),
    )
    for csv_name, cases in csv_outputs:
        write_error_cases_to_csv(cases, os.path.join(output_dir, csv_name))

    return baseline_error_cases, tagrec_error_cases, both_error_cases

# Evaluation: ARC

## Pre-trained SentBERT

In [None]:
# Build the pre-trained SentBERT predictor for the ARC test CSV, then print
# recall@k for several k. recall_at_k returns a pair; only its second element
# (the score) is used here.
arc_pretrained_sent_bert_baseline = get_pretrained_sent_bert_baseline(
    test_data_path='data/ARC_data_test.csv',
    feature_col_name='Question',
    label_col_name='QCLabel',
)
for k in [5, 10, 15, 20]:
    y_pred, y_true = arc_pretrained_sent_bert_baseline(k=k)
    _, recall_score = recall_at_k(y_pred, y_true)
    print(f'recall@{k}: {recall_score}')

recall@5: 0.31214285714285717
recall@10: 0.45571428571428574
recall@15: 0.5385714285714286
recall@20: 0.585


In [None]:
# Error analysis at k=10: compare the SentBERT baseline with TagRec on the ARC
# test CSV and write the three error groups as CSVs under output_dir.
# The cell displays (#baseline-only errors, #TagRec-only errors, #errors of both).
y_pred, y_true = arc_pretrained_sent_bert_baseline(k=10)
baseline_error_cases, tagrec_error_cases, both_error_cases = analyze_error_cases(
    test_set_path='data/ARC_sent_bert_test.pt',
    test_data_path='data/ARC_data_test.csv',
    model_checkpoint='models/ARC_BERT_SENT_BERT_best.pth.tar',
    y_pred=y_pred,
    y_true=y_true,
    feature_col_name='Question',
    label_col_name='QCLabel',
    output_dir='./baseline_analysis/ARC_PreBERT'
)
len(baseline_error_cases), len(tagrec_error_cases), len(both_error_cases)

There are 1 GPU(s) available.
Using GPU: NVIDIA A100-SXM4-40GB


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(559, 69, 203)

## BERT Classification

In [None]:
# Fine-tune the BERT classifier on the ARC train/val CSVs (training runs inside
# this call), then print recall@k on the test CSV for several k. The returned
# evaluator reuses the same fine-tuned model for every k.
arc_bert_classification_baseline = get_bert_classification_baseline(
    train_data_path='data/ARC_data_train.csv',
    val_data_path='data/ARC_data_val.csv',
    test_data_path='data/ARC_data_test.csv',
    feature_col_name='Question',
    label_col_name='QCLabel',
)
for k in [5, 10, 15, 20]:
    y_pred, y_true = arc_bert_classification_baseline(k=k)
    _, recall_score = recall_at_k(y_pred, y_true)
    print(f'recall@{k}: {recall_score}')

Map:   0%|          | 0/5597 [00:00<?, ? examples/s]

Map:   0%|          | 0/778 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Using GPU: NVIDIA A100-SXM4-40GB


  0%|          | 0/875 [00:00<?, ?it/s]

val_loss: tensor(0.1530, device='cuda:0')
val_loss: tensor(0.1296, device='cuda:0')
val_loss: tensor(0.1159, device='cuda:0')
val_loss: tensor(0.1090, device='cuda:0')
val_loss: tensor(0.1065, device='cuda:0')
recall@5: 0.6135714285714285
recall@10: 0.7028571428571428
recall@15: 0.7564285714285715
recall@20: 0.79


In [None]:
# Error analysis at k=10: compare the fine-tuned BERT classifier with TagRec on
# the ARC test CSV and write the three error groups as CSVs under output_dir.
# The cell displays (#baseline-only errors, #TagRec-only errors, #errors of both).
y_pred, y_true = arc_bert_classification_baseline(k=10)
baseline_error_cases, tagrec_error_cases, both_error_cases = analyze_error_cases(
    test_set_path='data/ARC_sent_bert_test.pt',
    test_data_path='data/ARC_data_test.csv',
    model_checkpoint='models/ARC_BERT_SENT_BERT_best.pth.tar',
    y_pred=y_pred,
    y_true=y_true,
    feature_col_name='Question',
    label_col_name='QCLabel',
    output_dir='./baseline_analysis/ARC_BERTClass'
)
len(baseline_error_cases), len(tagrec_error_cases), len(both_error_cases)

There are 1 GPU(s) available.
Using GPU: NVIDIA A100-SXM4-40GB


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(221, 77, 195)

# Evaluation: Non-hierarchical ARC

## Pre-trained SentBERT

In [None]:
# Build the pre-trained SentBERT predictor for the non-hierarchical ARC test
# CSV, then print recall@k for several k. Only the second element returned by
# recall_at_k (the score) is used.
nharc_pretrained_sent_bert_baseline = get_pretrained_sent_bert_baseline(
    test_data_path='data/NonHierarchical_ARC_raw_test.csv',
    feature_col_name='question',
    label_col_name='label',
)
for k in [5, 10, 15, 20]:
    y_pred, y_true = nharc_pretrained_sent_bert_baseline(k=k)
    _, recall_score = recall_at_k(y_pred, y_true)
    print(f'recall@{k}: {recall_score}')

Downloading (…)7cff1/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Downloading (…)e20647cff1/README.md:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

Downloading (…)0647cff1/config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)7cff1/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)e20647cff1/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)647cff1/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

recall@5: 0.25642857142857145
recall@10: 0.3385714285714286
recall@15: 0.39285714285714285
recall@20: 0.43214285714285716


In [None]:
# Error analysis at k=10: compare the SentBERT baseline with TagRec on the
# non-hierarchical ARC test CSV and write the error groups under output_dir.
# The cell displays (#baseline-only errors, #TagRec-only errors, #errors of both).
y_pred, y_true = nharc_pretrained_sent_bert_baseline(k=10)
baseline_error_cases, tagrec_error_cases, both_error_cases = analyze_error_cases(
    test_set_path='data/NonHierarchical_ARC_sent_bert_test.pt',
    test_data_path='data/NonHierarchical_ARC_raw_test.csv',
    model_checkpoint='models/NHARC_BERT_SENT_BERT_best.pth.tar',
    y_pred=y_pred,
    y_true=y_true,
    feature_col_name='question',
    label_col_name='label',
    output_dir='./baseline_analysis/NHARC_PreBERT'
)
len(baseline_error_cases), len(tagrec_error_cases), len(both_error_cases)

There are 1 GPU(s) available.
Using GPU: NVIDIA A100-SXM4-40GB


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(691, 63, 235)

## BERT Classification

In [None]:
# Fine-tune the BERT classifier on the non-hierarchical ARC train/val CSVs
# (training runs inside this call), then print recall@k on the test CSV.
nharc_bert_classification_baseline = get_bert_classification_baseline(
    train_data_path='data/NonHierarchical_ARC_raw_train.csv',
    val_data_path='data/NonHierarchical_ARC_raw_val.csv',
    test_data_path='data/NonHierarchical_ARC_raw_test.csv',
    feature_col_name='question',
    label_col_name='label',
)
for k in [5, 10, 15, 20]:
    y_pred, y_true = nharc_bert_classification_baseline(k=k)
    _, recall_score = recall_at_k(y_pred, y_true)
    print(f'recall@{k}: {recall_score}')

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/5597 [00:00<?, ? examples/s]

Map:   0%|          | 0/778 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Using GPU: NVIDIA A100-SXM4-40GB


  0%|          | 0/875 [00:00<?, ?it/s]

val_loss: tensor(0.1529, device='cuda:0')
val_loss: tensor(0.1264, device='cuda:0')
val_loss: tensor(0.1114, device='cuda:0')
val_loss: tensor(0.1049, device='cuda:0')
val_loss: tensor(0.1022, device='cuda:0')
recall@5: 0.65
recall@10: 0.7471428571428571
recall@15: 0.7914285714285715
recall@20: 0.8178571428571428


In [None]:
# Error analysis at k=10: compare the fine-tuned BERT classifier with TagRec on
# the non-hierarchical ARC test CSV and write the error groups under output_dir.
# The cell displays (#baseline-only errors, #TagRec-only errors, #errors of both).
y_pred, y_true = nharc_bert_classification_baseline(k=10)
baseline_error_cases, tagrec_error_cases, both_error_cases = analyze_error_cases(
    test_set_path='data/NonHierarchical_ARC_sent_bert_test.pt',
    test_data_path='data/NonHierarchical_ARC_raw_test.csv',
    model_checkpoint='models/NHARC_BERT_SENT_BERT_best.pth.tar',
    y_pred=y_pred,
    y_true=y_true,
    feature_col_name='question',
    label_col_name='label',
    output_dir='./baseline_analysis/NHARC_BERTClass'
)
len(baseline_error_cases), len(tagrec_error_cases), len(both_error_cases)

There are 1 GPU(s) available.
Using GPU: NVIDIA A100-SXM4-40GB


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(172, 116, 182)

# Evaluation: Khan Academy

## Pre-trained SentBERT

In [None]:
# Build the pre-trained SentBERT predictor for the Khan Academy test CSV, then
# print recall@k for several k. Only the second element returned by
# recall_at_k (the score) is used.
khan_pretrained_sent_bert_baseline = get_pretrained_sent_bert_baseline(
    test_data_path='data/KhanAcad_mod_test.csv',
    feature_col_name='video_transcripts',
    label_col_name='hierarchy',
)
for k in [5, 10, 15, 20]:
    y_pred, y_true = khan_pretrained_sent_bert_baseline(k=k)
    _, recall_score = recall_at_k(y_pred, y_true)
    print(f'recall@{k}: {recall_score}')

recall@5: 0.1518624641833811
recall@10: 0.23400191021967526
recall@15: 0.28653295128939826
recall@20: 0.32855778414517667


In [None]:
# Error analysis at k=10: compare the SentBERT baseline with TagRec on the
# Khan Academy test CSV and write the three error groups under output_dir.
# The cell displays (#baseline-only errors, #TagRec-only errors, #errors of both).
y_pred, y_true = khan_pretrained_sent_bert_baseline(k=10)
baseline_error_cases, tagrec_error_cases, both_error_cases = analyze_error_cases(
    test_set_path='data/KhanAcad_sent_bert_test.pt',
    test_data_path='data/KhanAcad_mod_test.csv',
    model_checkpoint='models/KHAN_BERT_SENT_BERT_best.pth.tar',
    y_pred=y_pred,
    y_true=y_true,
    feature_col_name='video_transcripts',
    label_col_name='hierarchy',
    output_dir='./baseline_analysis/KhanAcad_PreBERT'
)
len(baseline_error_cases), len(tagrec_error_cases), len(both_error_cases)

There are 1 GPU(s) available.
Using GPU: NVIDIA A100-SXM4-40GB


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(494, 32, 308)

## BERT Classification

In [None]:
# Fine-tune the BERT classifier on the Khan Academy train/val CSVs (training
# runs inside this call), then print recall@k on the test CSV for several k.
khan_bert_classification_baseline = get_bert_classification_baseline(
    train_data_path='data/KhanAcad_mod_train.csv',
    val_data_path='data/KhanAcad_mod_val.csv',
    test_data_path='data/KhanAcad_mod_test.csv',
    feature_col_name='video_transcripts',
    label_col_name='hierarchy',
)
for k in [5, 10, 15, 20]:
    y_pred, y_true = khan_bert_classification_baseline(k=k)
    _, recall_score = recall_at_k(y_pred, y_true)
    print(f'recall@{k}: {recall_score}')

Map:   0%|          | 0/4188 [00:00<?, ? examples/s]

Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Map:   0%|          | 0/1047 [00:00<?, ? examples/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Using GPU: NVIDIA A100-SXM4-40GB


  0%|          | 0/655 [00:00<?, ?it/s]

val_loss: tensor(0.1793, device='cuda:0')
val_loss: tensor(0.1584, device='cuda:0')
val_loss: tensor(0.1465, device='cuda:0')
val_loss: tensor(0.1402, device='cuda:0')
val_loss: tensor(0.1379, device='cuda:0')
recall@5: 0.4326647564469914
recall@10: 0.5234001910219676
recall@15: 0.5644699140401146
recall@20: 0.6026743075453678


In [None]:
# Error analysis at k=10: compare the fine-tuned BERT classifier with TagRec on
# the Khan Academy test CSV and write the three error groups under output_dir.
# The cell displays (#baseline-only errors, #TagRec-only errors, #errors of both).
y_pred, y_true = khan_bert_classification_baseline(k=10)
baseline_error_cases, tagrec_error_cases, both_error_cases = analyze_error_cases(
    test_set_path='data/KhanAcad_sent_bert_test.pt',
    test_data_path='data/KhanAcad_mod_test.csv',
    model_checkpoint='models/KHAN_BERT_SENT_BERT_best.pth.tar',
    y_pred=y_pred,
    y_true=y_true,
    feature_col_name='video_transcripts',
    label_col_name='hierarchy',
    output_dir='./baseline_analysis/KhanAcad_BERTClass'
)
len(baseline_error_cases), len(tagrec_error_cases), len(both_error_cases)

There are 1 GPU(s) available.
Using GPU: NVIDIA A100-SXM4-40GB


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(238, 79, 261)

# Evaluation: WDC



## Pre-trained SentBERT

In [None]:
# Build the pre-trained SentBERT predictor for the WDC test CSV, then print
# recall@k for several k. Only the second element returned by recall_at_k
# (the score) is used.
wdc_pretrained_sent_bert_baseline = get_pretrained_sent_bert_baseline(
    test_data_path='data/WDC_clean_test.csv',
    feature_col_name='feature',
    label_col_name='label',
)
for k in [5, 10, 15, 20]:
    y_pred, y_true = wdc_pretrained_sent_bert_baseline(k=k)
    _, recall_score = recall_at_k(y_pred, y_true)
    print(f'recall@{k}: {recall_score}')

recall@5: 0.17692767483562463
recall@10: 0.2683801554094441
recall@15: 0.3239689181111775
recall@20: 0.3747758517632995


In [None]:
# Error analysis at k=10: compare the SentBERT baseline with TagRec on the WDC
# test CSV (use_wdc=True is forwarded to get_tagrec_predictions) and write the
# three error groups under output_dir.
# The cell displays (#baseline-only errors, #TagRec-only errors, #errors of both).
y_pred, y_true = wdc_pretrained_sent_bert_baseline(k=10)
baseline_error_cases, tagrec_error_cases, both_error_cases = analyze_error_cases(
    test_set_path='data/WDC_test_set_bert_large.pt',
    test_data_path='data/WDC_clean_test.csv',
    model_checkpoint='models/WDC_BERT_SENT_BERT_best.pth.tar',
    y_pred=y_pred,
    y_true=y_true,
    feature_col_name='feature',
    label_col_name='label',
    output_dir='./baseline_analysis/WDC_PreBERT',
    use_wdc=True,
)
len(baseline_error_cases), len(tagrec_error_cases), len(both_error_cases)

There are 1 GPU(s) available.
Using GPU: NVIDIA A100-SXM4-40GB


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(1148, 20, 76)

## BERT Classification

In [None]:
# Fine-tune the BERT classifier on the WDC train/val CSVs (training runs inside
# this call), then print recall@k on the test CSV for several k.
wdc_bert_classification_baseline = get_bert_classification_baseline(
    train_data_path='data/WDC_clean_train.csv',
    val_data_path='data/WDC_clean_val.csv',
    test_data_path='data/WDC_clean_test.csv',
    feature_col_name='feature',
    label_col_name='label',
)
for k in [5, 10, 15, 20]:
    y_pred, y_true = wdc_bert_classification_baseline(k=k)
    _, recall_score = recall_at_k(y_pred, y_true)
    print(f'recall@{k}: {recall_score}')

Map:   0%|          | 0/5852 [00:00<?, ? examples/s]

Map:   0%|          | 0/836 [00:00<?, ? examples/s]

Map:   0%|          | 0/1673 [00:00<?, ? examples/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Using GPU: NVIDIA A100-SXM4-40GB


  0%|          | 0/915 [00:00<?, ?it/s]

val_loss: tensor(0.0821, device='cuda:0')
val_loss: tensor(0.0674, device='cuda:0')
val_loss: tensor(0.0582, device='cuda:0')
val_loss: tensor(0.0538, device='cuda:0')
val_loss: tensor(0.0524, device='cuda:0')
recall@5: 0.8374178123132098
recall@10: 0.8780633592349073
recall@15: 0.8924088463837417
recall@20: 0.9019725044829647


In [None]:
# Error analysis at k=10: compare the fine-tuned BERT classifier with TagRec on
# the WDC test CSV (use_wdc=True is forwarded to get_tagrec_predictions) and
# write the three error groups under output_dir.
# The cell displays (#baseline-only errors, #TagRec-only errors, #errors of both).
y_pred, y_true = wdc_bert_classification_baseline(k=10)
baseline_error_cases, tagrec_error_cases, both_error_cases = analyze_error_cases(
    test_set_path='data/WDC_test_set_bert_large.pt',
    test_data_path='data/WDC_clean_test.csv',
    model_checkpoint='models/WDC_BERT_SENT_BERT_best.pth.tar',
    y_pred=y_pred,
    y_true=y_true,
    feature_col_name='feature',
    label_col_name='label',
    output_dir='./baseline_analysis/WDC_BERTClass',
    use_wdc=True,
)
len(baseline_error_cases), len(tagrec_error_cases), len(both_error_cases)

There are 1 GPU(s) available.
Using GPU: NVIDIA A100-SXM4-40GB


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(142, 34, 62)