# Package preparation

In [None]:
!pip install transformers
!nvidia-smi

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 11.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 46.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 56.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

## Import necessary packages

In [None]:
import torch
from torch import nn, Tensor # neural network
from torch.nn import functional as F

# numerical matrix processing
import numpy as np 
from numpy import ndarray

# data/parameter loading
import pandas as pd 
import pickle

# visualization
from tqdm.notebook import trange, tqdm

# transformers
from transformers import AutoTokenizer, AutoModel
from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions

# type hints
from typing import Union, List, Dict
# filter out warnings
import warnings
warnings.filterwarnings('ignore')

## Some useful functions

In [None]:
# Utils
def save_parameter(save_object, save_file):
    """
    Pickle an object to disk with the highest available pickle protocol.
    Args:
        save_object: any picklable Python object
        save_file: str, destination path (opened in binary write mode, so an existing file is overwritten)
    """
    with open(save_file, mode='wb') as sink:
        pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL).dump(save_object)

def load_parameter(load_file):
    """
    Read back an object that was written with pickle.
    Args:
        load_file: str, path of the pickle file
    Returns:
        the unpickled Python object
    Note: unpickling can execute arbitrary code, so only use this on files you trust.
    """
    with open(load_file, mode='rb') as source:
        return pickle.load(source)

def sim_matrix(a, b, eps=1e-8):
    """
    Pairwise cosine similarity between the rows of two 2-D tensors.
    Args:
        a: Tensor [n, d]
        b: Tensor [m, d]
        eps: lower bound applied to each row norm for numerical stability
             (an all-zero row therefore yields similarity 0 instead of NaN)
    Returns:
        Tensor [n, m] where entry (i, j) is the cosine similarity of a[i] and b[j]
    """
    def _unit_rows(mat):
        # Row-wise L2 length kept as a column so it broadcasts over the feature axis
        lengths = mat.norm(dim=1).unsqueeze(1)
        return mat / lengths.clamp(min=eps)

    return torch.mm(_unit_rows(a), _unit_rows(b).t())

def batch2device(batch, device):
    """
    Move every tensor of a batch to the requested device.
    Args:
        batch: Dict[str, Tensor], transformer inputs (input_ids, attention_mask)
        device: torch.device, GPU or CPU
    Returns:
        the same mapping object, whose values have been replaced by their `.to(device)` result
    """
    # Only existing keys are reassigned, so updating the mapping while iterating is safe
    for name, tensor in batch.items():
        batch[name] = tensor.to(device)
    return batch

def count_parameters(model):
    """
    Count the trainable scalar parameters of a module.
    Args:
        model: nn.Module
    Returns:
        int, total number of elements over all parameters with requires_grad=True
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


# Data preparation

In [None]:
# Project root (the path looks like a Colab Google Drive mount) and the sub-folder
# from which the notebook reads data / pickles and where it writes model weights
work_path = "/content/drive/MyDrive/PaperRecommendation/"
checkpoint_path = "".join([work_path, "checkpoint/"])
# Use the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def _read_split(relative_file):
    """Read one ISO-8859-1 encoded CSV below `checkpoint_path` and replace missing cells by ""."""
    return pd.read_csv(checkpoint_path + relative_file, encoding="ISO-8859-1").fillna("")


data_train = _read_split("data/01_train.csv")
data_validate = _read_split("data/01_validate.csv")
data_test = _read_split("data/01_test.csv")
data_aims = _read_split("data/01_aims.csv")

# One class per row of the aims table
n_classes = len(data_aims)

## Feature selection

In [None]:
def _tak_text(frame):
    """Join the Title, Abstract and Keywords columns (space separated) into one string per row."""
    pieces = [frame[column] for column in ('Title', 'Abstract', 'Keywords')]
    joined = pieces[0]
    for piece in pieces[1:]:
        joined = joined + " " + piece
    return joined.tolist()


# Model inputs: one "title abstract keywords" string per paper
X_train = _tak_text(data_train)
X_valid = _tak_text(data_validate)
X_test = _tak_text(data_test)

# Text of the aims table, one entry per row
X_aims = data_aims["Aims"].tolist()

# Targets, taken from the 'Label' column of each split
Y_train, Y_validate, Y_test = (
    frame['Label'].tolist() for frame in (data_train, data_validate, data_test)
)

## Tokenization

In [None]:
# Load the pretrained tokenizer published under the 'distilroberta-base' name.
# It is kept as a module-level object because ModelForSE.encode() reads `tokenizer` directly.
tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [None]:
def _load_or_build_encodings(texts, cache_file):
    """
    Return the tokenizer encodings for `texts`, using a pickle cache when it exists.
    Args:
        texts: List[str], raw input strings
        cache_file: str, path of the pickle holding previously computed encodings
    Returns:
        the cached encodings if `cache_file` exists, otherwise freshly computed encodings
        (padded / truncated to 300 tokens, returned as PyTorch tensors) that are also
        written to `cache_file` for the next run
    """
    try:
        # load_parameter unpickles the file: only point it at caches created by this notebook
        return load_parameter(cache_file)
    except FileNotFoundError:
        # First run (or cache removed): tokenize once and persist the result
        encodings = tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=300,
            return_tensors="pt"
        )
        save_parameter(encodings, cache_file)
        return encodings


train_encodings = _load_or_build_encodings(X_train, checkpoint_path + "pickle/distilroberta_training_encodings.pickle")
valid_encodings = _load_or_build_encodings(X_valid, checkpoint_path + "pickle/distilroberta_valid_encodings.pickle")
test_encodings = _load_or_build_encodings(X_test, checkpoint_path + "pickle/distilroberta_test_encodings.pickle")

## Data loader

In [None]:
class Dataset(torch.utils.data.Dataset):
    """
    Map-style dataset pairing tokenizer encodings with class labels.
    Args:
        encodings: mapping from field name (e.g. input_ids, attention_mask) to an indexable
            collection with one entry per sample (tensor or nested list)
        labels: indexable collection with one label per sample
    """
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of `value`."""
        # torch.tensor() on an existing tensor copy-constructs and raises a UserWarning;
        # clone().detach() is the recommended way to take the same copy.
        if torch.is_tensor(value):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return (features, label) for sample `idx`; features is a dict of tensors."""
        x = {
            key: self._to_tensor(val[idx]) for key, val in self.encodings.items()
        }
        y = self._to_tensor(self.labels[idx])
        return x, y

    def __len__(self):
        """Number of samples, defined by the number of labels."""
        return len(self.labels)

In [None]:
# Dataset
# One Dataset per split: tokenizer encodings paired with that split's labels
train_dataset = Dataset(encodings=train_encodings, labels=Y_train)
valid_dataset = Dataset(encodings=valid_encodings, labels=Y_validate)
test_dataset = Dataset(encodings=test_encodings, labels=Y_test)

In [None]:
# Data loaders
# Only the training split is shuffled. The evaluation splits keep a fixed order so that
# the validation loss (an average of per-batch means, used for checkpoint selection) is
# computed over identical batches in every epoch.
train_loader = torch.utils.data.DataLoader(train_dataset,
                                         batch_size=16,
                                         shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                         batch_size=8,
                                         shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset,
                                         batch_size=8,
                                         shuffle=False)

# Model definition

## Pooler layer

In [None]:
class Pooler(nn.Module):
    """
    Parameter-free poolers to get the sentence embedding.
    'cls' / 'cls_before_pooler': hidden state of the first token of the last layer.
        Both options return the same tensor; no MLP pooler is applied inside this module.
    'avg': masked average of the last layer's hidden states.
    'avg_top2': masked average of the mean of the last two entries of `hidden_states`.
    'avg_first_last': masked average of the mean of the first and last entries of `hidden_states`.
        NOTE(review): for Hugging Face encoders entry 0 is presumably the embedding output
        rather than the first transformer layer - confirm this is what is wanted.
    """
    POOLER_TYPES = ("cls", "cls_before_pooler", "avg", "avg_top2", "avg_first_last")

    def __init__(self, pooler_type):
        super().__init__()
        self.pooler_type = pooler_type
        assert self.pooler_type in self.POOLER_TYPES, "unrecognized pooling type %s" % self.pooler_type

    @staticmethod
    def _masked_mean(hidden, attention_mask):
        """Average `hidden` [bs, seq, dim] over the positions where `attention_mask` [bs, seq] is 1."""
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        # A row with no attended token would divide by zero (NaN); the clamp makes it
        # return a zero vector and leaves every non-empty row unchanged.
        token_count = mask.sum(1).clamp(min=1e-9)
        return (hidden * mask).sum(1) / token_count

    def forward(self, attention_mask, outputs):
        """
        Args:
            attention_mask: Tensor [bs, seq] with 1 for real tokens and 0 for padding
            outputs: encoder output exposing `last_hidden_state` and, for the
                'avg_top2' / 'avg_first_last' poolers, `hidden_states`
        Returns:
            Tensor [bs, dim], one embedding per sequence
        """
        if self.pooler_type in ('cls_before_pooler', 'cls'):
            return outputs.last_hidden_state[:, 0]
        if self.pooler_type == "avg":
            return self._masked_mean(outputs.last_hidden_state, attention_mask)
        if self.pooler_type == "avg_first_last":
            hidden_states = outputs.hidden_states
            return self._masked_mean((hidden_states[0] + hidden_states[-1]) / 2.0, attention_mask)
        if self.pooler_type == "avg_top2":
            hidden_states = outputs.hidden_states
            return self._masked_mean((hidden_states[-1] + hidden_states[-2]) / 2.0, attention_mask)
        raise NotImplementedError


## Sentence Embedder

In [None]:
class ModelForSE(nn.Module):
    """
    Model for sentence embedding: a pretrained transformer encoder followed by a `Pooler`.
    Args:
        model_name_or_path: name or path handed to `AutoModel.from_pretrained`
        pooler_type: one of the pooling strategies accepted by `Pooler`
    """
    def __init__(self, model_name_or_path, pooler_type):
        super(ModelForSE, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name_or_path)
        self.pooler_type = pooler_type
        self.pooler = Pooler(self.pooler_type)

    def forward(self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        mlm_input_ids=None,
        mlm_labels=None,
    ):
        """
        Encode a batch and pool it into one vector per sequence.
        Only input_ids, attention_mask, head_mask, inputs_embeds and output_attentions are
        forwarded to the encoder; the remaining arguments are accepted for signature
        compatibility and ignored.
        Returns:
            BaseModelOutputWithPoolingAndCrossAttentions with `pooler_output`,
            `last_hidden_state` and `hidden_states` (None unless the pooler needs them)
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            # Hidden states of all layers are only requested when the pooler uses them
            output_hidden_states=self.pooler_type in ('avg_top2', 'avg_first_last'),
            # The code below reads attributes of the output, so a dict-style output is required
            return_dict=True,
        )
        pooler_output = self.pooler(attention_mask, outputs)

        return BaseModelOutputWithPoolingAndCrossAttentions(
            pooler_output=pooler_output,
            last_hidden_state=outputs.last_hidden_state,
            hidden_states=outputs.hidden_states,
        )

    def encode(self, sentences: Union[str, List[str]],
               batch_size: int = 8,
               show_progress_bar: bool = None,
               convert_to_numpy: bool = True,
               convert_to_tensor: bool = False,
               device: str = None,
               max_length: int = 300) -> Union[List[Tensor], ndarray, Tensor]:
        """
        Compute sentence embeddings for raw text with the module-level `tokenizer`.
        Args:
            sentences: one string or a list of strings
            batch_size: number of sentences encoded per forward pass
            show_progress_bar: show a progress bar over batches when truthy
            convert_to_numpy: return one ndarray [n, dim] (default)
            convert_to_tensor: return one Tensor [n, dim]; takes precedence over convert_to_numpy
            device: device used for encoding; defaults to CUDA when available, else CPU
            max_length: every sentence is padded / truncated to this many tokens
        Returns:
            ndarray, Tensor or list of CPU tensors (when both convert flags are False);
            a single embedding when `sentences` was a single string
        Side effects:
            puts the model in eval mode and moves it to `device`
        """
        self.eval()

        if convert_to_tensor:
            convert_to_numpy = False

        input_was_string = False

        if isinstance(sentences, str) or not hasattr(sentences, '__len__'): # Cast an individual sentence to a list with length 1
            sentences = [sentences]
            input_was_string = True

        if device is None:
            device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

        self.to(device)

        # Nothing to encode: return an empty container of the requested kind
        # (torch.vstack would raise on an empty list).
        if len(sentences) == 0:
            if convert_to_tensor:
                return torch.empty(0)
            return np.asarray([]) if convert_to_numpy else []

        all_embeddings = []
        for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar):
            sentence_batch = sentences[start_index: start_index+batch_size]
            features = tokenizer(sentence_batch,
                       padding='max_length',
                       truncation=True,
                       max_length=max_length,
                       return_tensors='pt').to(device)

            with torch.no_grad():
                out_features = self.forward(**features)
            # Iterating the [bs, dim] tensor yields one CPU embedding per sentence
            all_embeddings.extend(out_features.pooler_output.cpu())

        if convert_to_tensor:
            all_embeddings = torch.vstack(all_embeddings)
        elif convert_to_numpy:
            all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings])

        if input_was_string:
            all_embeddings = all_embeddings[0]
        return all_embeddings

## Load fine-tuned LM

In [None]:
# Fine-tuned LM checkpoint (by contrastive learning)
# Fine-tuned LM checkpoint (by contrastive learning)
# map_location="cpu" lets a checkpoint that was saved from a GPU be restored on a
# CPU-only runtime as well; the model is moved to `device` once the classifier is built.
checkpoint_cl = torch.load(
    checkpoint_path + "Epoch:09 New-SupCL-DistilRoBERTa.pth",
    map_location="cpu"
)


# Baseline model of sentence embeddings
model_args = {
    "model_name_or_path": "distilroberta-base",
    "pooler_type": "cls_before_pooler"
}
base_model = ModelForSE(**model_args)
# Overwrite the pretrained weights with the contrastively fine-tuned ones
base_model.load_state_dict(checkpoint_cl["model_state_dict"])

Downloading:   0%|          | 0.00/316M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [None]:
class NoAim_Classifier(nn.Module):
    """
    Classifier over the first-token representation of a sentence encoder.
    Args:
        base_model: encoder module; called as base_model(**inputs) and expected to return
            an object exposing `last_hidden_state`
        num_classes: number of output classes
        hidden_size: width of the encoder's hidden states (768 by default)
        proj_size: width of the intermediate layer
        dropout: dropout probability applied after the intermediate activation
    """
    def __init__(self, base_model, num_classes, hidden_size=768, proj_size=512, dropout=0.1):
        super(NoAim_Classifier, self).__init__()
        self.base_model = base_model
        # Attribute names are part of the state-dict keys of saved checkpoints; keep them stable
        self.linear1_1 = nn.Linear(hidden_size, proj_size)
        self.act1_1 = nn.ReLU()
        self.drop1_1 = nn.Dropout(dropout)

        self.linear1_2 = nn.Linear(proj_size, num_classes)
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs_tak):
        '''
        Args:
            inputs_tak: (dict) tokenizer fields of a batch of TAK (title + abstract + keywords)
                samples, unpacked as keyword arguments of the encoder
        Returns:
            (tensor) log-probabilities over the classes, shape as [bs, num_classes]
        '''
        output_tak = self.base_model(**inputs_tak)
        x = output_tak.last_hidden_state[:,0,:] # cls tokens
        x = self.linear1_1(x)
        x = self.act1_1(x)
        x = self.drop1_1(x)

        x = self.linear1_2(x)
        return self.logsoftmax(x)

# Training

In [None]:
# load checkpoint and continue training
# Build the classifier on top of `base_model` and move it to `device`.
# No classifier checkpoint is restored in this cell: only the encoder weights loaded
# into `base_model` earlier are reused, the classification layers are newly created.
model = NoAim_Classifier(base_model, n_classes)
model.to(device)

NoAim_Classifier(
  (base_model): ModelForSE(
    (bert): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense): Linear(in_features=768, o

## Optimizer and Loss function

In [None]:
# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.96)

# Loss function
loss_fn = nn.NLLLoss().to(device)

## Training settings

In [None]:
# Number of passes over the training set
max_epochs = 7
# Cut-offs at which top-k accuracy is reported
topks = [1, 3, 5, 10]
# Per-epoch metric log; every entry starts as its own empty list
history = {
    metric: []
    for metric in ("train_loss", "val_loss", "train_acc@k", "val_acc@k")
}
# Best (lowest) validation loss seen so far; used to decide when to save a checkpoint
min_valid_loss = np.inf

## Training loop

In [None]:
def _topk_hits(logits, labels, ks):
    """
    Count, for every k in `ks`, how many rows have their label among the k highest scores.
    Args:
        logits: Tensor [bs, n_classes] of class scores (log-probabilities here)
        labels: Tensor [bs] of class indices
        ks: iterable of int cut-offs
    Returns:
        Dict[int, int] mapping k -> number of hits in the batch
    """
    # Ranking the log-probabilities directly yields the same order as ranking their
    # exponentials, without the risk of tiny probabilities collapsing to equal values.
    ranked = torch.argsort(logits, dim=1, descending=True)
    # Each label occurs exactly once per row of `ranked`, at the position of its rank
    label_at_rank = ranked == labels.unsqueeze(1)
    return {k: int(label_at_rank[:, :k].any(dim=1).sum().item()) for k in ks}


for epoch in range(max_epochs):
    # Number of correct predictions at each k, accumulated over the whole split
    num_correct_at_k = {
        "train": {k: 0 for k in topks},
        "val": {k: 0 for k in topks}
    }

    # Training
    model.train()
    train_loss = 0.0
    train_loop = tqdm(train_loader, leave=True)
    for features, labels in train_loop:
        # Transfer the batch to the selected device (no-op when already there)
        features, labels = batch2device(features, device), labels.to(device)

        # Clear the gradients, run the forward pass and update the weights
        optimizer.zero_grad()
        logits = model(features)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        # Top-k accuracy of this batch (computed from the pre-update predictions)
        batch_hits = _topk_hits(logits.detach(), labels, topks)
        for k, hits in batch_hits.items():
            num_correct_at_k["train"][k] += hits

        train_loss += loss.item()
        train_loop.set_description('Epoch: {0} - lr: {1}, Training'.format(epoch, optimizer.param_groups[0]['lr']))
        train_loop.set_postfix(
            train_loss=loss.item(),
            **{"top{:02d}".format(k): hits / len(labels) for k, hits in batch_hits.items()}
        )
    # Epoch loss is the mean of the per-batch losses
    train_loss = train_loss/len(train_loader)
    history["train_loss"].append(train_loss)
    history["train_acc@k"].append(
        {k: val/len(train_loader.dataset) for k, val in num_correct_at_k["train"].items()}
    )

    # Validation
    model.eval()
    valid_loss = 0.0
    valid_loop = tqdm(valid_loader, leave=True)
    with torch.no_grad():
        for features, labels in valid_loop:
            features, labels = batch2device(features, device), labels.to(device)
            logits = model(features)
            loss = loss_fn(logits, labels)

            batch_hits = _topk_hits(logits, labels, topks)
            for k, hits in batch_hits.items():
                num_correct_at_k["val"][k] += hits

            valid_loss += loss.item()
            valid_loop.set_description('Epoch: {0} - lr: {1}, Validating'.format(epoch, optimizer.param_groups[0]['lr']))
            valid_loop.set_postfix(
                val_loss=loss.item(),
                **{"val_top{:02d}".format(k): hits / len(labels) for k, hits in batch_hits.items()}
            )
    valid_loss = valid_loss/len(valid_loader)
    history["val_loss"].append(valid_loss)
    history["val_acc@k"].append(
        {k: val/len(valid_loader.dataset) for k, val in num_correct_at_k["val"].items()}
    )
    print(f'>> Epoch {epoch} \t\t Training Loss: {train_loss} \t\t Validation Loss: {valid_loss}')
    # NOTE(review): the scheduler step is left disabled exactly as before, so the learning
    # rate stays constant; re-enable the next line to apply the exponential decay.
    # lr_scheduler.step()

    # Keep a checkpoint every time the validation loss improves
    if min_valid_loss > valid_loss:
        print(f'Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss:.6f}) \t Saving The Model')
        min_valid_loss = valid_loss

        # Saving State Dict
        torch.save(
            {
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'history': history,
                'epoch': epoch
            }, checkpoint_path + "weight/Epoch:{:0>2} DistilRoberta_TAK.pth".format(epoch)
        )

# Testing

In [None]:
# load checkpoint and testing
# load checkpoint and testing
# map_location="cpu" lets a checkpoint saved from a GPU be restored on a CPU-only runtime too.
# NOTE(review): the training loop writes files named "Epoch:NN DistilRoberta_TAK.pth", which
# differs from the file name loaded here - confirm that this file exists.
checkpoint = torch.load(
    checkpoint_path + "weight/Epoch:03 DisRoberta(TAK).pth",
    map_location="cpu"
)

model = NoAim_Classifier(base_model, n_classes)
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)

# Metric log that was stored together with the weights
history = checkpoint['history']

In [None]:
# Loss function
loss_fn = nn.NLLLoss().to(device)

# Test 
topks = [1, 3, 5, 10]
num_correct_at_k = {}
test_loop = tqdm(test_loader, leave=True)
num_correct_at_k["test"] = {k: 0 for k in topks}
batch_test_accuracy = {k: [] for k in topks}
history["test_acc@k"] = []
history["test_loss"] = []
test_loss = 0.0

with torch.no_grad():
    model.eval() 
    for features, labels in test_loop:
        # Transfer Data to GPU if available
        if torch.cuda.is_available():
            features, labels = batch2device(features, device), labels.to(device)
        logits = model(features)
        # Find the Loss
        loss = loss_fn(logits, labels)
        # Calculate accuracy
        probs_des = torch.argsort(torch.exp(logits), axis=1, descending=True)
        for k in topks:
            num_correct = 0
            nPoints = len(labels)
            for i in range(nPoints):
                if labels[i] in probs_des[i, 0:k]:
                    num_correct += 1
                    num_correct_at_k["test"][k] += 1 # globally counting number of correct at each k's for whole valid set
            batch_test_accuracy[k] = num_correct / nPoints
        # Calculate Loss
        test_loss += loss.item()
        test_loop.set_description('Testing...')
        test_loop.set_postfix(test_loss=loss.item(), 
                            test_top01=batch_test_accuracy[1], 
                            test_top03=batch_test_accuracy[3], 
                            test_top05=batch_test_accuracy[5],
                            test_top10=batch_test_accuracy[10])
    test_loss = test_loss/len(test_loader)
    history["test_loss"].append(test_loss)
    history["test_acc@k"].append(
        {k: val/len(X_test) for k, val in num_correct_at_k["test"].items()}
    )

  0%|          | 0/10381 [00:00<?, ?it/s]

# Final results

In [None]:
# Report the most recent entry of every metric stored in `history`
print(">> Final results (Best model): ")
loss_lines = (
    ("\tTraining loss: {}", "train_loss"),
    ("\tValidating loss: {}", "val_loss"),
    ("\tTesting loss: {}", "test_loss"),
)
for template, metric in loss_lines:
    print(template.format(history[metric][-1]))

accuracy_lines = (
    ("\tTrain accuracy@{}: {}", "train_acc@k"),
    ("\tValidate accuracy@{}: {}", "val_acc@k"),
    ("\tTest accuracy@{}: {}", "test_acc@k"),
)
for template, metric in accuracy_lines:
    # Blank separator before each accuracy group
    print("\n")
    last_scores = history[metric][-1]
    for k in topks:
        print(template.format(k, last_scores[k]))

>> Final results (Best model): 
	Training loss: 1.2283474295100631
	Validating loss: 1.4882379100122218
	Testing loss: 1.4920472683294104


	Train accuracy@1: 0.5709195252030558
	Train accuracy@3: 0.8553250401418625
	Train accuracy@5: 0.9213387101640201
	Train accuracy@10: 0.9711950710150612


	Validate accuracy@1: 0.5190816665158234
	Validate accuracy@3: 0.8107521042628292
	Validate accuracy@5: 0.8883156846773463
	Validate accuracy@10: 0.9507647750927686


	Test accuracy@1: 0.5173032463153838
	Test accuracy@3: 0.8097124554474521
	Test accuracy@5: 0.8862344668143725
	Test accuracy@10: 0.9495592910124265
