#Cross-encoder

In [None]:
!pip install transformers datasets huggingface_hub

In [None]:
import torch
# Select the compute device once; the model and every batch are moved to it later.
if not torch.cuda.is_available():
    # CPU fallback when CUDA is not usable.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # CUDA is usable: target the GPU and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
from datasets import load_dataset

# Load the "en" configuration of the PhilipMay/stsb_multi_mt dataset.
# Later cells index the result by split name ("train", "dev", "test") and read
# the "sentence1", "sentence2" and "similarity_score" columns.
datasets = load_dataset("PhilipMay/stsb_multi_mt", "en")

In [None]:
datasets

In [None]:
datasets['train'][0]

In [None]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
# Uses the same 'bert-base-uncased' checkpoint name as the encoder loaded later
# in this file, so the produced token ids correspond to that model's vocabulary.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset

In [None]:
# Tokenize the training pairs: each (sentence1, sentence2) pair is encoded
# together as one sequence, which is what the cross-encoder consumes.
train_inputs = tokenizer(
    datasets["train"]["sentence1"],
    datasets["train"]["sentence2"],
    padding='longest',       # Pad every row to the longest pair in this split so the rows stack into one tensor
    truncation=True,         # Cut pairs that exceed the tokenizer's maximum length instead of producing inputs the model cannot embed
    return_tensors='pt'      # Return PyTorch tensors
)

In [None]:
train_inputs['input_ids'][0]

In [None]:
# Tokenize the validation ("dev") pairs: each (sentence1, sentence2) pair is
# encoded together as one sequence.
valid_inputs = tokenizer(
    datasets["dev"]["sentence1"],
    datasets["dev"]["sentence2"],
    padding='longest',       # Pad every row to the longest pair in this split so the rows stack into one tensor
    truncation=True,         # Cut pairs that exceed the tokenizer's maximum length instead of producing inputs the model cannot embed
    return_tensors='pt'      # Return PyTorch tensors
)

In [None]:
# Tokenize the test pairs: each (sentence1, sentence2) pair is encoded
# together as one sequence.
test_inputs = tokenizer(
    datasets["test"]["sentence1"],
    datasets["test"]["sentence2"],
    padding='longest',       # Pad every row to the longest pair in this split so the rows stack into one tensor
    truncation=True,         # Cut pairs that exceed the tokenizer's maximum length instead of producing inputs the model cannot embed
    return_tensors='pt'      # Return PyTorch tensors
)

In [None]:
def normalize_tensor_range(tensor, new_min=0, new_max=1):
    """Linearly rescale ``tensor`` so that its values span ``[new_min, new_max]``.

    The tensor's own minimum maps to ``new_min`` and its own maximum maps to
    ``new_max``; everything in between is interpolated linearly.

    Args:
        tensor: Numeric tensor to rescale. It is not modified.
        new_min: Lower bound of the output range (default 0).
        new_max: Upper bound of the output range (default 1).

    Returns:
        A new tensor with the same shape as ``tensor``. If every element of
        ``tensor`` is identical, all outputs equal ``new_min`` (rather than
        NaN from a division by zero).
    """
    min_val = tensor.min()
    max_val = tensor.max()
    # Report the observed input range so the caller can sanity-check the data.
    print(min_val, max_val)

    value_range = max_val - min_val
    if value_range == 0:
        # Constant input: (x - min) / 0 would turn every element into NaN,
        # which would silently poison any loss computed from the result.
        out_dtype = tensor.dtype if tensor.is_floating_point() else torch.get_default_dtype()
        return torch.full_like(tensor, new_min, dtype=out_dtype)

    # Scale to [0, 1]
    tensor_scaled = (tensor - min_val) / value_range

    # Scale to [new_min, new_max]
    return tensor_scaled * (new_max - new_min) + new_min

# Gold similarity scores of each split, as tensors.
labels, valid_labels, test_labels = (
    torch.tensor(datasets[split_name]['similarity_score'])
    for split_name in ('train', 'dev', 'test')
)

# Rescale the scores to the default [0, 1] range of normalize_tensor_range,
# matching the sigmoid-activated model output used in the training loop.
# NOTE(review): each split is scaled by its OWN min/max; the three scalings only
# agree if every split covers the same score range - confirm against the data.
normalized_labels, valid_normalized_labels, test_normalized_labels = (
    normalize_tensor_range(scores)
    for scores in (labels, valid_labels, test_labels)
)



In [None]:
normalized_labels

In [None]:
# Bundle each split as (input_ids, attention_mask, normalized score) rows.
# NOTE(review): only these two tokenizer outputs are kept; any other field the
# tokenizer returned (e.g. segment ids) is not carried into the datasets.
train_dataset, validation_dataset, test_dataset = (
    TensorDataset(encoded['input_ids'], encoded['attention_mask'], targets)
    for encoded, targets in (
        (train_inputs, normalized_labels),
        (valid_inputs, valid_normalized_labels),
        (test_inputs, test_normalized_labels),
    )
)


In [None]:
# Training batches of 64 examples; shuffle=True reorders the examples on each pass.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [None]:
# Evaluation loaders: same batch size as training, left in dataset order (no shuffle).
valid_dataloader, test_dataloader = (
    DataLoader(eval_split, batch_size=64)
    for eval_split in (validation_dataset, test_dataset)
)

In [None]:
# for batch in train_dataloader:
#     print(batch)
#     break

In [None]:
from transformers import BertTokenizer, BertModel

In [None]:
# Pretrained 'bert-base-uncased' encoder; it is wrapped by CrossEncoder below,
# which reads its pooled output and adds a single-output linear head.
bert = BertModel.from_pretrained('bert-base-uncased')

In [None]:
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class CrossEncoder(nn.Module):
    """Sentence-pair scorer: a BERT-style encoder followed by a 1-unit linear head.

    Both sentences are expected to be tokenized together into one sequence.
    The encoder's pooled output is passed through dropout and projected to a
    single raw (un-activated) score per example; callers apply any final
    activation (the training loop in this file applies a sigmoid).

    Args:
        bert: Encoder that is callable with ``input_ids`` / ``attention_mask``
            keyword arguments and returns an object exposing ``pooler_output``.
        dropout: Dropout probability applied to the pooled output (default 0.3).
    """

    def __init__(self, bert, dropout=0.3):
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's own config so encoders with a hidden
        # width other than 768 work too; fall back to 768 when no config exists.
        hidden_size = getattr(getattr(bert, "config", None), "hidden_size", 768)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return one raw score per example, shape ``(batch, 1)``.

        Args:
            input_ids: Token ids of the jointly-encoded sentence pairs.
            attention_mask: Mask marking real tokens versus padding.
            token_type_ids: Optional segment ids; forwarded to the encoder only
                when given, so encoders that do not accept them keep working.
        """
        encoder_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask}
        if token_type_ids is not None:
            encoder_kwargs["token_type_ids"] = token_type_ids

        outputs = self.bert(**encoder_kwargs)
        pooled = self.dropout(outputs.pooler_output)
        return self.linear(pooled)

In [None]:
from tqdm import tqdm
def _batch_loss(model, batch, loss_fn):
    """Run one (input_ids, attention_mask, labels) batch through ``model`` and return its loss."""
    input_ids = batch[0].to(device)
    attention_mask = batch[1].to(device)
    labels = batch[2].to(device)

    logits = model(input_ids=input_ids, attention_mask=attention_mask)
    # squeeze(-1) drops only the trailing size-1 score dimension; a bare
    # squeeze() would also drop the batch dimension for a batch of one example.
    predictions = torch.sigmoid(logits).squeeze(-1)
    return loss_fn(predictions, labels)


def _mean_loss(losses):
    """Average a list of scalar loss tensors into a float (NaN when the list is empty)."""
    if not losses:
        return float('nan')
    return torch.stack(losses).mean().item()


def train(epochs, lr, model, train_dataloader, valid_dataloader, opt_func=torch.optim.Adam):
    """Fine-tune ``model`` with MSE loss on sigmoid-activated scores.

    Relies on the module-level ``device`` and on ``tqdm`` being imported.

    Args:
        epochs: Number of passes over ``train_dataloader``.
        lr: Learning rate handed to ``opt_func``.
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        train_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        valid_dataloader: Same batch layout; evaluated after every epoch
            without gradient tracking.
        opt_func: Optimizer constructor called as ``opt_func(params, lr)``.

    Returns:
        A list with one dict per epoch:
        ``{'epoch': int, 'train_loss': float, 'valid_loss': float}``, where
        each loss is the mean of the per-batch losses.
    """
    history = []
    optimizer = opt_func(model.parameters(), lr)
    loss_fn = torch.nn.MSELoss()  # stateless, so build it once instead of per batch

    for epoch in range(epochs):
        model.train()
        train_losses = []
        for batch in tqdm(train_dataloader):
            loss = _batch_loss(model, batch, loss_fn)
            loss.backward()  # calculate gradients
            optimizer.step()
            optimizer.zero_grad()
            # Store a detached copy: keeping the live loss would hold every
            # batch's autograd graph in memory until the end of the epoch.
            train_losses.append(loss.detach())
        train_loss = _mean_loss(train_losses)
        print('epoc_training_loss', train_loss)

        model.eval()
        valid_losses = []
        with torch.no_grad():
            for batch in tqdm(valid_dataloader):
                valid_losses.append(_batch_loss(model, batch, loss_fn))
        valid_loss = _mean_loss(valid_losses)
        print('epoc_validation_loss', valid_loss)

        history.append({'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss})

    return history


In [None]:
# Wrap the pretrained encoder with the scoring head and move it to the selected device.
model = CrossEncoder(bert).to(device)
# Switch to training mode (enables dropout).
model.train()

In [None]:
# First run: a single epoch at learning rate 1e-5.
epochs, lr = 1, 1e-5
history = train(
    epochs=epochs,
    lr=lr,
    model=model,
    train_dataloader=train_dataloader,
    valid_dataloader=valid_dataloader,
)

In [None]:
# Continue training the same model for 6 more epochs at learning rate 1e-5.
# Each call to train() builds a new optimizer, so optimizer state starts fresh.
epochs, lr = 6, 1e-5
history = train(
    epochs=epochs,
    lr=lr,
    model=model,
    train_dataloader=train_dataloader,
    valid_dataloader=valid_dataloader,
)

In [None]:
# Continue training the same model for 2 more epochs at learning rate 1e-5.
# Each call to train() builds a new optimizer, so optimizer state starts fresh.
epochs, lr = 2, 1e-5
history = train(
    epochs=epochs,
    lr=lr,
    model=model,
    train_dataloader=train_dataloader,
    valid_dataloader=valid_dataloader,
)