### Softmax classification loss 
- Not used in practice

- Models are trained using either a siamese network or a triplet network.

### Siamese network
- two copies of the same BERT
- Sentences A and B are fed into the BERT model to produce token embeddings
- A pooling operation is used to create a sentence embedding (mean pooling)
- Goal is to optimize so that sentence embeddings are as close as possible for similar sentences
- Sentence vectors are concatenated to create the vector (u, v, |u-v|) with dimensionality of 768 * 3
- The concatenated vector is fed into a FFNN
- FFNN output is 3 output activations
    - Within the NLI training data there are entailment, neutral, and contradiction
- Output is the "true" label
- Optimized using cross-entropy loss, which applies a softmax function internally

### Multiple negatives ranking loss (MNR)
- Used in practice 

In [1]:
import datasets
# Load the training split of SNLI (Stanford Natural Language Inference).
snli = datasets.load_dataset('snli', split = 'train')
# Bare expression: in a notebook this displays the dataset summary.
snli
# Each row is a sentence pair: sentence A is the premise, sentence B is the hypothesis.

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 550152
})

In [2]:
# MNLI is distributed as the 'mnli' configuration of the GLUE dataset.
mnli = datasets.load_dataset('glue', 'mnli', split='train')
mnli
# MNLI carries an extra 'idx' column; drop it so both datasets expose the
# same columns before they are merged.
mnli = mnli.remove_columns(['idx'])

# Cast SNLI to MNLI's feature schema so the two datasets can be concatenated.
snli = snli.cast(mnli.features)

dataset = datasets.concatenate_datasets([snli, mnli])
dataset  # contains 942854 rows at this point, including error rows

# Rows labelled -1 are errors: keep only the rows whose label is not -1.
dataset = dataset.filter(lambda x: x['label'] != -1)
dataset

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 942069
})

In [None]:
from sentence_transformers import InputExample  # data format the library trains on
from tqdm.auto import tqdm                      # progress bar

# Wrap every dataset row in an InputExample: the premise/hypothesis pair is
# the model input and the row's label is the training target.
train_samples = [
    InputExample(
        texts=[row['premise'], row['hypothesis']],
        label=row['label'],
    )
    for row in tqdm(dataset)
]


In [None]:
# Training needs a dataloader. A plain PyTorch DataLoader is used here
# (sentence-transformers also ships dataloaders that can sometimes be used instead).
from torch.utils.data import DataLoader 

batch_size = 16 
loader = DataLoader(train_samples, batch_size=batch_size, shuffle = True)

# Build the model from sentence-transformers modules: a transformer module
# followed by a pooling module configured for mean pooling over tokens.
 
from sentence_transformers import models, SentenceTransformer

bert = models.Transformer('bert-base-uncased') # Hugging Face model name
pooler = models.Pooling(bert.get_word_embedding_dimension(), pooling_mode_mean_tokens=True)

model = SentenceTransformer(modules=[bert, pooler]) # assemble the model from the two modules, in order
model
# Bare expression: in a notebook this displays the SentenceTransformer module structure.

In [5]:
# Initialize the softmax classification loss for the model built above.
# num_labels=3 matches the three NLI classes (entailment, neutral, contradiction).
from sentence_transformers import losses 

loss = losses.SoftmaxLoss(
    model = model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels =3
)

In [None]:
import torch

# Pick the GPU when one is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

if device.type == 'cuda':
    # CUDA-only housekeeping. These calls must stay behind the availability
    # check: set_per_process_memory_fraction initializes CUDA and raises on
    # a machine without a usable GPU, which would defeat the CPU fallback.
    torch.cuda.empty_cache()                         # release cached, unused blocks
    torch.cuda.set_per_process_memory_fraction(0.8)  # cap this process at 80% of GPU memory

# Move model to the selected device
model.to(device)
print(f'moved to {device}')

# Print GPU memory figures for debugging
if device.type == 'cuda':
    # total_memory is the device's full capacity, not what is currently free.
    print(f'GPU Memory Total: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB')
    print(f'GPU Memory Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB')
    print(f'GPU Memory Reserved: {torch.cuda.memory_reserved() / 1e9:.2f} GB')

In [None]:
import torch
import os

def get_device_with_memory_check(model, required_memory_gb=4):
    """Return the CUDA device if it has enough free memory, otherwise the CPU.

    Args:
        model: Not used by the check; accepted so existing call sites keep
            working.
        required_memory_gb: Free GPU memory, in GB (1e9 bytes), that must be
            exceeded for the GPU to be selected.

    Returns:
        ``torch.device('cuda')`` when CUDA is available and the current
        device reports more than ``required_memory_gb`` free, otherwise
        ``torch.device('cpu')``.
    """
    if not torch.cuda.is_available():
        return torch.device('cpu')

    # Ask the driver how much memory is actually free on the current device.
    # Subtracting this process's allocated tensors from the total would
    # ignore memory held by other processes and over-report what is free.
    gpu = torch.cuda.current_device()
    free_bytes, _total_bytes = torch.cuda.mem_get_info(gpu)
    free_memory = free_bytes / 1e9  # Convert to GB

    if free_memory > required_memory_gb:
        print(f'Using GPU with {free_memory:.2f}GB free memory')
        return torch.device('cuda')

    print(f'GPU memory low ({free_memory:.2f}GB free), falling back to CPU')
    return torch.device('cpu')

# Choose the device with the memory check, then move the model onto it.
# NOTE(review): depending on the sentence-transformers version, fit() may move
# the model to its own target device and override this placement — confirm.
device = get_device_with_memory_check(model)
model.to(device)

# Train for one epoch; warmup_steps is 10% of the total number of batches
# (len(loader) batches per epoch * epochs).
epochs = 1
warmup_steps = int(len(loader)*epochs*0.1)

model.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs, 
    warmup_steps=warmup_steps, 
    # NOTE(review): the leading '/' makes this an absolute path at the
    # filesystem root — confirm that is intended rather than './sbert_test_b'.
    output_path = '/sbert_test_b',
    show_progress_bar = False
)

In [None]:
#mean pooling function using pytorch 
def mean_pool(token_embeds, attention_mask):
    """Average token embeddings along dim 1, counting only unmasked positions.

    Args:
        token_embeds: Tensor of token embeddings; assumed to be
            (batch, seq_len, hidden) — TODO confirm against callers.
        attention_mask: Mask with one fewer trailing dimension than
            ``token_embeds``; positions with value 0 are excluded.

    Returns:
        ``token_embeds`` with dim 1 reduced to the masked mean, i.e. one
        vector per sequence.
    """
    # Broadcast the mask across the embedding dimension so it can weight
    # every component of every token vector.
    weights = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()

    masked_total = (token_embeds * weights).sum(1)

    # Clamp the per-sequence token count so a fully masked sequence does not
    # divide by zero.
    token_count = weights.sum(1).clamp(min=1e-9)

    return masked_total / token_count

