# LaBSE filtering application - for final AdvNLP project



In [None]:
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import pandas as pd
from tqdm import tqdm

In [None]:
# Download the data from Google Drive
file_id = '1-M187jqKwBmTWHX4wyRvglYx6vjOe-nz'
!gdown --id $file_id

# Load the corpus file into a Pandas DataFrame (read as comma-separated, despite the .tsv extension)
df = pd.read_csv("medical_corpus_clean.tsv", sep=',')

Downloading...
From: https://drive.google.com/uc?id=1-M187jqKwBmTWHX4wyRvglYx6vjOe-nz
To: /content/medical_corpus_clean.tsv
100% 237M/237M [00:04<00:00, 53.7MB/s]


In [None]:
# Load the pretrained LaBSE tokenizer and model through the transformers library,
# using the "sentence-transformers/LaBSE" checkpoint name.
model_name = "sentence-transformers/LaBSE"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [None]:
# Assign device: use CUDA when torch reports it as available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the selected device.
# As the last expression of the cell, the returned model is also displayed as the cell output.
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(501153, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
import torch.nn.functional as F

def cosine_similarity_custom(x, y, eps=1e-8):
    """Return the cosine similarity of ``x`` and ``y`` along their last dimension.

    Args:
        x: Floating-point tensor whose last dimension holds the vector components.
        y: Floating-point tensor that can be multiplied element-wise with ``x``.
        eps: Lower bound applied to the product of the two norms before
            dividing. It keeps the result finite when either vector is all
            zeros; for ordinary non-zero vectors it does not change the value.

    Returns:
        A tensor with the last dimension reduced away, holding one similarity
        per vector pair. A pair containing an all-zero vector scores 0 instead
        of NaN.
    """
    dot_product = torch.sum(x * y, dim=-1)
    norm_product = torch.norm(x, dim=-1) * torch.norm(y, dim=-1)

    # Guard the division: a zero-norm vector would otherwise produce 0 / 0 = NaN.
    return dot_product / norm_product.clamp_min(eps)

def _masked_mean_pool(hidden_states, attention_mask):
    """Average token embeddings over the sequence axis, skipping padded positions.

    Assumes ``hidden_states`` is (batch, seq, dim) and ``attention_mask`` is
    (batch, seq) with 1 for real tokens and 0 for padding, as produced by the
    tokenizer/model pair used below.
    """
    mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
    # Clamp so that a row without any real token divides by 1 instead of 0.
    token_counts = mask.sum(dim=1).clamp(min=1.0)
    return (hidden_states * mask).sum(dim=1) / token_counts


def filter_bilingual_corpus_batched(source_sentences, target_sentences, sentence_ids, batch_size=16):
    """Score each source/target sentence pair by embedding cosine similarity.

    Sentences are processed in batches of ``batch_size``. Each sentence is
    embedded as the mean of its token vectors from the model's last hidden
    state, and each source embedding is compared with the target embedding at
    the same position.

    Relies on the module-level ``tokenizer``, ``model``, ``device`` and
    ``cosine_similarity_custom``.

    NOTE(review): the LaBSE model card appears to use the normalized pooler
    output as the sentence embedding; mean pooling is kept here because that
    is what this notebook was written around -- confirm which is intended.

    Args:
        source_sentences: List of source-language sentences.
        target_sentences: List of target-language sentences, aligned with
            ``source_sentences``.
        sentence_ids: List of identifiers, one per sentence pair.
        batch_size: Number of sentence pairs embedded per forward pass.

    Returns:
        A tuple ``(similarity_metric, sentence_pairs_ids)`` of two lists in
        input order: the similarity score of each pair (NumPy scalars) and
        the matching id.

    Raises:
        ValueError: If the three input sequences differ in length.
    """
    total_pairs = len(source_sentences)
    if len(target_sentences) != total_pairs or len(sentence_ids) != total_pairs:
        raise ValueError(
            "source_sentences, target_sentences and sentence_ids must have the same length"
        )

    similarity_metric = []
    sentence_pairs_ids = []

    # Iterate through sentence pairs with tqdm for the progress bar
    for start in tqdm(range(0, total_pairs, batch_size), desc="Processing Batches"):
        stop = start + batch_size
        batch_sources = source_sentences[start:stop]
        batch_targets = target_sentences[start:stop]
        batch_ids = sentence_ids[start:stop]

        # Tokenize both sides; padding=True pads each batch to its longest sentence
        source_tokens = tokenizer(batch_sources, return_tensors="pt", padding=True, truncation=True).to(device)
        target_tokens = tokenizer(batch_targets, return_tensors="pt", padding=True, truncation=True).to(device)

        with torch.no_grad():
            source_hidden = model(**source_tokens).last_hidden_state
            target_hidden = model(**target_tokens).last_hidden_state

        # Pool with the attention mask: a plain mean over the sequence axis
        # would also average the padding positions, making a sentence's
        # embedding depend on the other sentences that share its batch.
        source_embedding_mean = _masked_mean_pool(source_hidden, source_tokens["attention_mask"])
        target_embedding_mean = _masked_mean_pool(target_hidden, target_tokens["attention_mask"])

        # Calculate similarity between corresponding embeddings
        similarities = cosine_similarity_custom(source_embedding_mean, target_embedding_mean)

        # One score and one id per sentence pair, kept in input order
        similarity_metric.extend(similarities.cpu().numpy())
        sentence_pairs_ids.extend(batch_ids)

    return similarity_metric, sentence_pairs_ids

In [None]:
# Compute a similarity score for every sentence pair in the corpus,
# passing the 'pol' column as source and the 'eng' column as target.
similarity_metric, sentence_pairs_ids = filter_bilingual_corpus_batched(
    df['pol'].tolist(),
    df['eng'].tolist(),
    df['id'].tolist(),
    batch_size=8,
)

Processing Batches: 100%|██████████| 134182/134182 [2:22:02<00:00, 15.74it/s]


In [None]:
# Collect the pair ids and their similarity scores into a two-column DataFrame
output_df = pd.DataFrame(
    {
        'id': sentence_pairs_ids,
        'score': similarity_metric,
    }
)

In [None]:
# Save the DataFrame as a CSV file, without writing the index column
output_df.to_csv('LaBSE_scores.csv', index=False)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>