#*Cross-Lingual Medical Terminology* *Retrieval System for Tamil, Telugu, English*

##*Parse Medical Terms and Meanings in english from Harvard medical glossary*

In [None]:
!pip install requests
!pip install beautifulsoup4


In [23]:
import requests
from bs4 import BeautifulSoup

main_url = "https://www.health.harvard.edu/a-through-c"

# Get the main page content
response = requests.get(main_url)
soup = BeautifulSoup(response.content, "html.parser")

# Find all the links in the main page
links = soup.find_all("a", href=True)


# Filter the links to get the ones containing medical terms
medical_term_links = [link['href'] for link in links if "/medical-dictionary-of-health-terms/" in link['href']]
medical_term_links2=[]
for term in medical_term_links[:26]:
  medical_term_links2.append(term[-19:])


def save_terms_to_file(terms_list, file_name):
    with open(file_name, 'a') as f:
        for term in terms_list:
            f.write(f"{term}\n")


def get_terms_from_page2(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    term_containers = soup.find_all("p")

    terms_and_definitions = []
    for container in term_containers:
        strong_tag = container.find("strong")
        if strong_tag:
            term = strong_tag.text.strip()
            strong_tag.decompose()  # Remove the strong tag
            definition = container.text.strip()
            terms_and_definitions.append((term, definition))

    return terms_and_definitions


# Function to extract terms from a page
def get_terms_from_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    terms = soup.find_all("strong")
    return [term.text.strip() for term in terms]

# Iterate over the medical term links and get the terms
all_terms = []

for link in medical_term_links2:
    full_url = f"https://www.health.harvard.edu/{link}"
    terms = get_terms_from_page2(full_url)
    all_terms.extend(terms)

file_name = 'medical_definitions.txt'

save_terms_to_file(all_terms, file_name)
print(f"Medical definitions saved to {file_name}")


Medical definitions saved to medical_definitions.txt


##*Random sampling 500(N) words from 14000 medical terms*
Need to scale to more than atleast 2000


In [24]:
import random

# Read the medical terms from the text file
with open("medical_terms.txt", "r") as file:
    terms = [line.strip() for line in file]

# Shuffle the terms
random.shuffle(terms)

# Choose a random subset of 500 terms
random_subset = terms[:500]

# Write the random subset to a new text file
with open("random_medical_terms.txt", "w") as file:
    for term in random_subset:
        file.write(term + "\n")


##*Generate JSON Tamil, Telugu translations for the N words*

In [None]:
pip install translate


In [38]:
from translate import Translator
import json
import random

# Read the random medical terms from the text file
with open("random_medical_terms.txt", "r") as file:
    terms = [line.strip() for line in file.readlines()]

translator_telugu = Translator(to_lang="te")
translator_tamil = Translator(to_lang="ta")

translations = []

for term in terms:
    telugu_translation = translator_telugu.translate(term)
    tamil_translation = translator_tamil.translate(term)
    
    translations.append({
        "English": term,
        "Telugu": telugu_translation,
        "Tamil": tamil_translation
    })

# Save the translations to a JSON file
with open("translations.json", "w") as file:
    json.dump(translations, file, indent=2, ensure_ascii=False)


##*Generate parallel embeddings for the translated terms in the dataset*

###Install Transformers library

In [2]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m89.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


###Load bert-multilingual model

In [3]:
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
model = AutoModel.from_pretrained("bert-base-multilingual-cased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


###Move the model to device

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
 

###Function to generate embeddings

In [5]:
def get_mbert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    return embeddings


###Load translated terms json file

In [6]:
import json

with open("translations.json", "r") as f:
    terms_data = json.load(f)


###Call to generate embeddings of loaded terms 

In [22]:
terms_with_embeddings = []
for term_entry in terms_data:
    embedding_entry = {}
    for lang, term in term_entry.items():
      #print(lang,term)
      embedding = get_mbert_embedding(term)
      embedding_entry[f"{lang}_term"] = term
      embedding_entry[f"{lang}_embedding"] = embedding.tolist()
    terms_with_embeddings.append(embedding_entry)
    #break
#print(terms_with_embeddings)
#print(len(terms_with_embeddings[0]['English_embedding']))
with open("terms_with_embeddings.json", "w") as f:
  json.dump(terms_with_embeddings, f, ensure_ascii=False, indent=2)

##*Build retrieval system based on the embeddings generated*

###Load libraries and embeddings json

In [23]:
import numpy as np
from scipy.spatial.distance import cosine

def cosine_similarity(a, b):
    return 1 - cosine(a, b)

with open("terms_with_embeddings.json", "r") as f:
    terms_with_embeddings = json.load(f)

###Function to retrieve top-K words in cross language

In [41]:
def retrieve_top_k(query, language_of_interest=None, k=10):
    if language_of_interest and language_of_interest not in ["Telugu", "Tamil","English"]:
        print(f"Language '{language_of_interest}' is not available.")
        return []

    query_embedding = get_mbert_embedding(query)
    similarities = []

    for entry in terms_with_embeddings:
        if not language_of_interest:
            languages_to_check = ["Telugu", "Tamil","English"]
        else:
            languages_to_check = [language_of_interest]

        for lang in languages_to_check:
            similarity = cosine_similarity(query_embedding, entry[f"{lang}_embedding"])
            similarities.append((entry[f"{lang}_term"], similarity, lang))

    top_k = sorted(similarities, key=lambda x: x[1], reverse=True)[:k]
    return top_k

##Query the retrieval system

In [43]:
query = "బదిలీ"
language = ""
top_k = retrieve_top_k(query, language_of_interest=language, k=30)

print(f"Top {len(top_k)} most similar terms for '{query}' in {language}:")
for term, similarity, lang in top_k:
    print(f"{term} ({lang}): {similarity:.4f}")

Top 30 most similar terms for 'బదిలీ' in :
బదిలీ (Telugu): 1.0000
బలం శిక్షణ (Telugu): 0.7801
గాయం (Telugu): 0.7255
గాయం (Telugu): 0.7255
ఒత్తిడి (Telugu): 0.7245
ఒత్తిడి (Telugu): 0.7245
నమ్మకం (Telugu): 0.7021
నమ్మకం (Telugu): 0.7021
கவண் (Tamil): 0.6719
వేరు (Telugu): 0.6626
తెలుపు విషయం (Telugu): 0.6614
తెలుపు విషయం (Telugu): 0.6614
సెట్ (Telugu): 0.6579
நம்பிக்கை (Tamil): 0.6537
வளர்சிதைமாற்றம் (Tamil): 0.6484
கட்டி (Tamil): 0.6454
స్లింగ్ (Telugu): 0.6450
సమయోచిత (Telugu): 0.6419
இடமாற்றம் (Tamil): 0.6415
இடமாற்றம் (Tamil): 0.6415
ఉపశమనం (Telugu): 0.6394
ఒత్తిడి పరీక్ష (Telugu): 0.6370
కత్తి (Telugu): 0.6356
மாரடைப்பு (Tamil): 0.6329
బైపాస్ (Telugu): 0.6304
జుట్టు బల్బ్ (Telugu): 0.6283
తటస్థ అమరిక (Telugu): 0.6276
bowel (English): 0.6236
సమకాలీకరణ (Telugu): 0.6219
డీలరియం (Telugu): 0.6191


###Command line approach using parser

In [44]:
import argparse

def main():
    parser = argparse.ArgumentParser(description="Retrieve top-k similar terms in the specified language")
    parser.add_argument("query", type=str, help="Query term")
    parser.add_argument("-l", "--language", type=str, default=None, help="Language of interest (Telugu, Tamil, or None)")
    parser.add_argument("-k", type=int, default=10, help="Number of top similar terms to retrieve")

    args = parser.parse_args()

    query = args.query
    language = args.language
    top_k = args.k

    top_k_terms = retrieve_top_k(query, language_of_interest=language, k=top_k)

    if language:
        print(f"Top {len(top_k_terms)} most similar terms for '{query}' in {language}:")
    else:
        print(f"Top {len(top_k_terms)} most similar terms for '{query}' in Telugu and Tamil:")

    for term, similarity, lang in top_k_terms:
        print(f"{term} ({lang}): {similarity:.4f}")

if __name__ == "__main__":
    main()
