In [1]:
import json
import pandas as pd
import re
import string 
from nltk.corpus import stopwords

### loading json file 

In [2]:
t = []
with open("E:\\function_summarization\\datasets\\nfcorpus\\corpus.jsonl", 'r') as file:
    for line in file:
        t.append(json.loads(line))

In [3]:
corpus=[]
list1=[]
for i in range(0,len(t)-1):
    # print(i)
    for k,v in t[i].items():
        if k=='text':
            c=t[i][k]
            corpus.append(c)
#             print(corpus)

### Preprocessing

In [4]:
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

clean_passages = [remove_punctuation(passage) for passage in corpus]

In [5]:
stop_words = set(stopwords.words('english'))
def remove_stopwords(sentences):
    cleaned_sentences = []
    for sentence in sentences:
        # Tokenize the sentence into words
        words = sentence.split()
        # Remove stop words and rejoin the words to form the cleaned sentence
        cleaned_sentence = ' '.join([word for word in words if word.lower() not in stop_words])
        cleaned_sentences.append(cleaned_sentence)
    return cleaned_sentences

In [6]:
cleaned_text_list = remove_stopwords(clean_passages)

In [7]:
cleaned_text_list[0]

'Recent studies suggested statins established drug group prevention cardiovascular mortality could delay prevent breast cancer recurrence effect diseasespecific mortality remains unclear evaluated risk breast cancer death among statin users populationbased cohort breast cancer patients study cohort included newly diagnosed breast cancer patients Finland 19952003 31236 cases identified Finnish Cancer Registry Information statin use diagnosis obtained national prescription database used Cox proportional hazards regression method estimate mortality among statin users statin use timedependent variable total 4151 participants used statins median followup 325 years diagnosis range 00890 years 6011 participants died 3619 602 due breast cancer adjustment age tumor characteristics treatment selection postdiagnostic prediagnostic statin use associated lowered risk breast cancer death HR 046 95 CI 038055 HR 054 95 CI 044067 respectively risk decrease postdiagnostic statin use likely affected heal

In [8]:
query="Breast Cancer Cells Feed on Cholesterol"

### calculating embeddings for query using large and small models

In [9]:
from transformers import AutoTokenizer, AutoModel
import torch

small_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
small_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")



In [10]:
large_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")
large_model = AutoModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")

In [11]:
# Get embeddings for the query
query_small=small_tokenizer(query,padding=True,truncation=True,return_tensors='pt')
query_large=large_tokenizer(query,padding=True,truncation=True,return_tensors='pt')

In [12]:
with torch.no_grad():
    query_small=small_model(**query_small)
    query_large=large_model(**query_large)
#     print(query_small)
q_small_emb=query_small.last_hidden_state.mean(dim=1)
q_large_emb=query_large.last_hidden_state.mean(dim=1)
print(q_small_emb)

tensor([[-1.9124e-01, -4.2612e-01, -5.5825e-01, -7.8522e-02, -3.0761e-01,
         -1.7760e-01, -2.9396e-02, -1.8586e-01,  1.7783e-01, -6.0575e-01,
         -2.2265e-01,  1.1777e-01, -2.3045e-01, -7.1610e-01,  1.7781e-01,
          4.2638e-01, -3.4270e-01,  1.7197e-01,  2.0063e-02,  3.4023e-01,
          2.5618e-02, -2.1915e-01, -3.4811e-02,  5.2988e-01, -1.3165e-01,
         -3.9036e-01,  7.2193e-02,  1.7190e-01, -6.0912e-01,  1.5412e-02,
          8.6511e-02, -1.9129e-01, -2.9618e-01,  3.2074e-01, -3.8309e-01,
         -1.2076e-01,  1.4657e-01,  4.6712e-01, -3.8458e-01, -7.1034e-02,
          2.0458e-01, -3.7315e-01,  1.5309e-01,  2.0877e-01,  2.3467e-01,
         -4.8437e-01, -3.0059e-01, -6.3794e-02,  6.0033e-02,  3.7779e-01,
         -3.7142e-01, -1.1354e-02, -5.6691e-01, -1.0326e-01,  9.9764e-02,
          2.4917e-01,  6.8832e-03,  1.6551e-01,  3.0395e-01, -3.2164e-02,
          1.2559e-01, -6.4166e-01,  4.6814e-01,  2.9513e-01, -1.6362e-01,
         -4.3280e-01, -3.6345e-01,  6.

### calculating embeddings for preprocessed text using large and small models

In [13]:
def process_batches(passages,tokenizer,batch_size=2): #batch size controls how many passages are processed at a time
#     embeddings = []
    token=[]
    for i in range(0,len(passages),batch_size):
        batch=passages[i:i+batch_size]
        tokens=tokenizer(batch,padding=True,truncation=True,return_tensors='pt')
#         print(tokens)
        token.append(tokens)
    return token
small_tokens=process_batches(cleaned_text_list,small_tokenizer)
large_tokens=process_batches(cleaned_text_list,large_tokenizer)

In [14]:
print(small_tokens[0])

{'input_ids': tensor([[  101,  3522,  2913,  4081, 28093,  7076,  2511,  4319,  2177,  9740,
         22935, 13356,  2071,  8536,  4652,  7388,  4456, 28667,  3126, 24413,
          3466,  7870,  5051,  6895,  8873,  2278, 13356,  3464, 10599, 16330,
          3891,  7388,  4456,  2331,  2426, 28093,  2378,  5198,  2313, 15058,
          2094,  2522, 27794,  7388,  4456,  5022,  2817,  2522, 27794,  2443,
          4397, 11441,  7388,  4456,  5022,  6435,  2786, 28332,  2509, 21036,
         21619,  3572,  4453,  6983,  4456, 15584,  2592, 28093,  2378,  2224,
         11616,  4663,  2120, 20422,  7809,  2109,  9574, 14267, 22010, 26237,
          4118, 10197, 13356,  2426, 28093,  2378,  5198, 28093,  2378,  2224,
         22313, 13699, 10497,  4765,  8023,  2561, 24690,  2487,  6818,  2109,
         28093,  7076,  3991,  3582,  6279, 19652,  2086, 11616,  2846,  4002,
          2620, 21057,  2086,  3438, 14526,  6818,  2351,  4029, 16147,  3438,
          2475,  2349,  7388,  4456, 1

In [15]:
def compute_embeddings(tokenized_batches, model):
    embeddings_list = []
    for tokens in tokenized_batches:
        with torch.no_grad():
            model_output = model(**tokens)
        embeddings = model_output.last_hidden_state.mean(dim=1)  # Mean pooling
        embeddings_list.append(embeddings)
    return torch.cat(embeddings_list, dim=0)

# Get embeddings from tokenized batches
small_embeddings = compute_embeddings(small_tokens, small_model)
# large_embeddings = compute_embeddings(large_tokens, large_model)

In [16]:
print(small_embeddings.shape)
#number of passages you processed= 3632
#The size of each embedding vector =384

torch.Size([3632, 384])


In [17]:
large_embeddings = compute_embeddings(large_tokens, large_model)

In [18]:
large_embeddings.shape

torch.Size([3632, 768])

In [21]:
from sentence_transformers.util import cos_sim 

### applying cosine similarities

In [22]:
small_cosine_scores = cos_sim(q_small_emb, small_embeddings)[0]
large_cosine_scores = cos_sim(q_large_emb, large_embeddings)[0]

In [23]:
k = 5
small_top_k_indices = torch.topk(small_cosine_scores, k).indices
large_top_k_indices = torch.topk(large_cosine_scores, k).indices
print(small_top_k_indices)

tensor([1377, 1378, 1383, 2078,   82])


### Top 5 similarity passages from small and large models

In [24]:
small_top_k=[]
print("Top-k Passages from Small Model:")
for idx in small_top_k_indices:
    s=clean_passages[idx]
    small_top_k.append(s)
    print(small_top_k)

Top-k Passages from Small Model:
['The specific role of dietary fat in breast cancer progression is unclear although a lowfat diet was associated with decreased recurrence of estrogen receptor alpha negative ER breast cancer ER basallike MDAMB231 and MDAMB436 breast cancer cell lines contained a greater number of cytoplasmic lipid droplets compared to luminal ER MCF7 cells Therefore we studied lipid storage functions in these cells Both triacylglycerol and cholesteryl ester CE concentrations were higher in the ER cells but the ability to synthesize CE distinguished the two types of breast cancer cells Higher baseline oleic acid and LDLstimulated CE concentrations were found in ER compared to ER cells The differences corresponded to greater mRNA and protein levels of acylCoAcholesterol acyltransferase 1 ACAT1 higher ACAT activity higher caveolin1 protein levels greater LDL uptake and lower de novo cholesterol synthesis in ER cells Human LDL stimulated proliferation of ER MDAMB231 cells 

In [25]:
print("\nTop-k Passages from Large Model:")
large_top_k=[]
for idx in large_top_k_indices:
    l=clean_passages[idx]
    large_top_k.append(l)
    print(large_top_k)


Top-k Passages from Large Model:
['The specific role of dietary fat in breast cancer progression is unclear although a lowfat diet was associated with decreased recurrence of estrogen receptor alpha negative ER breast cancer ER basallike MDAMB231 and MDAMB436 breast cancer cell lines contained a greater number of cytoplasmic lipid droplets compared to luminal ER MCF7 cells Therefore we studied lipid storage functions in these cells Both triacylglycerol and cholesteryl ester CE concentrations were higher in the ER cells but the ability to synthesize CE distinguished the two types of breast cancer cells Higher baseline oleic acid and LDLstimulated CE concentrations were found in ER compared to ER cells The differences corresponded to greater mRNA and protein levels of acylCoAcholesterol acyltransferase 1 ACAT1 higher ACAT activity higher caveolin1 protein levels greater LDL uptake and lower de novo cholesterol synthesis in ER cells Human LDL stimulated proliferation of ER MDAMB231 cells

###  ranking model

In [26]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
ranking_tokenizer = AutoTokenizer.from_pretrained("cross-encoder/ms-marco-MiniLM-L-12-v2")
ranking_model = AutoModelForSequenceClassification.from_pretrained("cross-encoder/ms-marco-MiniLM-L-12-v2")

  return self.fget.__get__(instance, owner)()


In [None]:
#### as there is same output in small_top_k and large_top_k, I am using small_top_k list


In [27]:
combined_top_k_passages = small_top_k+large_top_k

In [28]:
combined_top_k_passages

['The specific role of dietary fat in breast cancer progression is unclear although a lowfat diet was associated with decreased recurrence of estrogen receptor alpha negative ER breast cancer ER basallike MDAMB231 and MDAMB436 breast cancer cell lines contained a greater number of cytoplasmic lipid droplets compared to luminal ER MCF7 cells Therefore we studied lipid storage functions in these cells Both triacylglycerol and cholesteryl ester CE concentrations were higher in the ER cells but the ability to synthesize CE distinguished the two types of breast cancer cells Higher baseline oleic acid and LDLstimulated CE concentrations were found in ER compared to ER cells The differences corresponded to greater mRNA and protein levels of acylCoAcholesterol acyltransferase 1 ACAT1 higher ACAT activity higher caveolin1 protein levels greater LDL uptake and lower de novo cholesterol synthesis in ER cells Human LDL stimulated proliferation of ER MDAMB231 cells but had little effect on prolifer

In [29]:
query="Breast Cancer Cells Feed on Cholesterol"

In [30]:
query_passage_pairs = [(query, passage) for passage in combined_top_k_passages]

In [31]:
query_passage_pairs

[('Breast Cancer Cells Feed on Cholesterol',
  'The specific role of dietary fat in breast cancer progression is unclear although a lowfat diet was associated with decreased recurrence of estrogen receptor alpha negative ER breast cancer ER basallike MDAMB231 and MDAMB436 breast cancer cell lines contained a greater number of cytoplasmic lipid droplets compared to luminal ER MCF7 cells Therefore we studied lipid storage functions in these cells Both triacylglycerol and cholesteryl ester CE concentrations were higher in the ER cells but the ability to synthesize CE distinguished the two types of breast cancer cells Higher baseline oleic acid and LDLstimulated CE concentrations were found in ER compared to ER cells The differences corresponded to greater mRNA and protein levels of acylCoAcholesterol acyltransferase 1 ACAT1 higher ACAT activity higher caveolin1 protein levels greater LDL uptake and lower de novo cholesterol synthesis in ER cells Human LDL stimulated proliferation of ER MD

In [32]:
tokens = ranking_tokenizer(query_passage_pairs, padding=True, truncation=True, return_tensors="pt")

In [33]:
tokens

{'input_ids': tensor([[ 101, 7388, 4456,  ...,    0,    0,    0],
        [ 101, 7388, 4456,  ...,    0,    0,    0],
        [ 101, 7388, 4456,  ..., 4456, 2609,  102],
        ...,
        [ 101, 7388, 4456,  ...,    0,    0,    0],
        [ 101, 7388, 4456,  ...,    0,    0,    0],
        [ 101, 7388, 4456,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

### calculating scores

In [34]:
with torch.no_grad():
    outputs = ranking_model(**tokens)
    scores = outputs.logits.squeeze()
    print(scores)

tensor([ 3.5693, -6.4103, -4.0037, -4.8537, -5.5676,  3.5693, -4.8537, -8.3372,
        -1.8037, -6.4103])


In [35]:
sorted_indices = torch.argsort(scores, descending=True)
sorted_passages=[]
for i in sorted_indices:
    sorted_passage = combined_top_k_passages[i]
    sorted_passages.append(sorted_passage)

### Top passages according to ranking model

In [36]:
for passage in sorted_passages:
    print(passage)

The specific role of dietary fat in breast cancer progression is unclear although a lowfat diet was associated with decreased recurrence of estrogen receptor alpha negative ER breast cancer ER basallike MDAMB231 and MDAMB436 breast cancer cell lines contained a greater number of cytoplasmic lipid droplets compared to luminal ER MCF7 cells Therefore we studied lipid storage functions in these cells Both triacylglycerol and cholesteryl ester CE concentrations were higher in the ER cells but the ability to synthesize CE distinguished the two types of breast cancer cells Higher baseline oleic acid and LDLstimulated CE concentrations were found in ER compared to ER cells The differences corresponded to greater mRNA and protein levels of acylCoAcholesterol acyltransferase 1 ACAT1 higher ACAT activity higher caveolin1 protein levels greater LDL uptake and lower de novo cholesterol synthesis in ER cells Human LDL stimulated proliferation of ER MDAMB231 cells but had little effect on proliferat

In [38]:
scores

tensor([ 3.5693, -6.4103, -4.0037, -4.8537, -5.5676,  3.5693, -4.8537, -8.3372,
        -1.8037, -6.4103])

In [41]:
import numpy as np

In [42]:
def dcg_at_k(scores, k):
    """Compute DCG at rank k"""
    relevance_scores = np.array(scores)[:k]
    return np.sum(scores / np.log2(np.arange(2, scores.size + 2)))