Run `docker compose up` to set up OpenSearch

### Baseline
Search function in OpenSearch

In [32]:
from opensearchpy import OpenSearch
import pandas as pd
from tqdm import tqdm

# --- Baseline: load every abstract into a full-text OpenSearch index ---
index_name = 'abstracts'

# Client for the OpenSearch node brought up by `docker compose up`.
# TLS is enabled, but certificate and hostname verification are switched off.
client = OpenSearch(
    hosts=[{"host": 'opensearch', "port": 9200}],
    http_auth=('admin', 'admin'),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

# Index definition: a single shard with one replica, an exact-match PMID
# field and the abstract text analysed with the built-in "english" analyzer.
index_settings = {"number_of_shards": 1, "number_of_replicas": 1, "knn": True}
field_mappings = {
    "pmid": {"type": "keyword"},
    "abstract": {"type": "text", "analyzer": "english"},
}
os_mapping = {
    "settings": {"index": index_settings},
    "mappings": {"properties": field_mappings},
}
# ignore=400 suppresses HTTP 400 responses (e.g. the index already exists),
# so the cell can be re-run.
client.indices.create(index=index_name, body=os_mapping, ignore=400)

# One OpenSearch document per CSV row; newlines inside abstracts are
# flattened to spaces before indexing.
df = pd.read_csv('data_cluster.csv').drop(columns=['Keywords', 'Cluster'])
for _, record in tqdm(df.iterrows(), total=len(df)):
    document = {
        "pmid": record['PMID'],
        "abstract": record['Abstract'].replace("\n", " "),
    }
    client.index(index=index_name, body=document)

100%|████████████████████████████████████████████████████████████████████████████| 58933/58933 [03:05<00:00, 318.32it/s]


In [14]:
import pandas as pd
from tqdm import tqdm
from opensearchpy import OpenSearch

# --- Baseline evaluation: top-5 retrieval accuracy of plain full-text search ---
client = OpenSearch(
    hosts=[{"host": 'opensearch', "port": 9200}],
    http_auth=('admin', 'admin'),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

# Evaluation set: the first 1000 rows of qap.csv without the 'Answer' column.
# The loop below unpacks each row into exactly two values, so the remaining
# columns are presumably (question, PMID) in that order — verify against the CSV.
df_qa = pd.read_csv('qap.csv').drop(columns=['Answer'])
data_list = df_qa.values.tolist()[:1000]

top_5_hits = 0
for query, correct_pmid in tqdm(data_list):
    # Plain `match` query against the analysed abstract text, best 5 hits.
    lexical_query = {
        "query": {"match": {"abstract": query}},
        "size": 5,
    }
    res = client.search(index='abstracts', body=lexical_query)
    returned_pmids = [hit['_source']['pmid'] for hit in res['hits']['hits']]
    if correct_pmid in returned_pmids:
        top_5_hits += 1

percentage_top_5 = (top_5_hits / len(data_list)) * 100
print(f"Correct PMID found in top 5 results for {percentage_top_5}% of queries.")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 128.87it/s]

Correct PMID found in top 5 results for 74.6% of queries.





This is a fairly good result. However, we suspect that the questions we are currently generating might be too biased towards lexical search.

### Experiment 1  
Following [Semantic Search and Vector Databases (Part 2).ipynb](https://moodle.uni-heidelberg.de/pluginfile.php/1294993/mod_label/intro/Semantic%20Search%20and%20Vector%20Databases%20%28Part%202%29.ipynb?time=1701955836513), we use BERT as a bi-encoder to compute one embedding per abstract (no text splitting). Each embedding is the mean of the token embeddings, and similarity to the query is scored with cosine similarity.

In [1]:
import torch
from transformers import BertModel, BertTokenizer
from opensearchpy import OpenSearch
import pandas as pd
from tqdm import tqdm

# --- Experiment 1: embed whole abstracts with plain BERT and index the vectors ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

bert_version = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_version)
# Inference only: eval mode, placed on the GPU when one is available.
model = BertModel.from_pretrained(bert_version).eval().to(device)

index_name = 'abstracts_bert_nosplit'

client = OpenSearch(
    hosts=[{"host": 'opensearch', "port": 9200}],
    http_auth=('admin', 'admin'),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

# Article metadata plus one 768-dimensional embedding per document.
mapping = {
    "mappings": {
        "properties": {
            "pmid": {"type": "keyword"},
            "title": {"type": "text", "analyzer": "standard"},
            "vector": {"type": "knn_vector", "dimension": 768},
            "publishedDate": {"type": "date"},  # publication date
            "authors": {"type": "text"},  # author names
        }
    },
}
# ignore=400 suppresses HTTP 400 responses (e.g. the index already exists).
client.indices.create(index=index_name, body=mapping, ignore=400)

df = pd.read_csv('data_cluster.csv').drop(columns=['Keywords', 'Cluster'])
for _, record in tqdm(df.iterrows(), total=len(df)):
    abstract = record['Abstract'].replace("\n", " ")

    # Missing or 'unknown' publication dates are indexed as null.
    publishedDate = record['PubDate']
    if pd.isna(publishedDate) or publishedDate.strip().lower() == 'unknown':
        publishedDate = None

    # NOTE(review): this iterates the elements of record["Authors"]. If the CSV
    # cell is a plain string, those elements are single characters, so the
    # field would hold distinct lower-cased characters rather than author
    # names — confirm the column format.
    authors = list({term.lower() for term in record["Authors"]})

    # truncation=True cuts the abstract down to the model's maximum input length.
    encoding = tokenizer(abstract, return_tensors='pt', truncation=True).to(device)
    with torch.no_grad():
        token_states = model(**encoding)[0]
    # Mean over the token axis gives a single vector for the abstract.
    pooled = token_states.mean(dim=1).squeeze().to('cpu')

    client.index(index=index_name, body={
        "pmid": record['PMID'],
        "title": record['ArticleTitle'],
        "vector": pooled.numpy().tolist(),
        "publishedDate": publishedDate,
        "authors": authors,
    })

  from .autonotebook import tqdm as notebook_tqdm
100%|█████████████████████████████████████████████████████████████████████████████| 58933/58933 [17:03<00:00, 57.57it/s]


In [5]:
# --- Experiment 1 evaluation: retrieval accuracy of mean-pooled BERT vectors ---
# Relies on `tokenizer`, `model`, `device`, `client` and `data_list` from
# earlier cells.

# Single source of truth for the number of hits requested and the label
# printed below, so the two cannot drift apart.
TOP_K = 10

# NOTE: the counter and percentage keep the `top_5` names used by the other
# evaluation cells, but this cell scores the top TOP_K (= 10) hits, so its
# figure is not directly comparable with the top-5 numbers elsewhere.
top_5_hits = 0
for query, correct_pmid in tqdm(data_list):
    # truncation=True mirrors the indexing cell and keeps a query longer than
    # the model's maximum input length from raising inside the forward pass.
    encoding = tokenizer(query, return_tensors='pt', truncation=True)
    encoding = encoding.to(device)
    with torch.no_grad():
        output = model(**encoding)
    # Same pooling as at indexing time: mean over the token axis.
    output = output[0].mean(dim=1).squeeze().to('cpu')

    # Score every document matched by match_all with the k-NN scoring script
    # (cosine similarity between the query vector and the "vector" field).
    body = {
        "size": TOP_K,
        "query": {
            "script_score": {
                "query": {
                    "match_all": {}
                },
                "script": {
                    "source": "knn_score",
                    "lang": "knn",
                    "params": {
                        "field": "vector",
                        "query_value": output.numpy().tolist(),
                        "space_type": "cosinesimil"
                    }
                }
            }
        }
    }
    res = client.search(index='abstracts_bert_nosplit', body=body)
    search_results_pmids = [hit['_source']['pmid'] for hit in res['hits']['hits']]
    if correct_pmid in search_results_pmids:
        top_5_hits += 1

percentage_top_5 = (top_5_hits / len(data_list)) * 100
print(f"Correct PMID found in top {TOP_K} results for {percentage_top_5}% of queries.")

100%|█████████████████████████████████████████████████████████████████████████████████| 300/300 [00:32<00:00,  9.14it/s]

Correct PMID found in top 10 results for 21.333333333333336% of queries.





The results of this experiment are very poor. We overlooked the maximum input limitation of the BERT model. This might be the primary reason for such bad outcomes.

### Experiment 2  
Following [Semantic Search and Vector Databases (Part 3).ipynb](https://moodle.uni-heidelberg.de/pluginfile.php/1299406/mod_label/intro/Semantic%20Search%20and%20Vector%20Databases%20%28Part%203%29.ipynb?time=1702551190164), we use Sentence-BERT (SBERT) to compute the embeddings for the abstracts. This time each abstract is split into chunks (text splitting), every chunk is embedded with the model's (mean-pooled) sentence embedding, and similarity to the query is scored with cosine similarity.

In [10]:
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from opensearchpy import OpenSearch

# --- Experiment 2: split abstracts into chunks, embed each chunk with SBERT ---
embed_model_id = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(embed_model_id)

# Token-based splitter that uses the embedding model's own tokenizer;
# consecutive chunks overlap by 10 tokens.
splitter = SentenceTransformersTokenTextSplitter(
    model_name=embed_model_id,
    chunk_overlap=10,
)

index_name = 'abstracts_sbert'

client = OpenSearch(
    hosts=[{"host": 'opensearch', "port": 9200}],
    http_auth=('admin', 'admin'),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

# One document per chunk: article metadata, the chunk's position within the
# abstract, the chunk text and its 768-dimensional embedding.
mapping = {
    "mappings": {
        "properties": {
            "pmid": {"type": "keyword"},
            "title": {"type": "text", "analyzer": "standard"},
            "vector": {"type": "knn_vector", "dimension": 768},
            "publishedDate": {"type": "date"},  # publication date
            "authors": {"type": "text"},  # author names
            "text_chunk_id": {"type": "integer"},
            "arxiv_text": {"type": "text", "analyzer": "standard"},
        }
    },
}
# ignore=400 suppresses HTTP 400 responses (e.g. the index already exists).
client.indices.create(index=index_name, body=mapping, ignore=400)

df = pd.read_csv('data_cluster.csv').drop(columns=['Keywords', 'Cluster'])
for _, record in tqdm(df.iterrows(), total=len(df)):
    abstract = record['Abstract'].replace("\n", " ")

    # Missing or 'unknown' publication dates are indexed as null.
    publishedDate = record['PubDate']
    if pd.isna(publishedDate) or publishedDate.strip().lower() == 'unknown':
        publishedDate = None

    # NOTE(review): this iterates the elements of record["Authors"]. If the CSV
    # cell is a plain string, those elements are single characters, so the
    # field would hold distinct lower-cased characters rather than author
    # names — confirm the column format.
    authors = list({term.lower() for term in record["Authors"]})

    # Metadata shared by every chunk of this abstract.
    pmid = record['PMID']
    title = record['ArticleTitle']
    for chunk_no, chunk_text in enumerate(splitter.split_text(text=abstract)):
        chunk_doc = {
            "pmid": pmid,
            "title": title,
            "vector": model.encode(chunk_text).tolist(),
            "publishedDate": publishedDate,
            "authors": authors,
            "text_chunk_id": chunk_no,
            "arxiv_text": chunk_text,
        }
        client.index(index=index_name, body=chunk_doc)

In [13]:
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# --- Experiment 2 evaluation: top-5 accuracy of the pre-trained SBERT model ---
# Relies on `client` and `data_list` from earlier cells.
embed_model_id = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(embed_model_id)

top_5_hits = 0
for query, correct_pmid in tqdm(data_list):
    query_vector = model.encode(query).tolist()

    # k-NN scoring script: cosine similarity between the query vector and the
    # "vector" field of every document matched by match_all.
    knn_script = {
        "source": "knn_score",
        "lang": "knn",
        "params": {
            "field": "vector",
            "query_value": query_vector,
            "space_type": "cosinesimil",
        },
    }
    # The index holds one document per chunk, so these are the 5 best chunks;
    # several of them may carry the same PMID.
    body = {
        "size": 5,
        "query": {
            "script_score": {"query": {"match_all": {}}, "script": knn_script},
        },
    }
    res = client.search(index='abstracts_sbert', body=body)
    returned_pmids = [hit['_source']['pmid'] for hit in res['hits']['hits']]
    if correct_pmid in returned_pmids:
        top_5_hits += 1

percentage_top_5 = (top_5_hits / len(data_list)) * 100
print(f"Correct PMID found in top 5 results for {percentage_top_5}% of queries.")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:58<00:00,  8.47it/s]

Correct PMID found in top 5 results for 54.900000000000006% of queries.





The result this time is about 55%. We believe this is due to the bias of the generated questions towards lexical search, and to the fact that the model has not yet been fine-tuned on our dataset.

### Experiment 3
Fine-tuning Sentence-BERT (SBERT)

In [8]:
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from tqdm import tqdm
from langchain.text_splitter import SentenceTransformersTokenTextSplitter

# --- Experiment 3: fine-tune SBERT on (question, abstract chunk) pairs ---
# Only positive pairs (label 1.0) are produced in this cell.
# NOTE(review): the evaluation cells also draw their questions from qap.csv;
# if those rows are part of the training data here, the reported accuracy is
# optimistic — consider holding the evaluation questions out.
training_df = pd.read_csv('qap.csv')
abstracts_df = pd.read_csv('data_cluster.csv')
# Attach each question's abstract through the shared PMID column.
merged_df = training_df.merge(abstracts_df, how='left', left_on='PMID', right_on='PMID')

# Drop pairs whose question or abstract is missing or whitespace-only.
filtered_df = merged_df.dropna(subset=['Question', 'Abstract'])
has_question = filtered_df['Question'].str.strip() != ''
has_abstract = filtered_df['Abstract'].str.strip() != ''
filtered_df = filtered_df[has_question & has_abstract]

embed_model_id = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(embed_model_id)
# Same chunking as used when indexing: token-based with a 10-token overlap.
splitter = SentenceTransformersTokenTextSplitter(
    model_name=embed_model_id,
    chunk_overlap=10
)

# Every chunk of the matching abstract becomes one positive example.
training_examples = []
for _, pair in tqdm(filtered_df.iterrows(), total=len(filtered_df)):
    flat_abstract = pair['Abstract'].replace("\n", " ")
    training_examples.extend(
        InputExample(texts=[pair['Question'], chunk], label=1.0)
        for chunk in splitter.split_text(text=flat_abstract)
    )

# One epoch with the cosine-similarity loss, then save the model to disk.
train_dataloader = DataLoader(training_examples, batch_size=16, shuffle=True)
train_loss = losses.CosineSimilarityLoss(model)
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
)
model.save('sbert_fin')

Iteration: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 909/909 [5:17:08<00:00, 20.93s/it]
Epoch: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [5:17:08<00:00, 19028.90s/it]


In [3]:
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from opensearchpy import OpenSearch
import pandas as pd
from tqdm import tqdm

# --- Experiment 3: re-index the abstract chunks with the fine-tuned model ---
embed_model_id = "sentence-transformers/all-mpnet-base-v2"
# Load the model saved by the fine-tuning cell.
model = SentenceTransformer('sbert_fin')

# The splitter is built from the saved model as well; consecutive chunks
# overlap by 10 tokens.
splitter = SentenceTransformersTokenTextSplitter(
    model_name='sbert_fin',
    chunk_overlap=10,
)

index_name = 'abstracts_sbert_fin_1'

client = OpenSearch(
    hosts=[{"host": 'opensearch', "port": 9200}],
    http_auth=('admin', 'admin'),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

# One document per chunk: article metadata, the chunk's position within the
# abstract, the chunk text and its 768-dimensional embedding.
mapping = {
    "mappings": {
        "properties": {
            "pmid": {"type": "keyword"},
            "title": {"type": "text", "analyzer": "standard"},
            "vector": {"type": "knn_vector", "dimension": 768},
            "publishedDate": {"type": "date"},  # publication date
            "authors": {"type": "text"},  # author names
            "text_chunk_id": {"type": "integer"},
            "arxiv_text": {"type": "text", "analyzer": "standard"},
        }
    },
}
# ignore=400 suppresses HTTP 400 responses (e.g. the index already exists).
client.indices.create(index=index_name, body=mapping, ignore=400)

df = pd.read_csv('data_cluster.csv').drop(columns=['Keywords', 'Cluster'])
for _, record in tqdm(df.iterrows(), total=len(df)):
    abstract = record['Abstract'].replace("\n", " ")

    # Missing or 'unknown' publication dates are indexed as null.
    publishedDate = record['PubDate']
    if pd.isna(publishedDate) or publishedDate.strip().lower() == 'unknown':
        publishedDate = None

    # NOTE(review): this iterates the elements of record["Authors"]. If the CSV
    # cell is a plain string, those elements are single characters, so the
    # field would hold distinct lower-cased characters rather than author
    # names — confirm the column format.
    authors = list({term.lower() for term in record["Authors"]})

    # Metadata shared by every chunk of this abstract.
    pmid = record['PMID']
    title = record['ArticleTitle']
    for chunk_no, chunk_text in enumerate(splitter.split_text(text=abstract)):
        chunk_doc = {
            "pmid": pmid,
            "title": title,
            "vector": model.encode(chunk_text).tolist(),
            "publishedDate": publishedDate,
            "authors": authors,
            "text_chunk_id": chunk_no,
            "arxiv_text": chunk_text,
        }
        client.index(index=index_name, body=chunk_doc)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 58933/58933 [21:47<00:00, 45.06it/s]


In [9]:
# --- Experiment 3 evaluation: top-5 accuracy of the first fine-tuned model ---
# Relies on `model`, `client` and `data_list` from earlier cells.
top_5_hits = 0
for query, correct_pmid in tqdm(data_list):
    query_vector = model.encode(query).tolist()

    # k-NN scoring script: cosine similarity between the query vector and the
    # "vector" field of every document matched by match_all.
    knn_script = {
        "source": "knn_score",
        "lang": "knn",
        "params": {
            "field": "vector",
            "query_value": query_vector,
            "space_type": "cosinesimil",
        },
    }
    # The index holds one document per chunk, so these are the 5 best chunks;
    # several of them may carry the same PMID.
    body = {
        "size": 5,
        "query": {
            "script_score": {"query": {"match_all": {}}, "script": knn_script},
        },
    }
    res = client.search(index='abstracts_sbert_fin_1', body=body)
    returned_pmids = [hit['_source']['pmid'] for hit in res['hits']['hits']]
    if correct_pmid in returned_pmids:
        top_5_hits += 1

percentage_top_5 = (top_5_hits / len(data_list)) * 100
print(f"Correct PMID found in top 5 results for {percentage_top_5}% of queries.")

  0%|                                                                                                                                                             | 0/300 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:37<00:00,  8.08it/s]

Correct PMID found in top 5 results for 15.666666666666668% of queries.





This experiment failed because the fine-tuning dataset contained only positive samples and no negative samples.

### Experiment 4
Fine-tuning Sentence-BERT (SBERT) with negative samples

In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from tqdm import tqdm
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
import random

# --- Experiment 4: fine-tune SBERT with positive AND negative pairs ---
# For every question, each chunk of its own abstract is a positive example
# (label 1.0) and one random chunk from each of two other abstracts is a
# negative example (label 0.0).
# NOTE(review): the evaluation cells also draw their questions from qap.csv;
# if those rows are part of the training data here, the reported accuracy is
# optimistic — consider holding the evaluation questions out.
training_df = pd.read_csv('qap.csv')
abstracts_df = pd.read_csv('data_cluster.csv')
# Attach each question's abstract through the shared PMID column.
merged_df = training_df.merge(abstracts_df, how='left', left_on='PMID', right_on='PMID')

# Drop pairs whose question or abstract is missing or whitespace-only.
filtered_df = merged_df.dropna(subset=['Question', 'Abstract'])
filtered_df = filtered_df[filtered_df['Question'].str.strip() != '']
filtered_df = filtered_df[filtered_df['Abstract'].str.strip() != '']

embed_model_id = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(embed_model_id)
# Same chunking as used when indexing: token-based with a 10-token overlap.
splitter = SentenceTransformersTokenTextSplitter(
    model_name=embed_model_id,
    chunk_overlap=10
)

training_examples = []
for _, row in tqdm(filtered_df.iterrows(), total=filtered_df.shape[0]):
    question = row['Question']
    raw_abstract = row['Abstract']
    correct_abstract = raw_abstract.replace("\n", " ")
    correct_chunks = splitter.split_text(text=correct_abstract)

    for chunk in correct_chunks:
        training_examples.append(InputExample(texts=[question, chunk], label=1.0))

    # Candidate negatives: every abstract other than this row's own.
    # The comparison must use the RAW abstract. The 'Abstract' column still
    # contains newlines, so comparing it with the newline-normalized text
    # would exclude nothing whenever the abstract has a newline, and the
    # positive abstract could then be sampled as a negative.
    # The pool is built once per row and reused for both draws.
    negative_pool = filtered_df.loc[filtered_df['Abstract'] != raw_abstract, 'Abstract']
    for _ in range(2):
        incorrect_abstract = negative_pool.sample(1).iloc[0].replace("\n", " ")
        incorrect_chunks = splitter.split_text(text=incorrect_abstract)

        # An abstract that yields no chunks contributes no negative example.
        if incorrect_chunks:
            incorrect_chunk = random.choice(incorrect_chunks)
            training_examples.append(InputExample(texts=[question, incorrect_chunk], label=0.0))

# One epoch with the cosine-similarity loss, then save the model to disk.
train_dataloader = DataLoader(training_examples, batch_size=16, shuffle=True)
train_loss = losses.CosineSimilarityLoss(model)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)
model.save('sbert_fin_2')

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12254/12254 [02:18<00:00, 88.40it/s]
Iteration: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2440/2440 [13:01<00:00,  3.12it/s]
Epoch: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [13:01<00:00, 781.32s/it]


In [4]:
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from opensearchpy import OpenSearch
import pandas as pd
from tqdm import tqdm

# --- Experiment 4: re-index the abstract chunks with the second fine-tuned model ---
# Load the model saved by the fine-tuning cell.
model = SentenceTransformer('sbert_fin_2')

# The splitter is built from the saved model as well; consecutive chunks
# overlap by 10 tokens.
splitter = SentenceTransformersTokenTextSplitter(
    model_name='sbert_fin_2',
    chunk_overlap=10,
)

index_name = 'abstracts_sbert_fin_2'

client = OpenSearch(
    hosts=[{"host": 'opensearch', "port": 9200}],
    http_auth=('admin', 'admin'),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

# One document per chunk: article metadata, the chunk's position within the
# abstract, the chunk text and its 768-dimensional embedding.
mapping = {
    "mappings": {
        "properties": {
            "pmid": {"type": "keyword"},
            "title": {"type": "text", "analyzer": "standard"},
            "vector": {"type": "knn_vector", "dimension": 768},
            "publishedDate": {"type": "date"},  # publication date
            "authors": {"type": "text"},  # author names
            "text_chunk_id": {"type": "integer"},
            "arxiv_text": {"type": "text", "analyzer": "standard"},
        }
    },
}
# ignore=400 suppresses HTTP 400 responses (e.g. the index already exists).
client.indices.create(index=index_name, body=mapping, ignore=400)

df = pd.read_csv('data_cluster.csv').drop(columns=['Keywords', 'Cluster'])
for _, record in tqdm(df.iterrows(), total=len(df)):
    abstract = record['Abstract'].replace("\n", " ")

    # Missing or 'unknown' publication dates are indexed as null.
    publishedDate = record['PubDate']
    if pd.isna(publishedDate) or publishedDate.strip().lower() == 'unknown':
        publishedDate = None

    # NOTE(review): this iterates the elements of record["Authors"]. If the CSV
    # cell is a plain string, those elements are single characters, so the
    # field would hold distinct lower-cased characters rather than author
    # names — confirm the column format.
    authors = list({term.lower() for term in record["Authors"]})

    # Metadata shared by every chunk of this abstract.
    pmid = record['PMID']
    title = record['ArticleTitle']
    for chunk_no, chunk_text in enumerate(splitter.split_text(text=abstract)):
        chunk_doc = {
            "pmid": pmid,
            "title": title,
            "vector": model.encode(chunk_text).tolist(),
            "publishedDate": publishedDate,
            "authors": authors,
            "text_chunk_id": chunk_no,
            "arxiv_text": chunk_text,
        }
        client.index(index=index_name, body=chunk_doc)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 58933/58933 [21:58<00:00, 44.69it/s]


In [12]:
# --- Experiment 4 evaluation: top-5 accuracy of the second fine-tuned model ---
# Relies on `client` and `data_list` from earlier cells.
top_5_hits = 0

# Reload the saved model so queries are embedded with the same weights that
# produced the indexed vectors.
model = SentenceTransformer('sbert_fin_2')

for query, correct_pmid in tqdm(data_list):
    query_vector = model.encode(query).tolist()

    # k-NN scoring script: cosine similarity between the query vector and the
    # "vector" field of every document matched by match_all.
    knn_script = {
        "source": "knn_score",
        "lang": "knn",
        "params": {
            "field": "vector",
            "query_value": query_vector,
            "space_type": "cosinesimil",
        },
    }
    # The index holds one document per chunk, so these are the 5 best chunks;
    # several of them may carry the same PMID.
    body = {
        "size": 5,
        "query": {
            "script_score": {"query": {"match_all": {}}, "script": knn_script},
        },
    }
    res = client.search(index='abstracts_sbert_fin_2', body=body)
    returned_pmids = [hit['_source']['pmid'] for hit in res['hits']['hits']]
    if correct_pmid in returned_pmids:
        top_5_hits += 1

percentage_top_5 = (top_5_hits / len(data_list)) * 100
print(f"Correct PMID found in top 5 results for {percentage_top_5}% of queries.")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:59<00:00,  8.39it/s]

Correct PMID found in top 5 results for 62.4% of queries.





For 62% of the questions, the correct abstract was found in the top 5 results. However, this is still lower than the 74.6% achieved by the lexical OpenSearch baseline.

Next Steps. A few ideas for improving results:
- Fine-tuning for a few more rounds, but being cautious of overfitting
- trying other models
- performing named entity recognition on the questions to increase the weight of important entities
- using LLM to rewrite the questions before inputting them into the search

### Experiment 5
