In [1]:
from FlagEmbedding import FlagModel
import pandas as pd
from opensearchpy import OpenSearch
from tqdm import tqdm

model = FlagModel('BAAI/bge-large-en-v1.5', 
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
                  use_fp16=False) # Setting use_fp16 to True speeds up computation with a slight performance degradation

df = pd.read_csv('splitted_pubmed_data_NLTK.csv')

index_name = 'abstracts_bge'

client = OpenSearch(
    hosts=[{"host": 'opensearch', "port": 9200}],
    http_auth=('admin', 'admin'),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

mapping = {
    "mappings": {
        "properties": {
            "pmid": {
                "type": "keyword",  
            },
            "title": {
                "type": "text",
                "analyzer": "standard",  
            },
            "vector": {
                "type": "knn_vector",  
                "dimension": 1024
            },
            "publishedDate": {
                "type": "date",  # date type for publication date
            },
            "authors": {
                "type": "text",  # text field for author names
            },
            "text_chunk_id": {
                "type": "integer",
            },
            "arxiv_text": {
                "type": "text",
                "analyzer": "standard",
            }
        }
    },
}
client.indices.create(index_name, body=mapping, ignore=400)

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    pmid = row['PMID']
    publishedDate = row['PubDate']
    if pd.isna(publishedDate) or publishedDate.strip().lower() == 'unknown':
        publishedDate = None
    title = row['ArticleTitle']
    authors_list = row["Authors"].split(',')
    authors = [author.strip().lower() for author in authors_list]
    chunk_id = row['chunk_id']
    chunk_text = row['chunk_text']
    embedding = model.encode(chunk_text).tolist()
    client.index(index_name, {
        "pmid": pmid,
        "title": title,
        "vector": embedding,
        "publishedDate": publishedDate,
        "authors": authors,
        "text_chunk_id": chunk_id,
        "arxiv_text": chunk_text,
    })

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
from FlagEmbedding import LLMEmbedder
import pandas as pd
from opensearchpy import OpenSearch
from tqdm import tqdm

model = LLMEmbedder('BAAI/llm-embedder', use_fp16=False)
task = "qa"

df = pd.read_csv('splitted_pubmed_data.csv')

index_name = 'abstracts_llm-embedder'

client = OpenSearch(
    hosts=[{"host": 'opensearch', "port": 9200}],
    http_auth=('admin', 'admin'),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

mapping = {
    "mappings": {
        "properties": {
            "pmid": {
                "type": "keyword",  
            },
            "title": {
                "type": "text",
                "analyzer": "standard",  
            },
            "vector": {
                "type": "knn_vector",  
                "dimension": 768
            },
            "publishedDate": {
                "type": "date",  # date type for publication date
            },
            "authors": {
                "type": "text",  # text field for author names
            },
            "text_chunk_id": {
                "type": "integer",
            },
            "arxiv_text": {
                "type": "text",
                "analyzer": "standard",
            }
        }
    },
}
client.indices.create(index_name, body=mapping, ignore=400)

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    pmid = row['PMID']
    publishedDate = row['PubDate']
    if pd.isna(publishedDate) or publishedDate.strip().lower() == 'unknown':
        publishedDate = None
    title = row['ArticleTitle']
    authors_list = row["Authors"].split(',')
    authors = [author.strip().lower() for author in authors_list]
    chunk_id = row['chunk_id']
    chunk_text = row['chunk_text']
    embedding = model.encode_keys(chunk_text, task=task).tolist()
    client.index(index_name, {
        "pmid": pmid,
        "title": title,
        "vector": embedding,
        "publishedDate": publishedDate,
        "authors": authors,
        "text_chunk_id": chunk_id,
        "arxiv_text": chunk_text,
    })

  from .autonotebook import tqdm as notebook_tqdm
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 62372/62372 [14:44<00:00, 70.50it/s]


In [None]:
import pandas as pd
from opensearchpy import OpenSearch
from tqdm import tqdm
from angle_emb import AnglE, Prompts


angle = AnglE.from_pretrained('WhereIsAI/UAE-Large-V1', pooling_strategy='cls').cuda()
angle.set_prompt(prompt=Prompts.C)

df = pd.read_csv('splitted_pubmed_data.csv')

index_name = 'abstracts_uae'

client = OpenSearch(
    hosts=[{"host": 'opensearch', "port": 9200}],
    http_auth=('admin', 'admin'),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

mapping = {
    "mappings": {
        "properties": {
            "pmid": {
                "type": "keyword",  
            },
            "title": {
                "type": "text",
                "analyzer": "standard",  
            },
            "vector": {
                "type": "knn_vector",  
                "dimension": 1024
            },
            "publishedDate": {
                "type": "date",  # date type for publication date
            },
            "authors": {
                "type": "text",  # text field for author names
            },
            "text_chunk_id": {
                "type": "integer",
            },
            "arxiv_text": {
                "type": "text",
                "analyzer": "standard",
            }
        }
    },
}
client.indices.create(index_name, body=mapping, ignore=400)

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    pmid = row['PMID']
    publishedDate = row['PubDate']
    if pd.isna(publishedDate) or publishedDate.strip().lower() == 'unknown':
        publishedDate = None
    title = row['ArticleTitle']
    authors_list = row["Authors"].split(',')
    authors = [author.strip().lower() for author in authors_list]
    chunk_id = row['chunk_id']
    chunk_text = row['chunk_text']
    vecs = angle.encode({'text': chunk_text}, to_numpy=True)
    embedding = vecs[0].tolist()
    client.index(index_name, {
        "pmid": pmid,
        "title": title,
        "vector": embedding,
        "publishedDate": publishedDate,
        "authors": authors,
        "text_chunk_id": chunk_id,
        "arxiv_text": chunk_text,
    })

In [None]:
import random
df_qa = pd.read_csv('qap.csv').drop('Answer', axis=1)
data_list = df_qa.values.tolist()
random.shuffle(data_list)
data_list = data_list[:1000]
top_5_hits = 0
for query, correct_pmid in tqdm(data_list):
    vecs = angle.encode({'text': query}, to_numpy=True)
    embedding = vecs[0].tolist()
    body = {
        "size": 5,
        "query": {
            "script_score": {
                "query": {
                    "match_all": {}
                },
                "script": {
                    "source": "knn_score",
                    "lang": "knn",
                    "params": {
                        "field": "vector",
                        "query_value": embedding,
                        "space_type": "cosinesimil"
                }
                }
            }
        }
    }
    res = client.search(index=index_name, body=body)
    search_results_pmids = [hit['_source']['pmid'] for hit in res['hits']['hits']]
    if correct_pmid in search_results_pmids:
        top_5_hits += 1
percentage_top_5 = (top_5_hits / len(data_list)) * 100
print(f"Correct PMID found in top 5 results for {percentage_top_5}% of queries.")

In [2]:
import json
neg_count = 0

with open('qa_finetune_data_minedHN.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        data = json.loads(line)
        neg_count += len(data.get('neg', []))
        break

neg_count

15

In [3]:
import json

total_length = 0
count = 0

with open('qa_finetune_data_minedHN.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        data = json.loads(line)
        query = data.get('query', '')
        total_length += len(query)
        count += 1
average_length = total_length / count if count > 0 else 0

print(f"Average length of 'query': {average_length}")


Average length of 'query': 119.96017897091723


In [4]:
import json

total_length = 0
count = 0

with open('qa_finetune_data_minedHN.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        data = json.loads(line)

        for passage in data.get('pos', []):
            total_length += len(passage)
            count += 1

        for passage in data.get('neg', []):
            total_length += len(passage)
            count += 1

average_length = total_length / count if count > 0 else 0

print(f"Average length of 'passage': {average_length}")


Average length of 'passage': 408.1756431767338


torchrun --nproc_per_node 1 \
-m FlagEmbedding.baai_general_embedding.finetune.run \
--output_dir bge_large_fin \
--model_name_or_path BAAI/bge-large-en-v1.5 \
--train_data qa_finetune_data_minedHN.jsonl \
--learning_rate 1e-5 \
--fp16 \
--num_train_epochs 1 \
--per_device_train_batch_size 4 \
--dataloader_drop_last True \
--normlized True \
--temperature 0.02 \
--query_max_len 120 \
--passage_max_len 408 \
--train_group_size 5 \
--negatives_cross_device \
--logging_steps 100 \
--query_instruction_for_retrieval "" 

In [4]:
from FlagEmbedding import FlagModel
import pandas as pd
from opensearchpy import OpenSearch
from tqdm import tqdm

model = FlagModel('bge_large_fin', 
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
                  use_fp16=False) # Setting use_fp16 to True speeds up computation with a slight performance degradation

df = pd.read_csv('splitted_pubmed_data_NLTK.csv')

index_name = 'abstracts_bge_fin1'

client = OpenSearch(
    hosts=[{"host": 'opensearch', "port": 9200}],
    http_auth=('admin', 'admin'),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

mapping = {
    "mappings": {
        "properties": {
            "pmid": {
                "type": "keyword",  
            },
            "title": {
                "type": "text",
                "analyzer": "standard",  
            },
            "vector": {
                "type": "knn_vector",  
                "dimension": 1024
            },
            "publishedDate": {
                "type": "date",  # date type for publication date
            },
            "authors": {
                "type": "text",  # text field for author names
            },
            "text_chunk_id": {
                "type": "integer",
            },
            "arxiv_text": {
                "type": "text",
                "analyzer": "standard",
            }
        }
    },
}
client.indices.create(index_name, body=mapping, ignore=400)

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    pmid = row['PMID']
    publishedDate = row['PubDate']
    if pd.isna(publishedDate) or publishedDate.strip().lower() == 'unknown':
        publishedDate = None
    title = row['ArticleTitle']
    authors_list = row["Authors"].split(',')
    authors = [author.strip().lower() for author in authors_list]
    chunk_id = row['chunk_id']
    chunk_text = row['chunk_text']
    embedding = model.encode(chunk_text).tolist()
    client.index(index_name, {
        "pmid": pmid,
        "title": title,
        "vector": embedding,
        "publishedDate": publishedDate,
        "authors": authors,
        "text_chunk_id": chunk_id,
        "arxiv_text": chunk_text,
    })

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 285186/285186 [1:26:13<00:00, 55.12it/s]


In [27]:
from datasets import load_dataset, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from angle_emb import AnglE, AngleDataTokenizer, Prompts
import json

angle = AnglE.from_pretrained('WhereIsAI/UAE-Large-V1', pooling_strategy='cls').cuda()
angle.set_prompt(prompt=Prompts.C)

with open('qa_finetune_data_minedHN.jsonl', 'r', encoding='utf-8') as file:
        lines = file.readlines()
        data = [{'text': json.loads(line)['query'], 
                        'positive': pos, 
                        'negative': neg} 
                for line in lines 
                for pos in json.loads(line)['pos'] 
                for neg in json.loads(line)['neg']]

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
valid_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)
train_ds = Dataset.from_pandas(pd.DataFrame(train_data))
valid_ds = Dataset.from_pandas(pd.DataFrame(valid_data))
test_ds = Dataset.from_pandas(pd.DataFrame(test_data))

train_ds = train_ds.shuffle().map(AngleDataTokenizer(angle.tokenizer, angle.max_length), num_proc=8)
valid_ds = valid_ds.map(AngleDataTokenizer(angle.tokenizer, angle.max_length), num_proc=8)
test_ds = test_ds.map(AngleDataTokenizer(angle.tokenizer, angle.max_length), num_proc=8)

# 4. fit
angle.fit(
    train_ds=train_ds,
    valid_ds=valid_ds,
    output_dir='ckpts/sts-b',
    batch_size=8,
    epochs=5,
    learning_rate=2e-5,
    save_steps=100,
    eval_steps=1000,
    warmup_steps=0,
    gradient_accumulation_steps=1,
    loss_kwargs={
        'w1': 1.0,
        'w2': 1.0,
        'w3': 1.0,
        'cosine_tau': 20,
        'ibn_tau': 20,
        'angle_tau': 1.0
    },
    fp16=True,
    logging_steps=100
)

# 5. evaluate
corrcoef, accuracy = angle.evaluate(test_ds, device=angle.device)
print('corrcoef:', corrcoef)

In [9]:
from FlagEmbedding import FlagModel
import pandas as pd
from tqdm import tqdm
from pinecone import Pinecone, ServerlessSpec

model = FlagModel('bge_large_fin', 
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
                  use_fp16=True)

df = pd.read_csv('splitted_pubmed_data_NLTK.csv')

pc = Pinecone(api_key="621f7574-8c97-4f46-8c5e-186dd099d33b")

pc.create_index(
    name="bge-fin",
    dimension=1024, 
    metric="cosine", 
    spec=ServerlessSpec(
        cloud="aws",
        region="us-west-2"
    ) 
)

index = pc.Index("bge-fin")

batch_size = 64

for start_idx in tqdm(range(0, df.shape[0], batch_size)):
    end_idx = start_idx + batch_size
    batch = df.iloc[start_idx:end_idx]
    vectors_to_upsert = []

    for _, row in batch.iterrows():
        pmid = str(row['PMID'])
        chunk_id = str(row['chunk_id'])
        unique_id = f"{pmid}_{chunk_id}"

        chunk_text = row['chunk_text']
        embedding = model.encode(chunk_text).tolist()

        publishedDate = row['PubDate'] if pd.notna(row['PubDate']) and row['PubDate'].strip().lower() != 'unknown' else ""

        vectors_to_upsert.append({
            "id": unique_id,
            "values": embedding,
            "metadata": {
                "pmid": pmid,
                "title": row['ArticleTitle'],
                "publishedDate": publishedDate,  
                "authors": [author.strip().lower() for author in row["Authors"].split(',')],
                "text_chunk_id": chunk_id,
                "arxiv_text": chunk_text,
            }
        })

    if vectors_to_upsert:
        index.upsert(vectors=vectors_to_upsert)


100%|█████████████████████████████████████████████████████████████████████████████| 4457/4457 [2:00:13<00:00,  1.62s/it]


In [15]:
from pinecone import Pinecone
from FlagEmbedding import FlagModel

model = FlagModel('bge_large_fin', 
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
                  use_fp16=True)

pc = Pinecone(api_key="621f7574-8c97-4f46-8c5e-186dd099d33b")
index = pc.Index("bge-fin")

def search_arxiv_texts(query):
    
    query_vector = model.encode_queries([query])[0].tolist()
    
    response = index.query(
        vector=query_vector,
        top_k=3,
        include_metadata=True  
    )

    
    arxiv_texts = [match['metadata']['arxiv_text'] for match in response['matches']]

    return arxiv_texts


query = "What was the purpose of the US Food and Drug Administration-cosponsored forum on laser-based imaging?"
top_arxiv_texts = search_arxiv_texts(query)
print(top_arxiv_texts)


['In April 2019, the US Food and Drug Administration, in conjunction with 11 professional ophthalmic, vision science, and optometric societies, convened a forum on laser-based imaging.\n\nThe forum brought together the Food and Drug Administration, clinicians, researchers, industry members, and other stakeholders to stimulate innovation and ensure that patients in the US are the first in the world to have access to high-quality, safe, and effective medical devices.', '"Actions" are treated very generally, from mass protests to votes and other collective decisions, such as, e.g., acceptance (often unconscious) of some societal recommendations.\n\nIn this paper, we concentrate on the theory of laser resonators, physical vs. social.\n\nFor the latter, we analyze in detail the functioning of Internet-based echo chambers.\n\nTheir main purpose is increasing of the power of the quantum information field as well as its coherence.', 'Background & Aims: The United States Food and Drug Administr