In [1]:
import pandas as pd
from langchain.text_splitter import TokenTextSplitter
from tqdm import tqdm
from langchain.text_splitter import CharacterTextSplitter
# from langchain_experimental.text_splitter import SemanticChunker
# from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.text_splitter import NLTKTextSplitter

df = pd.read_csv('PubmedData.csv').drop('Keywords', axis=1)

text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=250)
#text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=300, chunk_overlap=0)

#text_splitter = SemanticChunker(OpenAIEmbeddings())

#text_splitter = NLTKTextSplitter(chunk_size=500)

splitted = []

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    chunks = text_splitter.split_text(row['Abstract'])
    for i, chunk in enumerate(chunks):
        new_row = row.drop('Abstract') 
        new_row['chunk_id'] = i
        new_row['chunk_text'] = chunk
        splitted.append(new_row)

splitted_df = pd.DataFrame(splitted)
splitted_df.to_csv('splitted_pubmed_data.csv', index=False)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 58742/58742 [00:52<00:00, 1121.39it/s]


In [1]:
from FlagEmbedding import FlagModel
import pandas as pd
from opensearchpy import OpenSearch
from tqdm import tqdm

model = FlagModel('BAAI/bge-large-en-v1.5', 
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
                  use_fp16=False) # Setting use_fp16 to True speeds up computation with a slight performance degradation

df = pd.read_csv('splitted_pubmed_data.csv')

index_name = 'abstracts_bge'

client = OpenSearch(
    hosts=[{"host": 'opensearch', "port": 9200}],
    http_auth=('admin', 'admin'),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

# mapping = {
#     "mappings": {
#         "properties": {
#             "pmid": {
#                 "type": "keyword",  
#             },
#             "title": {
#                 "type": "text",
#                 "analyzer": "standard",  
#             },
#             "vector": {
#                 "type": "knn_vector",  
#                 "dimension": 1024
#             },
#             "publishedDate": {
#                 "type": "date",  # date type for publication date
#             },
#             "authors": {
#                 "type": "text",  # text field for author names
#             },
#             "text_chunk_id": {
#                 "type": "integer",
#             },
#             "arxiv_text": {
#                 "type": "text",
#                 "analyzer": "standard",
#             }
#         }
#     },
# }
# client.indices.create(index_name, body=mapping, ignore=400)

# for _, row in tqdm(df.iterrows(), total=df.shape[0]):
#     pmid = row['PMID']
#     publishedDate = row['PubDate']
#     if pd.isna(publishedDate) or publishedDate.strip().lower() == 'unknown':
#         publishedDate = None
#     title = row['ArticleTitle']
#     authors_list = row["Authors"].split(',')
#     authors = [author.strip().lower() for author in authors_list]
#     chunk_id = row['chunk_id']
#     chunk_text = row['chunk_text']
#     embedding = model.encode(chunk_text).tolist()
#     client.index(index_name, {
#         "pmid": pmid,
#         "title": title,
#         "vector": embedding,
#         "publishedDate": publishedDate,
#         "authors": authors,
#         "text_chunk_id": chunk_id,
#         "arxiv_text": chunk_text,
#     })

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import random

df_qa = pd.read_csv('qap.csv').drop('Answer', axis=1)
data_list = df_qa.values.tolist()
random.shuffle(data_list)
data_list = data_list[:1000]
top_5_hits = 0
for query, correct_pmid in tqdm(data_list):
    embedding = model.encode_queries(query).tolist()
    body = {
        "size": 5,
        "query": {
            "script_score": {
                "query": {
                    "match_all": {}
                },
                "script": {
                    "source": "knn_score",
                    "lang": "knn",
                    "params": {
                        "field": "vector",
                        "query_value": embedding,
                        "space_type": "cosinesimil"
                }
                }
            }
        }
    }
    res = client.search(index=index_name, body=body)
    search_results_pmids = [hit['_source']['pmid'] for hit in res['hits']['hits']]
    if correct_pmid in search_results_pmids:
        top_5_hits += 1
percentage_top_5 = (top_5_hits / len(data_list)) * 100
print(f"Correct PMID found in top 5 results for {percentage_top_5}% of queries.")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [02:31<00:00,  6.58it/s]

Correct PMID found in top 5 results for 71.89999999999999% of queries.





In [1]:
from FlagEmbedding import LLMEmbedder
import pandas as pd
from opensearchpy import OpenSearch
from tqdm import tqdm

model = LLMEmbedder('BAAI/llm-embedder', use_fp16=False)
task = "qa"

df = pd.read_csv('splitted_pubmed_data.csv')

index_name = 'abstracts_llm-embedder'

client = OpenSearch(
    hosts=[{"host": 'opensearch', "port": 9200}],
    http_auth=('admin', 'admin'),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

mapping = {
    "mappings": {
        "properties": {
            "pmid": {
                "type": "keyword",  
            },
            "title": {
                "type": "text",
                "analyzer": "standard",  
            },
            "vector": {
                "type": "knn_vector",  
                "dimension": 768
            },
            "publishedDate": {
                "type": "date",  # date type for publication date
            },
            "authors": {
                "type": "text",  # text field for author names
            },
            "text_chunk_id": {
                "type": "integer",
            },
            "arxiv_text": {
                "type": "text",
                "analyzer": "standard",
            }
        }
    },
}
client.indices.create(index_name, body=mapping, ignore=400)

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    pmid = row['PMID']
    publishedDate = row['PubDate']
    if pd.isna(publishedDate) or publishedDate.strip().lower() == 'unknown':
        publishedDate = None
    title = row['ArticleTitle']
    authors_list = row["Authors"].split(',')
    authors = [author.strip().lower() for author in authors_list]
    chunk_id = row['chunk_id']
    chunk_text = row['chunk_text']
    embedding = model.encode_keys(chunk_text, task=task).tolist()
    client.index(index_name, {
        "pmid": pmid,
        "title": title,
        "vector": embedding,
        "publishedDate": publishedDate,
        "authors": authors,
        "text_chunk_id": chunk_id,
        "arxiv_text": chunk_text,
    })

  from .autonotebook import tqdm as notebook_tqdm
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 62372/62372 [14:44<00:00, 70.50it/s]


In [2]:
df_qa = pd.read_csv('qap.csv').drop('Answer', axis=1)
data_list = df_qa.values.tolist()
data_list = data_list[:1000]
top_5_hits = 0
for query, correct_pmid in tqdm(data_list):
    embedding = model.encode_queries(query, task=task).tolist()
    body = {
        "size": 5,
        "query": {
            "script_score": {
                "query": {
                    "match_all": {}
                },
                "script": {
                    "source": "knn_score",
                    "lang": "knn",
                    "params": {
                        "field": "vector",
                        "query_value": embedding,
                        "space_type": "cosinesimil"
                }
                }
            }
        }
    }
    res = client.search(index=index_name, body=body)
    search_results_pmids = [hit['_source']['pmid'] for hit in res['hits']['hits']]
    if correct_pmid in search_results_pmids:
        top_5_hits += 1
percentage_top_5 = (top_5_hits / len(data_list)) * 100
print(f"Correct PMID found in top 5 results for {percentage_top_5}% of queries.")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [02:06<00:00,  7.90it/s]

Correct PMID found in top 5 results for 64.8% of queries.





In [None]:
import pandas as pd
from opensearchpy import OpenSearch
from tqdm import tqdm
from angle_emb import AnglE, Prompts


angle = AnglE.from_pretrained('WhereIsAI/UAE-Large-V1', pooling_strategy='cls').cuda()
angle.set_prompt(prompt=Prompts.C)

df = pd.read_csv('splitted_pubmed_data.csv')

index_name = 'abstracts_uae'

client = OpenSearch(
    hosts=[{"host": 'opensearch', "port": 9200}],
    http_auth=('admin', 'admin'),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

mapping = {
    "mappings": {
        "properties": {
            "pmid": {
                "type": "keyword",  
            },
            "title": {
                "type": "text",
                "analyzer": "standard",  
            },
            "vector": {
                "type": "knn_vector",  
                "dimension": 1024
            },
            "publishedDate": {
                "type": "date",  # date type for publication date
            },
            "authors": {
                "type": "text",  # text field for author names
            },
            "text_chunk_id": {
                "type": "integer",
            },
            "arxiv_text": {
                "type": "text",
                "analyzer": "standard",
            }
        }
    },
}
client.indices.create(index_name, body=mapping, ignore=400)

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    pmid = row['PMID']
    publishedDate = row['PubDate']
    if pd.isna(publishedDate) or publishedDate.strip().lower() == 'unknown':
        publishedDate = None
    title = row['ArticleTitle']
    authors_list = row["Authors"].split(',')
    authors = [author.strip().lower() for author in authors_list]
    chunk_id = row['chunk_id']
    chunk_text = row['chunk_text']
    vecs = angle.encode({'text': chunk_text}, to_numpy=True)
    embedding = vecs[0].tolist()
    client.index(index_name, {
        "pmid": pmid,
        "title": title,
        "vector": embedding,
        "publishedDate": publishedDate,
        "authors": authors,
        "text_chunk_id": chunk_id,
        "arxiv_text": chunk_text,
    })

In [None]:
import random
df_qa = pd.read_csv('qap.csv').drop('Answer', axis=1)
data_list = df_qa.values.tolist()
random.shuffle(data_list)
data_list = data_list[:1000]
top_5_hits = 0
for query, correct_pmid in tqdm(data_list):
    vecs = angle.encode({'text': query}, to_numpy=True)
    embedding = vecs[0].tolist()
    body = {
        "size": 5,
        "query": {
            "script_score": {
                "query": {
                    "match_all": {}
                },
                "script": {
                    "source": "knn_score",
                    "lang": "knn",
                    "params": {
                        "field": "vector",
                        "query_value": embedding,
                        "space_type": "cosinesimil"
                }
                }
            }
        }
    }
    res = client.search(index=index_name, body=body)
    search_results_pmids = [hit['_source']['pmid'] for hit in res['hits']['hits']]
    if correct_pmid in search_results_pmids:
        top_5_hits += 1
percentage_top_5 = (top_5_hits / len(data_list)) * 100
print(f"Correct PMID found in top 5 results for {percentage_top_5}% of queries.")

In [12]:
import pandas as pd

qap = pd.read_csv('qap.csv')
splitted = pd.read_csv('splitted_pubmed_data.csv')

pmid_counts = splitted['PMID'].value_counts()
splitted_filtered = splitted[splitted['PMID'].map(pmid_counts) == 1]

merged_df = qap.merge(splitted_filtered, how='left', left_on='PMID', right_on='PMID')
result_df = merged_df[['Question', 'Answer', 'chunk_text']]
result_df.to_csv('qaa.csv', index=False)

In [20]:
import pandas as pd
from langchain.text_splitter import NLTKTextSplitter
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import voyageai
import os

vo = voyageai.Client(api_key="pa-VidMEe9WZqvQl1nxwnR7PmW4uDGmHia0lzVyX0ftboo")

text_splitter = NLTKTextSplitter(chunk_size=500)

def clean_text(text):
    # 移除换行符和多余的空格
    return ' '.join(text.replace('\n', ' ').split())

input_file = 'qad.csv'
output_file = 'updated_qad.csv'
df = pd.read_csv(input_file)

if os.path.exists(output_file):
    processed_df = pd.read_csv(output_file)
else:
    processed_df = pd.DataFrame(columns=df.columns)

processed_indices = set(processed_df.index)

# 遍历每一行
for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing rows"):
    if index in processed_indices:
        continue
    # 分割chunk_text
    chunks = [clean_text(chunk) for chunk in text_splitter.split_text(row['chunk_text'])]

    # 获取每个chunk的嵌入向量
    chunk_vectors = vo.embed(chunks, model="voyage-lite-01-instruct", input_type="document").embeddings

    # 获取Question的嵌入向量
    question_vector = vo.embed(row['Question'], model="voyage-lite-01-instruct", input_type="document").embeddings[0]

    # 计算余弦相似度
    similarities = [cosine_similarity([question_vector], [chunk_vector])[0][0] for chunk_vector in chunk_vectors]

    # 获取Top 3相似的chunks
    top_indices = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)[:3]
    top_chunks = [chunks[i] for i in top_indices]

    # 更新DataFrame
    df.at[index, 'chunk_text'] = top_chunks[0] if top_chunks else None
    df.at[index, 'Top2'] = top_chunks[1] if len(top_chunks) > 1 else None
    df.at[index, 'Top3'] = top_chunks[2] if len(top_chunks) > 2 else None

    df.iloc[[index]].to_csv(output_file, mode='a', header=not os.path.exists(output_file), index=False)

Processing rows:  96%|██████████████████████████████████████████████████████████████████████████████████████████▏   | 8576/8940 [00:38<00:04, 88.62it/s]Created a chunk of size 505, which is longer than the specified 500
Processing rows:  97%|██████████████████████████████████████████████████████████████████████████████████████████▉   | 8644/8940 [01:38<00:32,  9.06it/s]Created a chunk of size 665, which is longer than the specified 500
Processing rows:  97%|███████████████████████████████████████████████████████████████████████████████████████████▌  | 8712/8940 [02:38<01:52,  2.03it/s]Created a chunk of size 866, which is longer than the specified 500
Processing rows:  98%|███████████████████████████████████████████████████████████████████████████████████████████▊  | 8733/8940 [02:58<02:13,  1.55it/s]Created a chunk of size 550, which is longer than the specified 500
Processing rows:  98%|████████████████████████████████████████████████████████████████████████████████████████████▎ | 87

In [21]:
import pandas as pd
import random
import json

df = pd.read_csv('updated_qad.csv')

train_data = []

for index, row in df.iterrows():
    query = row['Question']
    pos = [row['chunk_text']]
    neg = random.choice(list(df[df.index != index]['chunk_text']))
    train_data.append({"query": query, "pos": pos, "neg": [neg]})

with open('train_data.jsonl', 'w') as outfile:
    for entry in train_data:
        json.dump(entry, outfile)
        outfile.write('\n')

In [None]:
df = pd.read_csv('splitted_pubmed_data_NLTK.csv')

with open('candidate_pool.jsonl', 'w', encoding='utf-8') as file:
    for text in df['chunk_text']:
        json_obj = json.dumps({"text": text})
        file.write(json_obj + '\n')

python3 -m FlagEmbedding.baai_general_embedding.finetune.hn_mine \
--model_name_or_path BAAI/bge-large-en-v1.5  \
--input_file train_data.jsonl \
--output_file qa_finetune_data_minedHN.jsonl \
--range_for_sampling 50-300 \
--candidate_pool candidate_pool.jsonl \
--use_gpu_for_searching

In [2]:
import json
neg_count = 0

with open('qa_finetune_data_minedHN.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        data = json.loads(line)
        neg_count += len(data.get('neg', []))
        break

neg_count

15

In [3]:
import json

total_length = 0
count = 0

with open('qa_finetune_data_minedHN.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        data = json.loads(line)
        query = data.get('query', '')
        total_length += len(query)
        count += 1
average_length = total_length / count if count > 0 else 0

print(f"Average length of 'query': {average_length}")


Average length of 'query': 119.96017897091723


In [4]:
import json

total_length = 0
count = 0

with open('qa_finetune_data_minedHN.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        data = json.loads(line)

        for passage in data.get('pos', []):
            total_length += len(passage)
            count += 1

        for passage in data.get('neg', []):
            total_length += len(passage)
            count += 1

average_length = total_length / count if count > 0 else 0

print(f"Average length of 'passage': {average_length}")


Average length of 'passage': 408.1756431767338


torchrun --nproc_per_node 1 \
-m FlagEmbedding.baai_general_embedding.finetune.run \
--output_dir bge_large_fin \
--model_name_or_path BAAI/bge-large-en-v1.5 \
--train_data qa_finetune_data_minedHN.jsonl \
--learning_rate 1e-5 \
--fp16 \
--num_train_epochs 1 \
--per_device_train_batch_size 4 \
--dataloader_drop_last True \
--normlized True \
--temperature 0.02 \
--query_max_len 120 \
--passage_max_len 408 \
--train_group_size 5 \
--negatives_cross_device \
--logging_steps 100 \
--query_instruction_for_retrieval "" 

In [4]:
from FlagEmbedding import FlagModel
import pandas as pd
from opensearchpy import OpenSearch
from tqdm import tqdm

model = FlagModel('bge_large_fin', 
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
                  use_fp16=False) # Setting use_fp16 to True speeds up computation with a slight performance degradation

df = pd.read_csv('splitted_pubmed_data_NLTK.csv')

index_name = 'abstracts_bge_fin1'

client = OpenSearch(
    hosts=[{"host": 'opensearch', "port": 9200}],
    http_auth=('admin', 'admin'),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

mapping = {
    "mappings": {
        "properties": {
            "pmid": {
                "type": "keyword",  
            },
            "title": {
                "type": "text",
                "analyzer": "standard",  
            },
            "vector": {
                "type": "knn_vector",  
                "dimension": 1024
            },
            "publishedDate": {
                "type": "date",  # date type for publication date
            },
            "authors": {
                "type": "text",  # text field for author names
            },
            "text_chunk_id": {
                "type": "integer",
            },
            "arxiv_text": {
                "type": "text",
                "analyzer": "standard",
            }
        }
    },
}
client.indices.create(index_name, body=mapping, ignore=400)

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    pmid = row['PMID']
    publishedDate = row['PubDate']
    if pd.isna(publishedDate) or publishedDate.strip().lower() == 'unknown':
        publishedDate = None
    title = row['ArticleTitle']
    authors_list = row["Authors"].split(',')
    authors = [author.strip().lower() for author in authors_list]
    chunk_id = row['chunk_id']
    chunk_text = row['chunk_text']
    embedding = model.encode(chunk_text).tolist()
    client.index(index_name, {
        "pmid": pmid,
        "title": title,
        "vector": embedding,
        "publishedDate": publishedDate,
        "authors": authors,
        "text_chunk_id": chunk_id,
        "arxiv_text": chunk_text,
    })

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 285186/285186 [1:26:13<00:00, 55.12it/s]


In [6]:
import random

df_qa = pd.read_csv('qap.csv').drop('Answer', axis=1)
data_list = df_qa.values.tolist()
random.shuffle(data_list)
data_list = data_list[:1000]
top_5_hits = 0
for query, correct_pmid in tqdm(data_list):
    embedding = model.encode_queries(query).tolist()
    body = {
        "size": 5,
        "query": {
            "script_score": {
                "query": {
                    "match_all": {}
                },
                "script": {
                    "source": "knn_score",
                    "lang": "knn",
                    "params": {
                        "field": "vector",
                        "query_value": embedding,
                        "space_type": "cosinesimil"
                }
                }
            }
        }
    }
    res = client.search(index=index_name, body=body)
    search_results_pmids = [hit['_source']['pmid'] for hit in res['hits']['hits']]
    if correct_pmid in search_results_pmids:
        top_5_hits += 1
percentage_top_5 = (top_5_hits / len(data_list)) * 100
print(f"Correct PMID found in top 5 results for {percentage_top_5}% of queries.")

  0%|                                                                                                                          | 0/1000 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [08:53<00:00,  1.87it/s]

Correct PMID found in top 5 results for 75.5% of queries.





In [27]:
from datasets import load_dataset, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from angle_emb import AnglE, AngleDataTokenizer, Prompts
import json

angle = AnglE.from_pretrained('WhereIsAI/UAE-Large-V1', pooling_strategy='cls').cuda()
angle.set_prompt(prompt=Prompts.C)

with open('qa_finetune_data_minedHN.jsonl', 'r', encoding='utf-8') as file:
        lines = file.readlines()
        data = [{'text': json.loads(line)['query'], 
                        'positive': pos, 
                        'negative': neg} 
                for line in lines 
                for pos in json.loads(line)['pos'] 
                for neg in json.loads(line)['neg']]

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
valid_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)
train_ds = Dataset.from_pandas(pd.DataFrame(train_data))
valid_ds = Dataset.from_pandas(pd.DataFrame(valid_data))
test_ds = Dataset.from_pandas(pd.DataFrame(test_data))

train_ds = train_ds.shuffle().map(AngleDataTokenizer(angle.tokenizer, angle.max_length), num_proc=8)
valid_ds = valid_ds.map(AngleDataTokenizer(angle.tokenizer, angle.max_length), num_proc=8)
test_ds = test_ds.map(AngleDataTokenizer(angle.tokenizer, angle.max_length), num_proc=8)

# 4. fit
angle.fit(
    train_ds=train_ds,
    valid_ds=valid_ds,
    output_dir='ckpts/sts-b',
    batch_size=8,
    epochs=5,
    learning_rate=2e-5,
    save_steps=100,
    eval_steps=1000,
    warmup_steps=0,
    gradient_accumulation_steps=1,
    loss_kwargs={
        'w1': 1.0,
        'w2': 1.0,
        'w3': 1.0,
        'cosine_tau': 20,
        'ibn_tau': 20,
        'angle_tau': 1.0
    },
    fp16=True,
    logging_steps=100
)

# 5. evaluate
corrcoef, accuracy = angle.evaluate(test_ds, device=angle.device)
print('corrcoef:', corrcoef)