In [2]:
import json

# Peek at the first record of the mined hard-negative file only, to see how
# many entries its 'neg' list holds (0 if the key is absent or the file is empty).
with open('qa_finetune_data_minedHN.jsonl', 'r', encoding='utf-8') as file:
    first_line = next(file, None)

if first_line is None:
    neg_count = 0
else:
    neg_count = len(json.loads(first_line).get('neg', []))

neg_count

15

In [3]:
import json

# Gather len() of each record's 'query' field (a missing key counts as '').
# Note this is the Python length of the field, not a tokenizer token count.
with open('qa_finetune_data_minedHN.jsonl', 'r', encoding='utf-8') as file:
    query_lengths = [len(json.loads(line).get('query', '')) for line in file]

total_length = sum(query_lengths)
count = len(query_lengths)

# Guard against an empty file so the division cannot fail.
if count > 0:
    average_length = total_length / count
else:
    average_length = 0

print(f"Average length of 'query': {average_length}")


Average length of 'query': 119.96017897091723


In [4]:
import json

# Gather len() of every passage found under 'pos' and 'neg' across all records
# (a missing key contributes nothing). Positives and negatives are pooled into
# one average. This is the Python length of each entry, not a token count.
with open('qa_finetune_data_minedHN.jsonl', 'r', encoding='utf-8') as file:
    passage_lengths = [
        len(passage)
        for record in map(json.loads, file)
        for field in ('pos', 'neg')
        for passage in record.get(field, [])
    ]

total_length = sum(passage_lengths)
count = len(passage_lengths)

# Guard against a file with no passages so the division cannot fail.
if count > 0:
    average_length = total_length / count
else:
    average_length = 0

print(f"Average length of 'passage': {average_length}")


Average length of 'passage': 408.1756431767338


# Fine-tune BAAI/bge-large-en-v1.5 on qa_finetune_data_minedHN.jsonl with a
# single process, writing the resulting model to ./bge_large_fin (one epoch,
# fp16, per-device batch size 4, learning rate 1e-5).
# NOTE(review): --query_max_len 120 / --passage_max_len 408 match the average
#   lengths printed by the cells above; those averages come from len() on the
#   raw fields, whereas these flags are presumably token limits — confirm the
#   intended values.
# NOTE(review): `--normlized` looks misspelled but appears to be the flag name
#   the FlagEmbedding trainer expects — confirm before "fixing" it.
# NOTE(review): the query instruction is empty here; keep it consistent with
#   whatever instruction is used when encoding queries at inference time.
torchrun --nproc_per_node 1 \
-m FlagEmbedding.baai_general_embedding.finetune.run \
--output_dir bge_large_fin \
--model_name_or_path BAAI/bge-large-en-v1.5 \
--train_data qa_finetune_data_minedHN.jsonl \
--learning_rate 1e-5 \
--fp16 \
--num_train_epochs 1 \
--per_device_train_batch_size 4 \
--dataloader_drop_last True \
--normlized True \
--temperature 0.02 \
--query_max_len 120 \
--passage_max_len 408 \
--train_group_size 5 \
--negatives_cross_device \
--logging_steps 100 \
--query_instruction_for_retrieval "" 

In [9]:
import os

from FlagEmbedding import FlagModel
import pandas as pd
from tqdm import tqdm
from pinecone import Pinecone, ServerlessSpec

# Embed every text chunk of the PubMed CSV with the fine-tuned model and
# upsert the vectors (plus article metadata) into the "bge-fin" Pinecone index.

INDEX_NAME = "bge-fin"
EMBEDDING_DIM = 1024

# NOTE(review): the fine-tuning command in this notebook passes an empty
# --query_instruction_for_retrieval, while a non-empty instruction is set here.
# It only affects query encoding (not the passage encoding done below), but the
# two should agree — confirm which one is intended.
model = FlagModel('bge_large_fin',
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
                  use_fp16=True)

df = pd.read_csv('splitted_pubmed_data_NLTK.csv')

# Never hardcode credentials in a notebook: read the key from the environment.
# A KeyError here means PINECONE_API_KEY has not been exported. Any key that was
# ever committed in source should be treated as leaked and rotated.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Creating an index that already exists raises, which would break every re-run
# of this cell; only create it when it is missing.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=EMBEDDING_DIM,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-west-2"
        )
    )

index = pc.Index(INDEX_NAME)

batch_size = 64


def _clean_pub_date(value):
    """Return the publication date as a string, or "" if missing or 'unknown'."""
    if pd.isna(value):
        return ""
    text = str(value)  # tolerate columns pandas parsed as numbers
    return "" if text.strip().lower() == 'unknown' else text


def _split_authors(value):
    """Split a comma-separated author string into stripped, lower-cased names."""
    if pd.isna(value):
        return []
    return [author.strip().lower() for author in str(value).split(',')]


for start_idx in tqdm(range(0, df.shape[0], batch_size)):
    batch = df.iloc[start_idx:start_idx + batch_size]

    # Encode the whole batch in one call instead of one model call per row.
    embeddings = model.encode(batch['chunk_text'].tolist())

    vectors_to_upsert = []
    for (_, row), embedding in zip(batch.iterrows(), embeddings):
        pmid = str(row['PMID'])
        chunk_id = str(row['chunk_id'])

        vectors_to_upsert.append({
            # PMID + chunk id makes the vector id unique per chunk.
            "id": f"{pmid}_{chunk_id}",
            "values": embedding.tolist(),
            "metadata": {
                "pmid": pmid,
                "title": row['ArticleTitle'] if pd.notna(row['ArticleTitle']) else "",
                "publishedDate": _clean_pub_date(row['PubDate']),
                "authors": _split_authors(row["Authors"]),
                "text_chunk_id": chunk_id,
                # Key name kept as-is for compatibility with existing readers,
                # even though the text comes from the PubMed CSV.
                "arxiv_text": row['chunk_text'],
            }
        })

    if vectors_to_upsert:
        index.upsert(vectors=vectors_to_upsert)


100%|█████████████████████████████████████████████████████████████████████████████| 4457/4457 [2:00:13<00:00,  1.62s/it]
