In [None]:
!pip install -q --progress-bar off --no-warn-conflicts llama-index-core llama-index-readers-docling llama-index-node-parser-docling llama-index-embeddings-huggingface llama-index-llms-huggingface-api llama-index-vector-stores-milvus llama-index-readers-file python-dotenv
!pip install backports.tarfile
!pip install docling --upgrade
!pip install llama-index-llms-huggingface
!pip install llama-index
!pip install docling

In [None]:
import os
# Hugging Face credentials and cache locations, exported as environment
# variables ahead of the `import transformers` that follows this block.

# Prefer a token that is already exported as HF_TOKEN so the secret does not
# have to be pasted into the notebook; otherwise fall back to the placeholder,
# which must be replaced by hand.
hf_token = os.environ.get("HF_TOKEN") or "HUGGINGFACE TOKEN HERE"
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token
os.environ["HF_TOKEN"] = hf_token

hf_cache_dir = 'YOUR_HOME_DIR/.cache/huggingface/'
os.environ['HF_HOME'] = hf_cache_dir
# Spelling fixed: the variable was previously written 'TRANSFOMERS_CACHE',
# so the intended cache override was never applied.
# NOTE(review): recent transformers releases deprecate TRANSFORMERS_CACHE in
# favour of HF_HOME (already set above) — confirm this override is still wanted.
os.environ['TRANSFORMERS_CACHE'] = hf_cache_dir
import transformers

In [None]:
import os
from pathlib import Path
from tempfile import mkdtemp
from warnings import filterwarnings
from transformers import BitsAndBytesConfig
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import torch
from llama_index.core import Settings
from transformers import AutoModelForCausalLM, AutoTokenizer
from llama_index.llms.huggingface import HuggingFaceLLM
from peft import PeftModel
from llama_index.core import Settings

# Instantiate the embedding model used for both indexing and retrieval.
EMBED_MODEL = HuggingFaceEmbedding(model_name="abhinand/MedEmbed-large-v0.1")

# Embed a throwaway string once to measure the vector length; the value is
# later handed to the vector store as its `dim`.
probe_vector = EMBED_MODEL.get_text_embedding("hi")
embed_dim = len(probe_vector)

# Register the model on the llama-index global settings object.
Settings.embed_model = EMBED_MODEL
print(f"Embed dim: {embed_dim}")

In [None]:
import datasets

# Directory holding the on-disk Hugging Face dataset.
baseDir = "YOUR_HOME_DIR/dataset_cpu_gpu_merge"

# Later cells iterate `dataset` row by row and read keys such as "question"
# and "answer".
# NOTE(review): that iteration assumes a single Dataset rather than a
# DatasetDict of splits — confirm what was saved at this path.
dataset = datasets.load_from_disk(dataset_path=baseDir)

In [None]:
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.readers.docling import DoclingReader
from llama_index.vector_stores.milvus import MilvusVectorStore
from docling.document_converter import DocumentConverter
from llama_index.node_parser.docling import DoclingNodeParser
from docling.chunking import HybridChunker

# Build a Milvus-backed vector index over the guideline document and persist
# the accompanying storage context.

# Markdown file that is read, split into nodes and embedded.
SOURCE = r"YOUR_HOME_DIR/guideline_edit.md"

reader = DoclingReader()
node_parser = MarkdownNodeParser()
# NOTE(review): `chunker` is not referenced by any other line visible in this
# notebook — confirm nothing else needs it before removing.
chunker = HybridChunker()

# The vector dimension comes from the embedding probe (`embed_dim`).
# `overwrite=True` is passed on every run — presumably this replaces any
# existing collection in the database file; verify against the Milvus
# integration docs if earlier contents must be kept.
milvus_db_path = Path("YOUR_HOME_DIR/datasets/docling_md_vectordb.db")
vector_store = MilvusVectorStore(
    uri=str(milvus_db_path),
    dim=embed_dim,
    overwrite=True,
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Load the document, then let the markdown node parser split it before the
# nodes are embedded with EMBED_MODEL and written to the vector store.
guideline_docs = reader.load_data(SOURCE)
index = VectorStoreIndex.from_documents(
    documents=guideline_docs,
    transformations=[node_parser],
    storage_context=storage_context,
    embed_model=EMBED_MODEL,
)

persist_dir = "YOUR_HOME_DIR/datasets/persistent_vector_store"
storage_context.persist(persist_dir=persist_dir)

In [None]:
def prepareForRaft():
    """Yield each dataset row augmented with retrieved context passages.

    Reads three notebook-level globals: ``dataset`` (the rows to process),
    ``index`` (the vector index to retrieve from) and
    ``includeAnswerThoughts`` (whether to prepend the reasoning to the answer).

    For every row the function:
      * retrieves the top 5 nodes for ``row["question"]``;
      * stores their text in ``row["sources"]`` (a list of strings);
      * stores "Question:\\n<question>\\n\\nContext:\\n<sources>" in
        ``row["input"]``;
      * if ``includeAnswerThoughts`` is truthy, rewrites ``row["answer"]`` as
        "<thoughts>\\n<answerThoughts></thoughts>\\n<answer>";
      * removes the "initialThoughts", "isDuplicate", "isAnswerable" and
        "answerThoughts" keys.

    Yields:
        The modified row.

    Raises:
        KeyError: if a row lacks any of the keys read or deleted above.
    """
    # Imported locally so the function does not depend on another cell's
    # import list.
    from llama_index.core.schema import MetadataMode

    # The retriever does not depend on the row, so build it once instead of
    # once per iteration.
    retriever = index.as_retriever(similarity_top_k=5)

    for e in dataset:
        query = e["question"]
        retrieved_docs = retriever.retrieve(query)

        # The original call passed the node's metadata *dict* as
        # `metadata_mode`. A dict matches none of the MetadataMode members,
        # which in llama-index-core's text nodes falls through to the
        # "include all metadata" path, so MetadataMode.ALL is the explicit
        # spelling of the text that was being produced.
        # NOTE(review): confirm against the installed llama-index-core version,
        # and switch to MetadataMode.NONE if metadata should not be in the text.
        sources = [
            s.get_content(metadata_mode=MetadataMode.ALL) for s in retrieved_docs
        ]
        sourcesStr = "\n\n".join(sources)
        e["sources"] = sources
        e["input"] = "Question:\n" + query + "\n\nContext:\n" + sourcesStr

        if includeAnswerThoughts:
            # Must run before "answerThoughts" is deleted below.
            e["answer"] = (
                "<thoughts>\n" + e["answerThoughts"] + "</thoughts>\n" + e["answer"]
            )

        # Drop the columns that are not carried into the output dataset.
        for dropped_key in (
            "initialThoughts",
            "isDuplicate",
            "isAnswerable",
            "answerThoughts",
        ):
            del e[dropped_key]

        yield e

In [None]:
from datasets import Dataset
# Flag read by prepareForRaft: when truthy, the reasoning text is prepended to
# each answer inside <thoughts> tags.
includeAnswerThoughts = False

# Materialise the generator's rows as a Dataset.
newDS = Dataset.from_generator(prepareForRaft)

# Print every field of the first row as a quick sanity check.
example = newDS[0]
for field, value in example.items():
    print(f"{field}:")
    print(value)

# Number of retrieved passages attached to that row.
print("Length documents: ", len(example["sources"]))

In [None]:
from datasets import DatasetDict
import os

# Fixed seed so the split is reproducible across runs.
seed = 1234

# Hold out 15% of the rows. Despite the variable name, the two parts are
# written out below as the training and validation sets.
test_valid = newDS.train_test_split(test_size=0.15, seed=seed)

convDSPath = "YOUR_HOME_DIR/datasets/deepseek_noCOT_raft/"

# os.makedirs creates missing parent directories as well and, with
# exist_ok=True, is a no-op when the directory is already there. The previous
# exists-check followed by os.mkdir failed when the parent was missing.
os.makedirs(convDSPath, exist_ok=True)

# os.path.join keeps the file paths correct whether or not convDSPath ends
# with a separator.
test_valid['train'].to_parquet(os.path.join(convDSPath, "train.parquet"))
test_valid['test'].to_parquet(os.path.join(convDSPath, "val.parquet"))

print("Train Length:", len(test_valid['train']))
print("Val Length:", len(test_valid['test']))

In [None]:
# Close the client object exposed by the Milvus vector store now that the
# notebook has finished using the index.
milvus_client = vector_store.client
milvus_client.close()