## Functions

In [1]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

1. Calculate number of tokens

In [2]:
import tiktoken
from bs4 import BeautifulSoup as Soup

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens


2. Chunking

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

chunk_size_tok = 2000 
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=chunk_size_tok, chunk_overlap=0
)

3. Raptor

In [4]:
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
#import umap
import umap.umap_ as umap
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from sklearn.mixture import GaussianMixture

RANDOM_SEED = 224  # Fixed seed for reproducibility
'''
全局降维
'''
def global_cluster_embeddings(
    embeddings: np.ndarray,
    dim: int,
    n_neighbors: Optional[int] = None,
    metric: str = "cosine",
) -> np.ndarray:
    """
    Returns:
    - A numpy array of the embeddings reduced to the specified dimensionality.
    """
    # 使用 UMAP 进行全局的维度降维处理。可以理解为将输入的高维嵌入映射到低维空间中。
    # 参数:
    # - embeddings: 输入的嵌入矩阵 (numpy array)。
    # - dim: 目标维度，表示降维后嵌入的维度。
    # - n_neighbors: 邻居的数量，默认为嵌入数量的平方根。
    # - metric: 距离度量方法，默认为"cosine"。
    if n_neighbors is None:
        n_neighbors = int((len(embeddings) - 1) ** 0.5)
    return umap.UMAP(
        n_neighbors=n_neighbors, n_components=dim, metric=metric
    ).fit_transform(embeddings)


'''
局部降维
'''
def local_cluster_embeddings(
    embeddings: np.ndarray, dim: int, num_neighbors: int = 10, metric: str = "cosine" #initially 50
) -> np.ndarray:
    """
    Perform local dimensionality reduction on the embeddings using UMAP, typically after global clustering.

    Parameters:
    - embeddings: The input embeddings as a numpy array.
    - dim: The target dimensionality for the reduced space.
    - num_neighbors: The number of neighbors to consider for each point.
    - metric: The distance metric to use for UMAP.

    # 对嵌入执行局部降维，通常在全局聚类之后使用。

    Returns:
    - A numpy array of the embeddings reduced to the specified dimensionality.
    """
    return umap.UMAP(
        n_neighbors=num_neighbors, n_components=dim, metric=metric
    ).fit_transform(embeddings)

'''
找到最佳的聚类数量
'''
def get_optimal_clusters(
    embeddings: np.ndarray, max_clusters: int = 10, random_state: int = RANDOM_SEED
) -> int:
    """
    Parameters:
    - embeddings: The input embeddings as a numpy array.
    - max_clusters: The maximum number of clusters to consider.
    - random_state: Seed for reproducibility.
    # 使用贝叶斯信息准则 (BIC) 通过高斯混合模型 (GMM) 确定最佳聚类数。
    # 该函数使用高斯混合模型（GMM）对嵌入进行聚类，并通过 贝叶斯信息准则（BIC） 来确定最佳的聚类数。
    贝叶斯信息准则（Bayesian Information Criterion, BIC） 是一种统计学指标，用于模型选择。
    它在评估统计模型时，平衡了模型的复杂度和拟合优度，帮助避免过拟合。BIC 的主要思想是：一个好的模型不仅应该很好地拟合数据，还应该尽量简洁。
    Returns:
    - An integer representing the optimal number of clusters found.
    """
    max_clusters = min(max_clusters, len(embeddings))
    n_clusters = np.arange(1, max_clusters)
    bics = []
    for n in n_clusters:
        gm = GaussianMixture(n_components=n, random_state=random_state)
        gm.fit(embeddings)
        bics.append(gm.bic(embeddings)) ## 计算每个模型的 BIC 值
    return n_clusters[np.argmin(bics)]  ## 选择 BIC 最小的聚类数


'''
使用GMM进行聚类
'''
def GMM_cluster(embeddings: np.ndarray, threshold: float, random_state: int = 0):
    """

    Parameters:
    - embeddings: The input embeddings as a numpy array.
    - threshold: The probability threshold for assigning an embedding to a cluster.
    - random_state: Seed for reproducibility.

     # 使用高斯混合模型（GMM）基于概率阈值对嵌入进行聚类。
    Returns:
    - A tuple containing the cluster labels and the number of clusters determined.
    """
    n_clusters = get_optimal_clusters(embeddings)  # 获得最佳聚类数
    gm = GaussianMixture(n_components=n_clusters, random_state=random_state)
    gm.fit(embeddings)
    probs = gm.predict_proba(embeddings) # 获取每个embedding属于每个聚类的概率
    labels = [np.where(prob > threshold)[0] for prob in probs] #基于阈值分配聚类，返回每个嵌入的聚类标签。
    return labels, n_clusters


'''
执行聚类
'''
def perform_clustering(
    embeddings: np.ndarray,
    dim: int,
    threshold: float,
) -> List[np.ndarray]:
    """
    Perform clustering on the embeddings by first reducing their dimensionality globally, then clustering
    using a Gaussian Mixture Model, and finally performing local clustering within each global cluster.

    Parameters:
    - embeddings: The input embeddings as a numpy array.
    - dim: The target dimensionality for UMAP reduction.
    - threshold: The probability threshold for assigning an embedding to a cluster in GMM.


    # 首先对嵌入数据进行全局降维、全局聚类，然后在每个全局聚类内进行局部降维和局部聚类。


    Returns:
    - A list of numpy arrays, where each array contains the cluster IDs for each embedding.
    """
    if len(embeddings) <= dim + 1:
        # Avoid clustering when there's insufficient data
        return [np.array([0]) for _ in range(len(embeddings))]

    # Global dimensionality reduction
    reduced_embeddings_global = global_cluster_embeddings(embeddings, dim)
    # Global clustering
    global_clusters, n_global_clusters = GMM_cluster(
        reduced_embeddings_global, threshold
    )

    all_local_clusters = [np.array([]) for _ in range(len(embeddings))] #空列表，用于存储每个数据点的局部聚类标签
    total_clusters = 0 #当前总聚类数

    # Iterate through each global cluster to perform local clustering
    for i in range(n_global_clusters):  # 遍历每个全局聚类并执行局部聚类
        # Extract embeddings belonging to the current global cluster
        global_cluster_embeddings_ = embeddings[
            np.array([i in gc for gc in global_clusters])
        ] # 找到该全局聚类中的所有数据点

        if len(global_cluster_embeddings_) == 0:
            continue
        if len(global_cluster_embeddings_) <= dim + 1:
            # Handle small clusters with direct assignment
            local_clusters = [np.array([0]) for _ in global_cluster_embeddings_]
            n_local_clusters = 1
        else:
            # 如果数据点足够多，首先调用 local_cluster_embeddings 函数，对这些数据点执行局部降维。
            # 然后再使用 GMM_cluster 进行局部聚类。
            # Local dimensionality reduction and clustering
            reduced_embeddings_local = local_cluster_embeddings(
                global_cluster_embeddings_, dim
            )
            local_clusters, n_local_clusters = GMM_cluster(
                reduced_embeddings_local, threshold
            )

        # Assign local cluster IDs, adjusting for total clusters already processed
        # 为每个局部聚类分配标签
        # 在每个局部聚类内，首先找到局部聚类 j 中的嵌入点。
        # 然后使用 np.where 函数找出原始嵌入中对应的索引位置，并将局部聚类标签 j + total_clusters 赋给这些数据点。
        for j in range(n_local_clusters):
            local_cluster_embeddings_ = global_cluster_embeddings_[
                np.array([j in lc for lc in local_clusters])
            ]
            indices = np.where(
                (embeddings == local_cluster_embeddings_[:, None]).all(-1)
            )[1]
            for idx in indices:
                all_local_clusters[idx] = np.append(
                    all_local_clusters[idx], j + total_clusters
                )

        total_clusters += n_local_clusters

    return all_local_clusters


def embed(texts):
    """
    Generate embeddings for a list of text documents.

    Returns:
    - numpy.ndarray: An array of embeddings for the given text documents.
    """
    text_embeddings = embd.encode(texts)
    text_embeddings_np = np.array(text_embeddings)
    print("text_embeddings_np",text_embeddings_np.shape)
    return text_embeddings_np

'''
生成embedding并聚类
'''
def embed_cluster_texts(texts):
    """
    Embeds a list of texts and clusters them, returning a DataFrame with texts, their embeddings, and cluster labels.

    This function combines embedding generation and clustering into a single step. It assumes the existence
    of a previously defined `perform_clustering` function that performs clustering on the embeddings.

    Parameters:
    - texts: List[str], a list of text documents to be processed.

    Returns:
    - pandas.DataFrame: A DataFrame containing the original texts, their embeddings, and the assigned cluster labels.
    """
    text_embeddings_np = embed(texts)  # Generate embeddings
    # 对生成的嵌入进行聚类，降维后的维度为 10，聚类阈值为 0.1。
    cluster_labels = perform_clustering(
        text_embeddings_np, 10, 0.1
    )  # Perform clustering on the embeddings
    df = pd.DataFrame()  # Initialize a DataFrame to store the results
    df["text"] = texts  # Store original texts
    df["embd"] = list(text_embeddings_np)  # Store embeddings as a list in the DataFrame
    df["cluster"] = cluster_labels  # Store cluster labels
    return df

def fmt_txt(df: pd.DataFrame) -> str:
    """
    Formats the text documents in a DataFrame into a single string.

    """
    unique_txt = df["text"].tolist()
    return "--- --- \n --- --- ".join(unique_txt)


#20241111 - We will use llama3 or other methods here instead of gpt4o

'''
生成embedding--->聚类----->生成summary
'''
def embed_cluster_summarize_texts(
    texts: List[str], level: int
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Parameters:
    - texts: A list of text documents to be processed.
    - level: An integer parameter that could define the depth or detail of processing.

    生成文本嵌入、基于相似性进行聚类，然后对每个聚类中的文本内容生成总结。

    Returns:
    - Tuple containing two DataFrames:
      1. The first DataFrame (`df_clusters`) includes the original texts, their embeddings, and cluster assignments.
      2. The second DataFrame (`df_summary`) contains summaries for each cluster, the specified level of detail,
         and the cluster identifiers.
    """

    # Embed and cluster the texts, resulting in a DataFrame with 'text', 'embd', and 'cluster' columns
    df_clusters = embed_cluster_texts(texts)
    '''
    df = pd.DataFrame()  # Initialize a DataFrame to store the results
    df["text"] = texts  # Store original texts
    df["embd"] = list(text_embeddings_np)  # Store embeddings as a list in the DataFrame
    df["cluster"] = cluster_labels  # Store cluster labels
    '''

    # Prepare to expand the DataFrame for easier manipulation of clusters
    expanded_list = []

    # Expand DataFrame entries to document-cluster pairings for straightforward processing
    # 如果文本属于多个聚类，则将其扩展为多行，每行代表文本与一个聚类的关联。
    for index, row in df_clusters.iterrows():
        for cluster in row["cluster"]:
            expanded_list.append(
                {"text": row["text"], "embd": row["embd"], "cluster": cluster}
            )

    # Create a new DataFrame from the expanded list
    expanded_df = pd.DataFrame(expanded_list)

    # Retrieve unique cluster identifiers for processing
    all_clusters = expanded_df["cluster"].unique()

    print(f"--Generated {len(all_clusters)} clusters--")

#     # Summarization
#     template = """Here is a sub-set of LangChain Expression Language doc.

#     LangChain Expression Language provides a way to compose chain in LangChain.

#     Give a detailed summary of the documentation provided.

#     Documentation:
#     {context}
#     """
    
    #20241111 - modify the prompt to be more about radiology
    
    summaries = []
    for i in all_clusters:
        df_cluster = expanded_df[expanded_df["cluster"] == i]
        formatted_txt = fmt_txt(df_cluster)

        prompt = f"""Here is a sub-set of radiology documentation, which may include information from radiology books, system guides, or research papers.

        This document contains radiology-related content, including imaging techniques, diagnostic criteria, system guidelines, and medical research findings.

        Provide a detailed summary of the information provided, focusing on key concepts, diagnostic methods, and relevant findings in less than 50 words.

        Documentation:
        {formatted_txt}
        
        Summary:
        """
        summaries.append(generate_response(prompt))


    # Create a DataFrame to store summaries with their corresponding cluster and level
    df_summary = pd.DataFrame(
        {
            "summaries": summaries,
            "level": [level] * len(summaries),
            "cluster": list(all_clusters),
        }
    )

    return df_clusters, df_summary


# 在每个递归层次中，对输入的文本执行嵌入、聚类和总结操作，将结果存储下来。
# 如果当前层次的聚类数超过 1，并且还没有达到最大递归深度，则用当前层次的总结文本作为下一层次的输入，继续递归。
def recursive_embed_cluster_summarize(
    texts: List[str], level: int = 1, n_levels: int = 3
) -> Dict[int, Tuple[pd.DataFrame, pd.DataFrame]]:
    # level：当前递归的层次，初始值为 1。
    # n_levels：最大递归深度，默认是 3。

    """
    Parameters:
    - texts: List[str], texts to be processed.
    - level: int, current recursion level (starts at 1).
    - n_levels: int, maximum depth of recursion.
    Returns:
    - Dict[int, Tuple[pd.DataFrame, pd.DataFrame]], a dictionary where keys are the recursion
      levels and values are tuples containing the clusters DataFrame and summaries DataFrame at that level.
    """
    results = {}  # Dictionary to store results at each level

    # Perform embedding, clustering, and summarization for the current level
    df_clusters, df_summary = embed_cluster_summarize_texts(texts, level)

    # Store the results of the current level
    results[level] = (df_clusters, df_summary)

    # Determine if further recursion is possible and meaningful
    unique_clusters = df_summary["cluster"].nunique()
    if level < n_levels and unique_clusters > 1:
        # Use summaries as the input texts for the next level of recursion
        new_texts = df_summary["summaries"].tolist()
        next_level_results = recursive_embed_cluster_summarize(
            new_texts, level + 1, n_levels
        )

        # Merge the results from the next level into the current results dictionary
        results.update(next_level_results)
    # Dict[int, Tuple[pd.DataFrame, pd.DataFrame]] 返回的是一个字典的形式
    return results


def generate_response(prompt):
    torch.cuda.empty_cache()
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.3,
        top_p=0.7,
    )
    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer_start = response_text.find("Summary:\n")
    return response_text[answer_start + len("Summary:\n"):].strip() if answer_start != -1 else response_text

2024-11-30 14:00:30.893096: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Setup the variables

In [5]:
# text_path = '/home/jupyter/project/VP_storage/extracted_paragraph_text.pkl'
# table_path = '/home/jupyter/project/VP_storage/extracted_table_title.pkl' #set to "" if only text
# figure_path = '/home/jupyter/project/VP_storage/extracted_figure_title.pkl' #set to "" if only text


#text_path = '/home/jupyter/project/paper_text.pkl'
#text_path = '/home/jupyter/project/data_book_text_book_text.pkl'
#text_path = '/home/jupyter/project/data_system_guide_text_system_guide_text.pkl'
table_path = ''
figure_path = '/home/jupyter/project/lea_storage/book_image_caption_coordinated.pkl'


specify_index_title_to_process = "all" #give index number (example : [0 , 1]) or "all"

apply_chunking = False
chunk_size_tok = 2000 

collection_base_name = "radiology_book_figures"

apply_raptor = False #change to false if don't want to apply raptor

embedding_model_for_clustering = 'all-MiniLM-L6-v2' #using sentence transformer function
llm_model_for_summarizing_cluster = "meta-llama/Llama-3.2-1B-Instruct" #will be loaded with quantized framework, alternative : "aaditya/OpenBioLLM-Llama3-8B" or other methods
n_levels = 1 #number of levels for clustering

embedding_model_for_retrieval = 'sentence-transformers/all-MiniLM-L6-v2' #using huggingface embedding function -- to push to milvus


TOKEN = "xxxxxxxxxxxxxxxxxxxxxxxxxxxx"
CLUSTER_ENDPOINT = "https://xxxxxxxxxxxxxxxxxxxxxxxxxx.serverless.gcp-us-west1.cloud.zilliz.com"



## Setup the LLM models and Milvus Database

1. Embedding for clustering

In [6]:
from sentence_transformers import SentenceTransformer

embd = SentenceTransformer(embedding_model_for_clustering)

2. Model LLM for summarization

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

def load_llama_model_quantized(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    
    # Define quantization configuration for 8-bit or 4-bit
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,                     
        bnb_4bit_compute_dtype=torch.float16
    )
    
    # Load the model with the specified quantization configuration
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",                    # Automatically allocate layers to GPU/CPU
        torch_dtype=torch.float16,             # Use float16 for reduced memory usage on GPU
        quantization_config=quantization_config,  # Pass the quantization config here
        offload_folder="./VP_storage/offload/" # Folder for offloaded parts if necessary
    )
    
    return tokenizer, model

tokenizer, model = load_llama_model_quantized(llm_model_for_summarizing_cluster ) #change as needed

3. Embedding for retriever

In [8]:
from langchain.vectorstores import Milvus
from langchain.embeddings import HuggingFaceEmbeddings

#embed_model = HuggingFaceEmbeddings(model_name="allenai/biomed_roberta_base")
embed_model = HuggingFaceEmbeddings(model_name=embedding_model_for_retrieval)

  embed_model = HuggingFaceEmbeddings(model_name=embedding_model_for_retrieval)


4. Connect to Zillis Milvus

In [9]:
import os
from pymilvus import (connections, MilvusClient, utility)

connections.connect(
  alias='zillis',
  uri=CLUSTER_ENDPOINT,
  token=TOKEN,
)

## Run the RAG workflow

In [11]:
import pickle


#with open(text_path, 'rb') as f:
#    text = pickle.load(f)


# if table_path != "" :
#     with open(table_path, 'rb') as f:
#         table = pickle.load(f)

#if figure_path != "" :

with open(figure_path, 'rb') as f:
    figure = pickle.load(f)

In [15]:
# if (table_path != "") & (figure_path != "") :
#     df_combined = pd.concat([pd.DataFrame(text), pd.DataFrame(table), pd.DataFrame(figure)], axis=0, join='outer', ignore_index=True)
# elif (table_path == "") & (figure_path != "") :
#     df_combined = pd.concat([pd.DataFrame(text), pd.DataFrame(figure)], axis=0, join='outer', ignore_index=True)
# elif (table_path != "") & (figure_path == "") :
#     df_combined = pd.concat([pd.DataFrame(text), pd.DataFrame(table)], axis=0, join='outer', ignore_index=True)
# else:

df_combined = pd.DataFrame(figure)

list_all_title = df_combined['book_title'].unique()
    
if specify_index_title_to_process == "all" :    
    title_to_process = list_all_title
else : 
    title_to_process = list_all_title[specify_index_title_to_process]


In [19]:
#do in the loop

for title in title_to_process : 
    
    df_combined_subset = df_combined[df_combined['book_title']== title]

    texts = df_combined_subset['text'].tolist()

    concatenated_content = "\n\n\n --- \n\n\n".join(
        [text for text in texts]
    )
    print(
        "Num tokens in all context: %s"
        % num_tokens_from_string(concatenated_content, "cl100k_base")
    )
    
    

    #### 1. Chunking
    
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size_tok, chunk_overlap=0
    )
    texts_split = text_splitter.split_text(concatenated_content) 
    
    if apply_chunking is True :
        texts = texts_split
    
    
    #### 2. RAPTOR (only done if apply_raptor is True)
    if apply_raptor is True : 
        leaf_texts = texts
        import time
        start_time3 = time.time()
        results = recursive_embed_cluster_summarize(leaf_texts, level=1, n_levels=1)
        end_time3 = time.time()

        print(f"Time taken: {end_time3 - start_time3} seconds")
        
        initial_clustering = results[1][0].copy()
        initial_clustering['cluster'] = initial_clustering['cluster'].apply(lambda x: x[0])
        initial_clustering['level'] = 0.0

        # Start with the merged initial DataFrame
        dfs_to_concat = [df_combined_subset.merge(initial_clustering)] # NOTE!!! to replace with df_combined_subset

        # Loop through each level in `results` and collect summary DataFrames
        for level_key in sorted(results.keys()):
            # Check if `results[level_key]` has a second element (assumed to be summaries)
            if results[level_key][0]['text'][0]:  # Check if `results[level_key][1]` exists
                summaries_df = pd.DataFrame(results[level_key][1]).rename(columns={"summaries": "text"})
                dfs_to_concat.append(summaries_df)

        # Concatenate all DataFrames in the list
        final_df = pd.concat(dfs_to_concat, ignore_index=True)
        final_df['book_title'] = final_df['book_title'][0]
        final_df['embd'] = list(embed(final_df['text']))
        
        import pandas as pd

        # Check if 'sequence' column exists, if not, create it
        if 'sequence' not in final_df.columns:
            final_df['sequence'] = ''  

        if 'image_path' not in final_df.columns:
            final_df['image_path']=''
            
        if 'position' not in final_df.columns:
            final_df['position']=''
            
#         import re
#         collection_name = re.sub(r'[- ]', '_', final_df['book_title'][0]) + "_raptor"
        
        import uuid
        from langchain.schema import Document

        docs = []

        final_df["page_num"] = final_df["page_num"].fillna(-1)
        final_df["sequence"] = final_df["sequence"].fillna(-1)
        final_df["position"] = final_df["position"].fillna("")
        final_df["cluster"] = final_df["cluster"].astype("int64")


        for _, row in final_df.iterrows():

            metadata = {
                "uuid": str(uuid.uuid4()), 
                "book_title": row["book_title"],
                "page_num": row["page_num"],
                "cluster": row["cluster"],
                "level": row["level"],
                "sequence": row["sequence"],
                "image_path": row["image_path"]
            }
            doc = Document(page_content=row["text"], metadata=metadata)
            docs.append(doc)
            
        collection_name = collection_base_name + '_raptor'


   
            
    #### 2b. If not applying Raptor        
            
    else :
        final_df = df_combined_subset
        
        if 'sequence' not in final_df.columns:
            final_df['sequence'] = ''  

        if 'image_path' not in final_df.columns:
            final_df['image_path']=''
            
#         import re
#         collection_name = re.sub(r'[- ]', '_', final_df['book_title'][0])
        
        import uuid
        from langchain.schema import Document

        docs = []

        final_df["page_num"] = final_df["page_num"].fillna(-1)
        final_df["sequence"] = final_df["sequence"].fillna(-1)
        final_df["position"] = final_df["position"].fillna("")
        final_df["image_path"] = final_df["image_path"].fillna("")

        for _, row in final_df.iterrows():

            metadata = {
                "uuid": str(uuid.uuid4()), 
                "book_title": row["book_title"],
                "page_num": row["page_num"],
                "sequence": row["sequence"],
                "image_path": row["image_path"]
            }
            doc = Document(page_content=row["text"], metadata=metadata)
            docs.append(doc)
        
        collection_name = collection_base_name

    #### 3. Push to Milvus
    
    vectorstore = Milvus.from_documents(
        documents=docs,
        embedding=embed_model,
        connection_args={"uri": CLUSTER_ENDPOINT, "token":TOKEN},
        consistency_level="Strong",
        collection_name = collection_name,
        index_params={"metric_type": "COSINE", "index_type": "AUTOINDEX", "params": {}}
    )


Num tokens in all context: 59668


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['sequence'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df["page_num"] = final_df["page_num"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df["sequence"] = final_df["sequence"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Tr

Num tokens in all context: 139929


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['sequence'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df["page_num"] = final_df["page_num"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df["sequence"] = final_df["sequence"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Tr

Num tokens in all context: 57611


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['sequence'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df["page_num"] = final_df["page_num"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df["sequence"] = final_df["sequence"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Tr

Num tokens in all context: 76500


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['sequence'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df["page_num"] = final_df["page_num"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df["sequence"] = final_df["sequence"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Tr

In [20]:
final_df

Unnamed: 0,book_title,page_num,position,text,image_path,sequence
2756,Radiology Illustrated_ Hepatobiliary and Pancr...,6,"(842.519039577908, 860.0380791558159, 1517.716...",Fig. 1.1 Illustrations of normal segmental ana...,./Radiology Illustrated_ Hepatobiliary and Pan...,
2757,Radiology Illustrated_ Hepatobiliary and Pancr...,7,"(842.5167507595486, 765.6958685980903, 1517.71...",Fig. 1.2 Agenesis of the right lobe of the liv...,./Radiology Illustrated_ Hepatobiliary and Pan...,
2758,Radiology Illustrated_ Hepatobiliary and Pancr...,8,"(842.5182766384548, 1261.75537109375, 1517.713...",Fig. 1.3 Agenesis of the left lateral segment ...,./Radiology Illustrated_ Hepatobiliary and Pan...,
2759,Radiology Illustrated_ Hepatobiliary and Pancr...,9,"(842.5194634331597, 1576.3349745008682, 1517.7...",Fig. 1.4 Hypoplasia of the left lateral segmen...,./Radiology Illustrated_ Hepatobiliary and Pan...,
2760,Radiology Illustrated_ Hepatobiliary and Pancr...,10,"(842.518530951606, 766.8392605251736, 1517.774...",Fig. 1.5 Hypoplasia of the left medial segment...,./Radiology Illustrated_ Hepatobiliary and Pan...,
...,...,...,...,...,...,...
3261,Radiology Illustrated_ Hepatobiliary and Pancr...,811,"(1478.4449259440105, 99.48724110921223, 1511.8...",Fig. 26.2 Subcapsular hematoma in a 33-year-o...,,
3262,Radiology Illustrated_ Hepatobiliary and Pancr...,812,"(1300.9661356608074, 99.48724110921223, 1511.7...",Fig. 26.3 Splenic laceration in a 17-year-old ...,,
3263,Radiology Illustrated_ Hepatobiliary and Pancr...,813,"(1477.6989407009548, 99.48724110921223, 1511.8...",Fig. 26.4 Splenic infarction in a four-year-ol...,,
3264,Radiology Illustrated_ Hepatobiliary and Pancr...,815,"(843.4371948242188, 1655.9967041015625, 1517.7...",Fig. 26.6 Splenic rupture in a 35-year-old fem...,,
