In [1]:
import logging
from typing import Any, Dict, List, Optional

from llama_index.core.callbacks.base_handler import BaseCallbackHandler
from llama_index.core.callbacks.schema import CBEventType, EventPayload

from aim import Run, Text  # type: ignore[attr-defined]

logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)


class AimCallback1(BaseCallbackHandler):
    """
    Callback handler that records LlamaIndex LLM and chunking events in an Aim run.

    Args:
        repo (:obj:`str`, optional):
            Aim repository path or Repo object to which Run object is bound.
            If skipped, default Repo is used.
        experiment_name (:obj:`str`, optional):
            Sets Run's `experiment` property. 'default' if not specified.
            Can be used later to query runs/sequences.
        system_tracking_interval (:obj:`int`, optional):
            Sets the tracking interval in seconds for system usage
            metrics (CPU, Memory, etc.). Set to `None` to disable
            system metrics tracking.
        log_system_params (:obj:`bool`, optional):
            Enable/Disable logging of system params such as installed packages,
            git info, environment variables, etc.
        capture_terminal_logs (:obj:`bool`, optional):
            Enable/Disable terminal stdout logging.
        event_starts_to_ignore (Optional[List[CBEventType]]):
            list of event types to ignore when tracking event starts.
        event_ends_to_ignore (Optional[List[CBEventType]]):
            list of event types to ignore when tracking event ends.
        run_params (Optional[Dict[str, Any]]):
            key/value pairs written onto the Run (via ``Run.set``) during setup.
    """

    def __init__(
        self,
        repo: Optional[str] = None,
        experiment_name: Optional[str] = None,
        system_tracking_interval: Optional[int] = 1,
        log_system_params: Optional[bool] = True,
        capture_terminal_logs: Optional[bool] = True,
        event_starts_to_ignore: Optional[List[CBEventType]] = None,
        event_ends_to_ignore: Optional[List[CBEventType]] = None,
        run_params: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Store the configuration and immediately create the Aim run.

        Raises:
            ModuleNotFoundError: if ``Run`` is ``None`` (aim not available).
        """
        if Run is None:
            raise ModuleNotFoundError(
                "Please install aim to use the AimCallback: 'pip install aim'"
            )

        super().__init__(
            event_starts_to_ignore=event_starts_to_ignore or [],
            event_ends_to_ignore=event_ends_to_ignore or [],
        )

        self.repo = repo
        # Fixed: this used to be assigned to an unrelated global (`llm`), which
        # left `self.experiment_name` undefined when `setup()` read it.
        self.experiment_name = experiment_name
        self.system_tracking_interval = system_tracking_interval
        self.log_system_params = log_system_params
        self.capture_terminal_logs = capture_terminal_logs
        self._run: Optional[Any] = None
        self._run_hash = None

        # Shared step counter: incremented once per tracked LLM event and also
        # used as the step for chunk tracking.
        self._llm_response_step = 0

        self.setup(run_params)

    def on_event_start(
        self,
        event_type: CBEventType,
        payload: Optional[Dict[str, Any]] = None,
        event_id: str = "",
        parent_id: str = "",
        **kwargs: Any,
    ) -> str:
        """
        No-op: nothing is tracked when an event starts.

        Args:
            event_type (CBEventType): event type to store.
            payload (Optional[Dict[str, Any]]): payload to store.
            event_id (str): event id to store.
            parent_id (str): parent event id.

        Returns:
            str: always the empty string.
        """
        return ""

    def on_event_end(
        self,
        event_type: CBEventType,
        payload: Optional[Dict[str, Any]] = None,
        event_id: str = "",
        **kwargs: Any,
    ) -> None:
        """
        Track the prompt/response of LLM events and the chunks of chunking events.

        Args:
            event_type (CBEventType): event type to store.
            payload (Optional[Dict[str, Any]]): payload to store.
            event_id (str): event id to store.

        Raises:
            ValueError: if no Aim run is available.
        """
        if not self._run:
            raise ValueError("AimCallback failed to init properly.")

        # Debug trace goes through the module logger instead of stdout.
        logger.debug("aim end event %s, payload: %s", event_type, payload)

        if event_type is CBEventType.LLM and payload:
            if EventPayload.PROMPT in payload:
                # Completion-style call: a single prompt and its completion.
                llm_input = str(payload[EventPayload.PROMPT])
                llm_output = str(payload.get(EventPayload.COMPLETION, ""))
            else:
                # Chat-style call: join the messages into one text block.
                messages = payload.get(EventPayload.MESSAGES, [])
                llm_input = "\n".join(str(message) for message in messages)
                llm_output = str(payload.get(EventPayload.RESPONSE, ""))

            for name, text in (("prompt", llm_input), ("response", llm_output)):
                self._run.track(
                    Text(text),
                    name=name,
                    step=self._llm_response_step,
                    context={"event_id": event_id},
                )

            self._llm_response_step += 1
        elif event_type is CBEventType.CHUNKING and payload:
            chunks = payload.get(EventPayload.CHUNKS, [])
            for chunk_id, chunk in enumerate(chunks):
                self._run.track(
                    Text(chunk),
                    name="chunk",
                    step=self._llm_response_step,
                    context={"chunk_id": chunk_id, "event_id": event_id},
                )

    @property
    def experiment(self) -> Run:
        """Return the Aim run, creating it through ``setup()`` if needed."""
        if not self._run:
            self.setup()
        return self._run

    def setup(self, args: Optional[Dict[str, Any]] = None) -> None:
        """Create (or re-open) the Aim run and optionally store parameters on it.

        If a run hash was recorded by an earlier call, the run is re-opened with
        that hash; otherwise a new run is created under ``experiment_name`` and
        its hash is remembered.

        Args:
            args (Optional[Dict[str, Any]]): key/value pairs to set on the run.
                Failures while setting them are logged as warnings, not raised.
        """
        if not self._run:
            if self._run_hash:
                self._run = Run(
                    self._run_hash,
                    repo=self.repo,
                    system_tracking_interval=self.system_tracking_interval,
                    log_system_params=self.log_system_params,
                    capture_terminal_logs=self.capture_terminal_logs,
                )
            else:
                self._run = Run(
                    repo=self.repo,
                    experiment=self.experiment_name,
                    system_tracking_interval=self.system_tracking_interval,
                    log_system_params=self.log_system_params,
                    capture_terminal_logs=self.capture_terminal_logs,
                )
                self._run_hash = self._run.hash

        # Log config parameters
        if args:
            try:
                for key, value in args.items():
                    self._run.set(key, value, strict=False)
            except Exception as e:
                logger.warning(f"Aim could not log config parameters -> {e}")

    def __del__(self) -> None:
        """Close the run if it is still active when the handler is collected."""
        # getattr: `_run` does not exist if __init__ raised before assigning it.
        run = getattr(self, "_run", None)
        if run and run.active:
            run.close()

    def start_trace(self, trace_id: Optional[str] = None) -> None:
        """No-op: traces are not tracked."""
        pass

    def end_trace(
        self,
        trace_id: Optional[str] = None,
        trace_map: Optional[Dict[str, List[str]]] = None,
    ) -> None:
        """No-op: traces are not tracked."""
        pass


In [1]:
import sys,os
# Point the Hugging Face cache environment variables at local directories.
# NOTE(review): these are machine-specific absolute paths; adjust them for your
# environment. Presumably they must be set before the Hugging Face libraries are
# imported below — confirm.
os.environ["HF_HOME"] = "/home/jeffye/.cache/huggingface_speed"
os.environ["HF_HUB_CACHE"] = "/home/jeffye/.cache/huggingface_hub"


from llama_index.core import Settings
from llama_index.llms.huggingface import HuggingFaceLLM

from llama_index.core.prompts.base import PromptTemplate
import torch
import logging
import sys
from functools import partial

# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# os.environ["MODELSCOPE_CACHE"] = "D:/dataset/cache/modelscope/"
# from llama_index.llms.modelscope import ModelScopeLLM
# from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from llama_index.core.callbacks import (
    CallbackManager,
    LlamaDebugHandler,
    CBEventType,
)

# for faster loading the model


from llama_index.callbacks.aim import AimCallback

# Build the Aim callback (Aim repo in the current directory) and wrap it in a
# CallbackManager that is handed to the LLM and the index below.
# Note: this uses the packaged AimCallback, not the AimCallback1 class defined
# in the first cell.
aim_callback = AimCallback(repo="./")
callback_manager = CallbackManager([aim_callback])


# llama_debug = LlamaDebugHandler(print_trace_on_end=True)
# callback_manager = CallbackManager([llama_debug])


# StableLM-style system prompt. It is currently unused in this cell: the
# `system_prompt=` argument of the HuggingFaceLLM call below is commented out.
system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""
# system_prompt = "you are a very helpful assistant";

# This will wrap the default prompts that are internal to llama-index
# query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")
# query_wrapper_prompt = PromptTemplate("{query_str}\n<|im_start|>assistant")
# <|im_start|>assistant
import torch


# Load Qwen-7B-Chat through LlamaIndex's HuggingFaceLLM wrapper, as a chat model,
# with the Aim callback manager attached.
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    # Sampling with a low temperature.
    generate_kwargs={"temperature": 0.2, "do_sample": True},
    # system_prompt=system_prompt,
    # query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="Qwen/Qwen-7B-Chat",
    model_name="Qwen/Qwen-7B-Chat",
    device_map="auto",
    # stopping_ids=[50278, 50279, 50277, 1, 0],
    # stopping_ids=[151645, 151644][:1],
    # NOTE(review): "add_generation_prompt" here does not appear to take effect
    # (hence the workaround after this call) — confirm whether it can be dropped.
    tokenizer_kwargs={"max_length": 4096, "trust_remote_code": True, "add_generation_prompt": True},
    # Half precision for the model weights; trust_remote_code is passed for both
    # the tokenizer and the model.
    model_kwargs={"torch_dtype": torch.float16, "trust_remote_code": True},
    callback_manager=callback_manager,
    is_chat_model=True,
)
# Workaround: wrap the tokenizer's apply_chat_template with functools.partial so
# every call gets add_generation_prompt=True. This patches the private
# `_tokenizer` attribute; see the "LLM template" troubleshooting notes at the end
# of the notebook for why it is needed.
llm._tokenizer.apply_chat_template = partial(llm._tokenizer.apply_chat_template, add_generation_prompt=True)

The model is automatically converting to bf16 for faster inference. If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to "AutoModelForCausalLM.from_pretrained".
Try importing flash-attention for faster inference...


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [10]:
# Use Message request
from llama_index.core.base.llms.types import MessageRole, ChatMessage

# Minimal chat request: one system message plus a one-word user message.
messages = [
    ChatMessage(
        role=MessageRole.SYSTEM, content="You are a helpful assistant."
    ),
    ChatMessage(role=MessageRole.USER, content="hi"),
]
# max_length is forwarded to llm.chat as an extra keyword argument.
resp = llm.chat(messages, max_length=4000)
print(resp)
# llm._tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)


'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nhi<|im_end|>\n'

In [11]:
from llama_index.core import SummaryIndex
from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Register the Qwen LLM as the global default for LlamaIndex.
Settings.llm = llm
# Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-m3")

# Load the bge-m3 embedding model from a local snapshot directory.
# NOTE(review): machine-specific absolute path; the commented-out line above
# uses the hub name "BAAI/bge-m3" instead.
Settings.embed_model = HuggingFaceEmbedding(model_name="/mnt/d/dataset/cache/huggingface/hub/models--BAAI--bge-m3/snapshots/c400e3d69da76ed1965ea6ed67c83af69c3ff3a4")
# Read every file found under ./data/.
docs = SimpleDirectoryReader("./data/").load_data()

# aim_callback = AimCallback(repo="./")
# callback_manager = CallbackManager([aim_callback])
# Build a SummaryIndex over the documents, reusing the callback manager created
# in the LLM cell.
index = SummaryIndex.from_documents(docs, callback_manager=callback_manager)
query_engine = index.as_query_engine()

response = query_engine.query("where did the author do growing up?")
print(response)

In [14]:
print(response)

The article discusses the founder's journey from studying philosophy in college to founding Viaweb, a company that revolutionized the way software was developed and distributed. The founder was inspired by a novel by Robert A. Heinlein and a PBS documentary featuring Terry Winograd's work on the SHRDLU computer program, leading him to become fascinated with artificial intelligence (AI) and begin experimenting with creating a web-based store builder. The founder eventually founded Y Combinator, a startup accelerator that funds and supports early-stage technology companies.



In [15]:
# Second query against the same query engine; rebinds `response`.
response = query_engine.query("What I Worked On in February 2021")
print(response)

In February 2021, I focused on developing my language skills and expanding my knowledge base. I continued to learn from user interactions and feedback, and worked on improving my ability to generate coherent and relevant responses to user queries. Additionally, I spent time exploring new areas of interest and learning about emerging technologies. Overall, it was a productive month, and I am excited to continue growing and evolving in the future.



In [7]:
# Grab the Aim run held by the callback so values can be tracked manually.
# NOTE(review): this reaches into the private `_run` attribute; AimCallback1 in
# the first cell exposes the same object through its `experiment` property —
# confirm the packaged AimCallback does too and prefer that.
run = aim_callback._run


# Store example hyperparameters on the run under the 'hparams' key.
run['hparams'] = {
    'learning_rate': 0.001,
    'batch_size': 32,
}

# Track the integers 0..9 as a sequence named 'numbers'.
for i in range(10):
    run.track(i, name='numbers')

In [8]:
import random
import string
from aim import Run, Text

# Track 100 random 20-character strings (uppercase letters and digits) as Aim
# Text objects under the name 'mytext1', one per step.
alphabet = string.ascii_uppercase + string.digits
for step in range(100):
    sample = ''.join(random.choices(alphabet, k=20))
    run.track(Text(sample), name='mytext1', step=step)

In [None]:
# using huggingface llm: https://docs.llamaindex.ai/en/stable/module_guides/models/llms/usage_custom.html
# This cell uses the legacy (pre-`llama_index.core`) import paths.
import torch
# ServiceContext was used below without ever being imported (NameError).
from llama_index import ServiceContext
from llama_index.llms import HuggingFaceLLM
from llama_index.prompts import PromptTemplate

system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""

# This will wrap the default prompts that are internal to llama-index
query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")

# Note: this rebinds the notebook-level name `llm`, which the Qwen cell earlier
# in the notebook also assigns.
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.7, "do_sample": True},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="StabilityAI/stablelm-tuned-alpha-3b",
    model_name="StabilityAI/stablelm-tuned-alpha-3b",
    # model_name="Qwen-7B-Chat"
    device_map="auto",
    stopping_ids=[50278, 50279, 50277, 1, 0],
    tokenizer_kwargs={"max_length": 4096},
    # uncomment this if using CUDA to reduce memory usage
    model_kwargs={"torch_dtype": torch.float16}
)
# Bundle the LLM and the chunk size for the index-building cells below.
service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    llm=llm,
)

In [None]:
# using huggingface llm: https://docs.llamaindex.ai/en/stable/module_guides/models/llms/usage_custom.html
# NOTE(review): this cell repeats the preceding StableLM cell; consider
# deleting one of the two.
# This cell uses the legacy (pre-`llama_index.core`) import paths.
import torch
# ServiceContext was used below without ever being imported (NameError).
from llama_index import ServiceContext
from llama_index.llms import HuggingFaceLLM
from llama_index.prompts import PromptTemplate

system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""

# This will wrap the default prompts that are internal to llama-index
query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")

# Note: this rebinds the notebook-level name `llm`, which the Qwen cell earlier
# in the notebook also assigns.
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.7, "do_sample": True},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="StabilityAI/stablelm-tuned-alpha-3b",
    model_name="StabilityAI/stablelm-tuned-alpha-3b",
    # model_name="Qwen-7B-Chat"
    device_map="auto",
    stopping_ids=[50278, 50279, 50277, 1, 0],
    tokenizer_kwargs={"max_length": 4096},
    # uncomment this if using CUDA to reduce memory usage
    model_kwargs={"torch_dtype": torch.float16}
)
# Bundle the LLM and the chunk size for the index-building cells below.
service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    llm=llm,
)

In [None]:
# complete = llm.complete('你是谁')
# complete
from llama_index.llms.base import ChatMessage

# Build a system + user message pair from plain dicts and send it to the LLM.
messages=[
    ChatMessage(**{"role": "system", "content": "You are a helpful assistant, who can answer all questions in english."}),
    ChatMessage(**{"role": "user", "content": "how do you think of China?"})
  ]

complete = llm.chat(messages=messages)
# Bare expression: displayed as the cell output.
complete

In [None]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader

# `documents` was never defined anywhere in the notebook, so this cell failed
# with a NameError on a fresh kernel. Load it here with the reader that was
# already imported.
# NOTE(review): "./data/" is the directory the other cells read from — confirm
# it is the intended source for this index.
documents = SimpleDirectoryReader("./data/").load_data()

# Build a vector index over the documents using the legacy service context
# (StableLM LLM, chunk_size=1024) defined in the cells above.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

In [None]:
# Write the index's storage context to the local "storage" directory.
index.storage_context.persist(persist_dir="storage")

In [None]:
# Query the index, using a verbatim passage from the source text as the query.
from llama_index.query_engine.retriever_query_engine import RetrieverQueryEngine

# The annotation is only a hint for the editor; the `type: ignore` silences the
# checker about the declared type of `as_query_engine()`.
query_engine: RetrieverQueryEngine = index.as_query_engine() # type: ignore
response = query_engine.query("Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.")
# Bare expression: displayed as the cell output.
response

In [None]:
from llama_index.response.schema import Response


# Ask a natural-language question; the recorded output for this cell was
# "Empty Response".
response: Response = query_engine.query("What did the author do growing up?") # type: ignore
print(response)

Empty Response


In [None]:
response

In [None]:
response.source_nodes[1].text


# tutorials/resources

## supported llm frameworks

* https://docs.llamaindex.ai/en/stable/module_guides/models/llms/modules.html#openllm

# SimpleDirectoryReader

## read a png file

In [1]:
from llama_index.core import SimpleDirectoryReader

# SimpleDirectoryReader(input_files=["path/to/file1", "path/to/file2"])
# Only files with these extensions are requested from ./data.
required_exts = [".png", ".txt"]
# docs = SimpleDirectoryReader("data").load_data()
reader = SimpleDirectoryReader(
    input_dir="./data",
    required_exts=required_exts,
    # recursive=True,
)

# `docs` is reused later by the multimodal index cell.
docs = reader.load_data()
print(f"Loaded {len(docs)} docs")

Loaded 2 docs


In [2]:
from llama_index.core import Settings
Settings

_Settings(_llm=None, _embed_model=None, _callback_manager=None, _tokenizer=None, _node_parser=None, _prompt_helper=None, _transformations=None)

In [3]:
# docs
import sys, os

# os.environ["HTTP_PROXY"] = "http://192.168.1.45:10809"
# os.environ["HTTPS_PROXY"] = "http://192.168.1.45:10809"
# os.environ["HF_ENDPOINT"] = ""

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# Use BAAI/bge-m3 as the global embedding model (alternative kept for reference):
# Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-m3")

## HuggingFaceEmbedding compute similarity

* This gives the same result as using FlagEmbedding directly:

```python
from FlagEmbedding import BGEM3FlagModel

model = BGEM3FlagModel('BAAI/bge-m3',  
                       use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation

sentences_1 = ["What is BGE M3?", "Defination of BM25"]
sentences_2 = ["BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.", 
               "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"]

embeddings_1 = model.encode(sentences_1, 
                            batch_size=12, 
                            max_length=8192, # If you don't need such a long length, you can set a smaller value to speed up the encoding process.
                            )['dense_vecs']
embeddings_2 = model.encode(sentences_2)['dense_vecs']
similarity = embeddings_1 @ embeddings_2.T
print(similarity)
# [[0.6265, 0.3477], [0.3499, 0.678 ]]
```

In [4]:
# Smoke test: embed a single string with the configured embedding model.
emb = Settings.embed_model.get_text_embedding('hello world')

In [5]:
# Same sentences as the FlagEmbedding example in the markdown cell above, so the
# similarity scores can be compared with the values quoted there.
sentences_1 = ["What is BGE M3?", "Defination of BM25"]
sentences_2 = ["BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.", 
               "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"]

# One embedding per sentence, computed through the LlamaIndex embedding wrapper.
embeddings_1 = Settings.embed_model.get_text_embedding_batch(sentences_1)
embeddings_2 = Settings.embed_model.get_text_embedding_batch(sentences_2)

In [6]:
# Print the similarity of every (sentences_1, sentences_2) pair, row by row.
for e1 in embeddings_1:
    for e2 in embeddings_2:
        print(Settings.embed_model.similarity(e1, e2))

0.6259036830688345
0.3474958518731192
0.34986774819095623
0.6782461892912981


In [7]:
import qdrant_client
from llama_index.core import SimpleDirectoryReader
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core.indices import MultiModalVectorStoreIndex

# Create a local Qdrant vector store (client backed by the "qdrant_db" path)
client = qdrant_client.QdrantClient(path="qdrant_db")

# Separate collections for text and image vectors, sharing the same client.
text_store = QdrantVectorStore(
    client=client, collection_name="text_collection"
)
image_store = QdrantVectorStore(
    client=client, collection_name="image_collection"
)
storage_context = StorageContext.from_defaults(
    vector_store=text_store, image_store=image_store
)

# Create the MultiModal index from `docs`, which is loaded by the
# SimpleDirectoryReader cell earlier in this section.
# documents = SimpleDirectoryReader("./data_wiki/").load_data()
index = MultiModalVectorStoreIndex.from_documents(
    docs,
    storage_context=storage_context,
)

In [None]:
from llama_index.core.response.notebook_utils import display_source_node
from llama_index.core.schema import ImageNode

from PIL import Image
import matplotlib.pyplot as plt
import os


def plot_images(image_paths):
    """Show up to nine of the given images in a grid of three columns.

    Paths that do not point to an existing file are skipped.

    Args:
        image_paths: iterable of image file paths.
    """
    max_images = 9
    n_cols = 3

    # Resolve the images to draw first so the grid can be sized to fit them.
    existing_paths = [path for path in image_paths if os.path.isfile(path)]
    existing_paths = existing_paths[:max_images]

    # Keep the original 2x3 layout for up to six images. Seven or more need a
    # third row: the previous fixed 2x3 grid made plt.subplot raise ValueError
    # on the 7th image, although up to nine were allowed.
    n_rows = 2 if len(existing_paths) <= 2 * n_cols else 3

    plt.figure(figsize=(16, 9))
    for position, img_path in enumerate(existing_paths, start=1):
        image = Image.open(img_path)

        plt.subplot(n_rows, n_cols, position)
        plt.imshow(image)
        plt.xticks([])
        plt.yticks([])

# Candidate queries: only the last assignment takes effect; the first two are
# overwritten and kept as alternatives to try.
test_query = "chinese girl"
test_query = "because YC was doing great. But if there was one thing rarer than Rtm offering advice"

test_query = "In the summer of 2006, Robert and I started working on a new version of Arc. This one was reasonably fast, because it was compiled into Scheme. To test this new Arc, I wrote Hacker News in it. It was originally meant to be a news aggregator for startup founders and was called Startup News, but after a few months I got tired of reading about nothing but startups. Plus it wasn't startup founders we wanted to reach."
# Generate retrieval results, requesting 3 text matches and 5 image matches.
retriever = index.as_retriever(similarity_top_k=3, image_similarity_top_k=5)
retrieval_results = retriever.retrieve(test_query)

# Split the results: image nodes are printed and their file paths collected for
# plotting; every other node is rendered with display_source_node.
retrieved_image = []
for res_node in retrieval_results:
    if isinstance(res_node.node, ImageNode):
        retrieved_image.append(res_node.node.metadata["file_path"])
        print(res_node)
    else:
        display_source_node(res_node, source_length=500)
        pass

plot_images(retrieved_image)

In [None]:
from sentence_transformers import SentenceTransformer, util
from PIL import Image

# Load the CLIP model through sentence-transformers
model = SentenceTransformer('clip-ViT-B-32')

# Encode one image from the data directory
img_emb = model.encode(Image.open('data/3.png'))

# Encode three candidate text descriptions
text_emb = model.encode(['景甜', 'a nice woman', 'A picture of a girl'])

# Cosine similarity between the image and each description (one row, three columns)
cos_scores = util.cos_sim(img_emb, text_emb)
print(cos_scores)

Unused or unrecognized kwargs: padding.


tensor([[0.2178, 0.2418, 0.2357]])


In [None]:
# Encode three text descriptions
text_emb = model.encode(['shit', 'a nice woman', 'A picture of a girl'])

# NOTE(review): img_emb is reassigned to the embedding of the string '-' but is
# not used below.
img_emb = model.encode('-')
# Cosine similarity of the text embeddings against themselves (text vs. text,
# not image vs. text), giving a symmetric 3x3 matrix.
cos_scores = util.cos_sim(text_emb, text_emb)
print(cos_scores)

tensor([[1.0000, 0.8843, 0.8626],
        [0.8843, 1.0000, 0.9131],
        [0.8626, 0.9131, 1.0000]])


In [None]:
text_emb.shape

(3, 512)

# environment variables

## model directory

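* `LLAMA_INDEX_CACHE_DIR`: set this environment variable to control where these files are saved (on this machine the directory is `C:\Users\73915\AppData\Local\llama_index`).
* The directory in use can be obtained with `from llama_index.core.utils import get_cache_dir`.
* 问题：Hugging Face 模型会在该目录下单独存一份，不会使用 Hugging Face 的缓存。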

# Troubleshooting

## stopping_ids 如何找

* 如Qwen: 1.  ~/.cache/huggingface/hub/models--Qwen--Qwen-7B-Chat 找到modeling_qwen.py
* modeling_qwen.py 中get_stop_words_ids 方法中可以看到stop_words_ids = [[tokenizer.im_end_id], [tokenizer.im_start_id]]

## LLM template 相关

### template huggingface 介绍

* https://huggingface.co/docs/transformers/main/chat_templating#templates-for-chat-models


### Qwen 模型在 LlamaIndex 的 HuggingFaceLLM 中无法生成的原因

* HuggingFaceLLM 中的 bug：
  1. `chat` 方法中 `formatted=True` 被写死，导致 `query_wrapper_prompt` 参数设置无效：

     ```python
     def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
         prompt = self.messages_to_prompt(messages)
         completion_response = self.complete(prompt, formatted=True, **kwargs)
         return completion_response_to_chat_response(completion_response)
     ```
  2. `messages_to_prompt` 默认为 `_tokenizer_messages_to_prompt`，该方法使用 tokenizer 的 `apply_chat_template`，无法设置 `add_generation_prompt` 参数（此处应该由参数 `is_chat_model` 控制）。
* 解决方案：
  * `llm._tokenizer.apply_chat_template = partial(llm._tokenizer.apply_chat_template, add_generation_prompt=True)`
  * 但是该方法也把调用写死了。

In [None]:
# 待解决问题