In [None]:
%pip install llama-index datasets llama-index-callbacks-arize-phoenix llama-index-vector-stores-chroma llama-index-llms-huggingface-api -U -q

# source: https://huggingface.co/agents-course/notebooks/blob/main/unit2/llama-index/components.ipynb

Note: you may need to restart the kernel to use updated packages.


In [1]:
from huggingface_hub import login

# Show the Hugging Face login widget (the cell output is the rendered VBox).
# NOTE(review): presumably required so the Hub dataset download and the
# Inference API calls in later cells are authenticated — confirm.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
from datasets import load_dataset
from pathlib import Path

# Fetch the "train" split of the tiny personas dataset from the Hub.
dataset = load_dataset(path="dvilasuero/finepersonas-v0.1-tiny", split="train")

# Write each persona to its own file, data/persona_<i>.txt, so the directory
# can be loaded as a set of documents in the next cell.
data_dir = Path("data")
data_dir.mkdir(parents=True, exist_ok=True)
for i, persona in enumerate(dataset):
    # Explicit UTF-8: without it the platform's locale encoding is used, which
    # can raise UnicodeEncodeError (or produce files that are later mis-decoded)
    # on systems whose default encoding is not UTF-8.
    (data_dir / f"persona_{i}.txt").write_text(persona["persona"], encoding="utf-8")

README.md:   0%|          | 0.00/618 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/35.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [3]:
from llama_index.core import SimpleDirectoryReader

# Read everything under ./data (the persona_<i>.txt files written by the
# previous cell) into `documents`.
reader = SimpleDirectoryReader(input_dir="data")
documents = reader.load_data()
# Sanity check: the recorded output is 5000, matching the 5000 examples in the
# dataset's train split.
len(documents)

5000

In [5]:
from llama_index.embeddings.huggingface_api import HuggingFaceInferenceAPIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline

# create the pipeline with transformations
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(),
        HuggingFaceInferenceAPIEmbedding(model_name="BAAI/bge-small-en-v1.5"),
    ]
)

# run the pipeline sync or async
nodes = await pipeline.arun(documents=documents[:10])
nodes

[TextNode(id_='fb8cc2f8-c014-4c1a-bb98-dee60387e62e', embedding=[-0.046639133244752884, 0.0030708452686667442, 0.03519126772880554, 0.0233150627464056, -0.007796750403940678, -0.02046028897166252, 0.01795903593301773, 0.029326310381293297, -0.04112112894654274, -0.02644956484436989, -0.00405154237523675, -0.02313431352376938, 0.004979528021067381, 0.04973648488521576, -0.021360812708735466, 0.010843319818377495, -0.021283607929944992, 0.07682973146438599, -0.012062274850904942, 0.004425417631864548, 0.0020709980744868517, -0.031683631241321564, 0.09023573249578476, 0.0030828823801130056, -0.010822092182934284, -0.002695392817258835, -0.005925716366618872, -0.002583735156804323, 0.000560278189368546, -0.12084489315748215, -0.04787132889032364, -0.009383059106767178, -0.011417128145694733, 0.005134046543389559, 0.05047130584716797, 0.03615850210189819, -0.0014992408687248826, 0.01519988663494587, -0.00712330499663949, 0.04721373692154884, 0.04252885654568672, 0.010966680012643337, -0.019

In [6]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore

db = chromadb.PersistentClient(path="./alfred_chroma_db")
chroma_collection = db.get_or_create_collection(name="alfred")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(),
        HuggingFaceInferenceAPIEmbedding(model_name="BAAI/bge-small-en-v1.5"),
    ],
    vector_store=vector_store,
)

nodes = await pipeline.arun(documents=documents[:10])
len(nodes)

10

In [7]:
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface_api import HuggingFaceInferenceAPIEmbedding

# Build an index on top of the Chroma-backed `vector_store` created earlier.
# The embedding model name is the same one used in the ingestion pipelines.
# NOTE(review): presumably this keeps query embeddings comparable with the
# stored node embeddings — confirm.
embed_model = HuggingFaceInferenceAPIEmbedding(model_name="BAAI/bge-small-en-v1.5")
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store, embed_model=embed_model
)

In [8]:
import nest_asyncio
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI

# Needed so the query engine can run inside the notebook.
nest_asyncio.apply()

# The LLM is served through the Hugging Face Inference API. The query engine
# is created from the index with the "tree_summarize" response mode.
llm = HuggingFaceInferenceAPI(model_name="Qwen/Qwen2.5-Coder-32B-Instruct")
query_engine = index.as_query_engine(llm=llm, response_mode="tree_summarize")

persona_question = "Respond using a persona that describes author and travel experiences?"
response = query_engine.query(persona_question)
response

Response(response='An individual deeply immersed in the study of Cypriot culture, history, and society, this persona has dedicated significant time to research and living in Cyprus. Through extensive fieldwork and personal experience, they have gained a profound understanding of the local customs, traditions, and the way of life of the Cypriot people.', source_nodes=[NodeWithScore(node=TextNode(id_='abb72383-5fb4-44e3-830a-c8ab62258eef', embedding=None, metadata={'file_path': '/Users/kenneth.hamilton/Desktop/agents/llama-index/data/persona_1.txt', 'file_name': 'persona_1.txt', 'file_type': 'text/plain', 'file_size': 266, 'creation_date': '2025-03-19', 'last_modified_date': '2025-03-19'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1

In [9]:
from llama_index.core.evaluation import FaithfulnessEvaluator

# Evaluate the `response` produced by the previous query cell, using the same
# `llm` as the judge. Nothing is queried here; only the existing response is
# scored.
evaluator = FaithfulnessEvaluator(llm=llm)
eval_result = evaluator.evaluate_response(response=response)
# `passing` is the evaluator's verdict; the recorded output for this run is False.
eval_result.passing

False

In [10]:
import os

# Import the subpackage explicitly so `llama_index.core` resolves even when no
# earlier cell has imported it (a bare `import llama_index` does not guarantee
# the `core` attribute exists).
import llama_index.core

# Prefer a key supplied through the PHOENIX_API_KEY environment variable so the
# secret never has to be typed into the notebook. The original placeholder is
# kept as the fallback for backward compatibility.
PHOENIX_API_KEY = os.environ.get("PHOENIX_API_KEY", "<PHOENIX_API_KEY>")
os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"api_key={PHOENIX_API_KEY}"

# Register the Arize Phoenix handler globally, sending traces to LlamaTrace.
# This needs the `arize-phoenix` package: the saved output of this cell shows
# the ImportError raised when it is missing.
llama_index.core.set_global_handler(
    "arize_phoenix", endpoint="https://llamatrace.com/v1/traces"
)

ImportError: Please install Arize Phoenix with `pip install -q arize-phoenix`

In [11]:
# Second query against the same engine; `response` is rebound to the new answer.
ai_question = "What is the name of the someone that is interested in AI and techhnology?"
response = query_engine.query(ai_question)
response

Response(response='The provided information does not mention anyone interested in AI and technology. The details given are about an anthropologist or cultural expert focusing on Cypriot culture, and a pulmonologist or respiratory specialist with an interest in educating patients about respiratory health.', source_nodes=[NodeWithScore(node=TextNode(id_='abb72383-5fb4-44e3-830a-c8ab62258eef', embedding=None, metadata={'file_path': '/Users/kenneth.hamilton/Desktop/agents/llama-index/data/persona_1.txt', 'file_name': 'persona_1.txt', 'file_type': 'text/plain', 'file_size': 266, 'creation_date': '2025-03-19', 'last_modified_date': '2025-03-19'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='78415717-57d5-4bf5-