In [1]:
from llama_index.core.llama_dataset import (
    LabelledRagDataExample,
    CreatedByType,
    CreatedBy,
)

# Build one LabelledRagDataExample by hand.
# Each text field is paired with a CreatedBy record saying who produced it:
# the query is attributed to an AI model ("gpt-4"), the reference answer to a human.
query = "This is a test query, is it not?"
query_by = CreatedBy(type=CreatedByType.AI, model_name="gpt-4")
reference_answer = "Yes it is."
reference_answer_by = CreatedBy(type=CreatedByType.HUMAN)
reference_contexts = ["This is a sample context"]

rag_example = LabelledRagDataExample(
    query=query,
    query_by=query_by,
    reference_contexts=reference_contexts,
    reference_answer=reference_answer,
    reference_answer_by=reference_answer_by,
)


In [2]:
# Dump the example as a JSON string to inspect its fields, including the
# nested `query_by` / `reference_answer_by` attribution records.
print(rag_example.json())

{"query": "This is a test query, is it not?", "query_by": {"model_name": "gpt-4", "type": "ai"}, "reference_contexts": ["This is a sample context"], "reference_answer": "Yes it is.", "reference_answer_by": {"model_name": "", "type": "human"}}


In [3]:
# Second example: only the text fields change. The attribution objects
# (`query_by`, `reference_answer_by`) are reused from the first example.
reference_contexts = ["This is a second sample context"]
reference_answer = "I think yes, it is."
query = "This is a test query, is it so?"

rag_example_2 = LabelledRagDataExample(
    query=query,
    reference_answer=reference_answer,
    reference_contexts=reference_contexts,
    query_by=query_by,
    reference_answer_by=reference_answer_by,
)

In [4]:
from llama_index.core.llama_dataset import LabelledRagDataset

# Collect the two hand-built examples into a single dataset object.
rag_dataset = LabelledRagDataset(
    examples=[
        rag_example,
        rag_example_2,
    ]
)

In [5]:
# Show the dataset as a pandas DataFrame: one row per example, with the
# creator of the query and of the reference answer in their own columns.
rag_dataset.to_pandas()

Unnamed: 0,query,reference_contexts,reference_answer,reference_answer_by,query_by
0,"This is a test query, is it not?",[This is a sample context],Yes it is.,human,ai (gpt-4)
1,"This is a test query, is it so?",[This is a second sample context],"I think yes, it is.",human,ai (gpt-4)


In [6]:
import nest_asyncio

# Patch asyncio so an already-running event loop can be re-entered.
# Presumably required because the notebook kernel runs its own loop and later
# cells drive async code from inside it — TODO confirm.
nest_asyncio.apply()

In [9]:
# Configure the global embedding model and LLM on `Settings`.
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama

# Embeddings: the BGE base English v1.5 checkpoint, loaded through HuggingFace.
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-base-en-v1.5",
)

# LLM: llama3 through Ollama, with a generous request timeout
# (presumably to tolerate slow generations — confirm the unit in the Ollama docs).
Settings.llm = Ollama(
    request_timeout=360.0,
    model="llama3",
)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# Load Wikipedia "History of <city>" pages and build a vector index over them.
from llama_index.readers.wikipedia import WikipediaReader
from llama_index.core import VectorStoreIndex

cities = ["San Francisco"]

# One page title is requested per entry in `cities`.
documents = WikipediaReader().load_data(
    pages=[f"History of {city}" for city in cities]
)
index = VectorStoreIndex.from_documents(documents)

In [11]:
# Smoke-test the index with a default query engine.
# NOTE: only the "History of ..." page is indexed, and in the recorded run the
# model answers that the provided context does not state the population.
query_engine = index.as_query_engine()
response = query_engine.query("What's the population of San Francisco?")
print(response)

I'm happy to help! However, I don't see a specific mention of the current population of San Francisco in the provided context. The text does mention that the city is one of America's most expensive places to live, but it doesn't provide a numerical value for the population. If you're looking for an estimate or recent data on the population of San Francisco, I'd be happy to help you find that information!


In [14]:
# Generate questions (and reference answers) from the document chunks.
from llama_index.core.llama_dataset.generator import RagDatasetGenerator

# Build a RagDatasetGenerator directly from the loaded documents, passing the
# LLM already configured on `Settings` explicitly.
dataset_generator = RagDatasetGenerator.from_documents(
    documents,
    num_questions_per_chunk=2,  # number of questions requested per chunk (node)
    show_progress=True,
    llm=Settings.llm,
)

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
# The documents were parsed into 13 nodes and the generator asks for 2
# questions per node, so 26 questions are expected in total.
# NOTE: this rebinds `rag_dataset`, replacing the hand-built dataset from earlier.
rag_dataset = dataset_generator.generate_dataset_from_nodes()

100%|██████████| 13/13 [01:50<00:00,  8.48s/it]
100%|██████████| 2/2 [00:11<00:00,  5.75s/it]
100%|██████████| 2/2 [00:09<00:00,  4.84s/it]
100%|██████████| 2/2 [00:11<00:00,  5.66s/it]
100%|██████████| 2/2 [00:10<00:00,  5.30s/it]
100%|██████████| 2/2 [00:12<00:00,  6.32s/it]
100%|██████████| 2/2 [00:09<00:00,  4.52s/it]
100%|██████████| 2/2 [00:13<00:00,  6.99s/it]
100%|██████████| 2/2 [00:08<00:00,  4.03s/it]
100%|██████████| 2/2 [00:10<00:00,  5.30s/it]
100%|██████████| 2/2 [00:16<00:00,  8.08s/it]
100%|██████████| 2/2 [00:15<00:00,  7.73s/it]
100%|██████████| 2/2 [00:06<00:00,  3.28s/it]
100%|██████████| 2/2 [00:11<00:00,  5.84s/it]


In [17]:
# Inspect the generated dataset as a DataFrame.
# NOTE(review): in the recorded output several `query` values are LLM preamble
# or headers (e.g. "Here are two questions ...", "Question 1:") rather than
# actual questions — the generated rows likely need filtering before use.
rag_dataset.to_pandas()

Unnamed: 0,query,reference_contexts,reference_answer,reference_answer_by,query_by
0,Here are two questions I've generated based on...,"[The history of the city of San Francisco, Cal...","Based on the provided context information, her...",ai (llama3),ai (llama3)
1,Question 1:,"[The history of the city of San Francisco, Cal...",What was the earliest evidence of human habita...,ai (llama3),ai (llama3)
2,Here are two questions that cover different as...,[== Arrival of Europeans and early settlement ...,"Based on the provided context information, I'l...",ai (llama3),ai (llama3)
3,**Question 1**,[== Arrival of Europeans and early settlement ...,"Based on the provided context information, I'l...",ai (llama3),ai (llama3)
4,Here are two questions that cover different as...,[== 1848 gold rush ==\n\nThe California gold r...,Here are the answers to the two questions:\n\n...,ai (llama3),ai (llama3)
5,Question 1:,[== 1848 gold rush ==\n\nThe California gold r...,Based on the provided context information abou...,ai (llama3),ai (llama3)
6,Here are two questions that cover different as...,[== Paris of the West ==\n\nIt was during the ...,"Based on the provided context information, I'l...",ai (llama3),ai (llama3)
7,**Question 1:** What was the name of the surge...,[== Paris of the West ==\n\nIt was during the ...,"According to the provided context information,...",ai (llama3),ai (llama3)
8,"Based on the context information, I've generat...",[== Corruption and graft trials ==\n\nMayor Eu...,I'm happy to help! Since you're asking me to g...,ai (llama3),ai (llama3)
9,**Question 1**,[== Corruption and graft trials ==\n\nMayor Eu...,"Based on the provided context information, her...",ai (llama3),ai (llama3)
