# Node Processing

In [1]:
import os

import nest_asyncio
from llama_index.readers import PDFReader
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)

from llama_index.extractors import (
    KeywordExtractor,
    EntityExtractor,
    BaseExtractor,
)

from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.storage.index_store import SimpleIndexStore
from llama_index import VectorStoreIndex
from llama_index.ingestion import IngestionPipeline
from llama_index.text_splitter import SentenceSplitter

# Directory that is scanned for the input papers.
INPUT_PATH = "data/.papers"
# Directory used for on-disk persistence: it is passed both to the Chroma
# PersistentClient and to pipeline.persist() further down.
PERSIST_PATH = "data/.storage"
# Name of the Chroma collection that stores the node vectors.
COLLECTION_NAME = "quickstart"

# File name of the quantised GGUF model; it is resolved relative to ./models/.
QUANT_VERSION = "mistral-7b-instruct-v0.2.Q3_K_S.gguf"
LANGUAGE_MODEL = f"./models/{QUANT_VERSION}"
# Model name handed to HuggingFaceEmbedding.
EMBEDDING_MODEL = "BAAI/bge-small-en"

# Patch asyncio so that event loops can be nested; presumably required because
# the notebook kernel already runs a loop while the pipeline uses async code.
nest_asyncio.apply()

In [2]:
import chromadb
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.prompts import PromptTemplate

from llama_index import ServiceContext


# Local LLM loaded through llama.cpp from the GGUF file at LANGUAGE_MODEL.
llm = LlamaCPP(
    model_path=LANGUAGE_MODEL,
    temperature=0.0,
    max_new_tokens=512,
    # NOTE(review): the loader log reports llama.context_length = 32768 for
    # this model; 3000 is presumably chosen to limit memory use — confirm.
    context_window=3000,
    generate_kwargs={},
    # NOTE(review): presumably the number of layers offloaded to the GPU (the
    # loader log reports block_count = 32) — confirm against llama.cpp docs.
    model_kwargs={"n_gpu_layers": 30},
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=False,
)
# Embedding model used both for indexing and for querying.
embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)

# Initialise the Chroma client; PERSIST_PATH is where it saves its data.
# NOTE(review): the collection is persistent and obtained with
# get_or_create_collection, so re-running the notebook presumably adds to
# whatever is already stored — verify whether duplicate nodes accumulate.
chroma_client = chromadb.PersistentClient(path=PERSIST_PATH)
chroma_collection = chroma_client.get_or_create_collection(COLLECTION_NAME)
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)


# Storage context routes vectors to Chroma; service context bundles the LLM
# and the embedding model for the index and query engine built later.
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)

ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 2060, compute capability 7.5
llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from ./models/mistral-7b-instruct-v0.2.Q3_K_S.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:              

## Metadata Creation

In [3]:
loader = PDFReader()

# os.listdir() returns entries in arbitrary order, so sort them: the slice
# below and later positional look-ups (e.g. documents[-2]) must give the same
# result on every run. Non-PDF entries are skipped because they are handed to
# PDFReader.
files = sorted(
    name for name in os.listdir(INPUT_PATH) if name.lower().endswith(".pdf")
)

# load_data() returns a list of documents per file; collect them in one flat list.
documents = []
for file_name in files:
    documents.extend(loader.load_data(f"{INPUT_PATH}/{file_name}"))

# Only the first three loaded documents are kept (presumably to keep the run short).
documents = documents[:3]

print(f"Loaded {len(documents)} documents")

Loaded 3 documents


In [6]:
# Title-generation prompt wrapped in [INST] ... [/INST] instruction tags.
# {context_str} is replaced with the text to summarise. Each trailing backslash
# is a line continuation, so the instruction sentences end up on a single line
# of the final string.
prompt = """
### [INST] Context: {context_str}. Give a highly concise title that summarizes \
the unique themes found in the context, in no more than 20 words. \
Dont include descriptions of what you are doing, such as this document summarizes. Be as concise as possible. \

Title: [/INST]"""


# Try the prompt on a single loaded document before wiring it into the pipeline.
llm.predict(PromptTemplate(template=prompt), context_str=documents[-2].text)

' "Challenging the Concept of Emergent Abilities in Language Models: A Mathematical Alternative"'

In [7]:
from llama_index.llm_predictor.base import LLMPredictorType
from llama_index.bridge.pydantic import Field
from llama_index.async_utils import run_jobs


class CustomLLMExtractor(BaseExtractor):
    """Metadata extractor that asks an LLM for a short title for each node.

    The prompt is run once per node with the node text bound to
    ``context_str``; the cleaned completion is stored under the
    ``node_title`` metadata key.
    """

    llm: LLMPredictorType = Field(description="The LLM to use for generation.")
    # The default below is never applied because __init__ requires `prompt`.
    prompt: PromptTemplate = Field(
        default="""[INST] [/INST]""",
        description="The prompt to extract titles with.",
    )

    def __init__(self, llm, prompt, **kwargs):
        """Create the extractor.

        Args:
            llm: The LLM used to generate titles.
            prompt: Either a template string containing ``{context_str}`` or
                an already constructed prompt template.
            **kwargs: Forwarded unchanged to ``BaseExtractor`` so that its own
                options (for example ``num_workers`` or ``show_progress``)
                can be set by the caller.
        """
        # Only wrap raw strings; a ready-made template is passed through as is.
        if isinstance(prompt, str):
            prompt = PromptTemplate(template=prompt)
        super().__init__(llm=llm, prompt=prompt, **kwargs)

    async def aextract(self, nodes):
        """Generate a title for every node.

        Args:
            nodes: Nodes exposing a ``text`` attribute.

        Returns:
            A list with one ``{"node_title": str}`` dict per completion
            returned by ``run_jobs``.
        """
        # Build one un-awaited prediction per node; run_jobs awaits them using
        # the extractor's worker and progress settings.
        jobs = [self.llm.apredict(self.prompt, context_str=node.text) for node in nodes]
        candidates = await run_jobs(
            jobs, show_progress=self.show_progress, workers=self.num_workers
        )

        # Completions may come back padded with whitespace and wrapped in
        # double quotes; strip both from either end.
        return [{"node_title": c.strip(' \t\n\r"')} for c in candidates]


class EntityFlattener(BaseExtractor):
    """Collapse each node's ``entities`` metadata into one comma-separated string."""

    async def aextract(self, nodes):
        """Return one ``{"entities": str}`` dict per node.

        A node without an ``entities`` entry yields an empty string. A value
        that is already a string is returned unchanged, so applying the
        flattener twice to the same node is harmless.
        """
        flattened = []
        for node in nodes:
            entities = node.metadata.get("entities", [])
            # Joining a str would put ", " between its individual characters,
            # so only join real collections of entity names.
            if not isinstance(entities, str):
                entities = ", ".join(entities)
            flattened.append({"entities": entities})
        return flattened

In [8]:
# Ingestion steps, listed in the order they are meant to run.
transformations = [
    # Split the documents into chunks before any metadata is extracted.
    SentenceSplitter(chunk_size=512, chunk_overlap=16),
    # NOTE(review): the captured run log warns that SpanMarker predictions are
    # computed on the CPU although CUDA is available; consider moving the
    # entity model to the GPU if memory allows.
    EntityExtractor(prediction_threshold=0.5),
    # Has to come after EntityExtractor: it rewrites the "entities" metadata
    # produced there into a single string.
    EntityFlattener(),
    # NOTE(review): the first node in the captured output lists more than 10
    # keywords, so `keywords=10` does not appear to act as a hard limit.
    KeywordExtractor(keywords=10, llm=llm),
    # Adds a "node_title" entry generated with the title prompt.
    CustomLLMExtractor(llm=llm, prompt=prompt),
]

# Run all transformations over the loaded documents to obtain enriched nodes.
pipeline = IngestionPipeline(transformations=transformations)
nodes = pipeline.run(documents=documents)

Extracting entities:   0%|          | 0/7 [00:00<?, ?it/s]SpanMarker model predictions are being computed on the CPU while CUDA is available. Moving the model to CUDA using `model.cuda()` before performing predictions is heavily recommended to significantly boost prediction speeds.
Extracting entities:  14%|█▍        | 1/7 [00:04<00:29,  4.83s/it]SpanMarker model predictions are being computed on the CPU while CUDA is available. Moving the model to CUDA using `model.cuda()` before performing predictions is heavily recommended to significantly boost prediction speeds.
Extracting entities:  29%|██▊       | 2/7 [00:08<00:21,  4.40s/it]SpanMarker model predictions are being computed on the CPU while CUDA is available. Moving the model to CUDA using `model.cuda()` before performing predictions is heavily recommended to significantly boost prediction speeds.
Extracting entities:  43%|████▎     | 3/7 [00:12<00:16,  4.19s/it]SpanMarker model predictions are being computed on the CPU while CUDA

In [9]:
# Show the metadata dict that the pipeline attached to every node.
for node in nodes:
    print(node.metadata)

{'page_label': '1', 'file_name': 'data/.papers/2304.15004v2.Are_Emergent_Abilities_of_Large_Language_Models_a_Mirage_.pdf', 'entities': 'Rylan Schaeffer, Stanford University, Brando Miranda', 'excerpt_keywords': 'Emergent abilities, large language models, complex systems, Nobel Prize, P.W. Anderson, "More Is Different", complexity, new properties, microscopic details, emergence (in physics), system behavior, model scaling, metrics, nonlinear metrics, discontinuous metrics, linear metrics, continuous metrics, predictability, AI models, BIG-Bench, InstructGPT, GPT-3.', 'node_title': 'Challenging the Reality of Emergent Abilities in Large Language Models: A Metric-Dependent Perspective'}
{'page_label': '1', 'file_name': 'data/.papers/2304.15004v2.Are_Emergent_Abilities_of_Large_Language_Models_a_Mirage_.pdf', 'entities': 'LaMDA', 'excerpt_keywords': 'emergence, machine learning, large language models, GPT-3, PaLM, LaMDA, performance improvements, scale, unpredictability, sharp transition.

In [2]:
from llama_index.embeddings import HuggingFaceEmbedding

# Instantiate the embedding model from the shared EMBEDDING_MODEL constant
# (instead of repeating the model id) and display its concrete class.
model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
type(model)

  from .autonotebook import tqdm as notebook_tqdm


llama_index.embeddings.huggingface.HuggingFaceEmbedding

In [10]:
# Save the ingestion pipeline state under PERSIST_PATH.
pipeline.persist(PERSIST_PATH)

# Attach an embedding to every node, computed over its content rendered with
# metadata_mode="all".
for node in nodes:
    content_with_metadata = node.get_content(metadata_mode="all")
    node.embedding = embed_model.get_text_embedding(content_with_metadata)

## Create Index

In [12]:
# Build the vector index over the ingested nodes; vectors go to the store
# configured in storage_context.
index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
    service_context=service_context,
)

# Ask a sample question through the index's default query engine.
query_str = "Does emergence in LLMs really happen and when?"
query_engine = index.as_query_engine()
response = query_engine.query(query_str)
print(response)

 Based on the provided context, the text suggests that there have been observations of "emergent abilities" in large language models (LLMs) such as GPT-3, PaLM, and LaMDA. These emergent abilities are defined as capabilities that are not present in smaller-scale models but appear in larger ones. The text also mentions that these abilities can transition seemingly instantaneously from not present to present. However, it's important to note that the existence of emergence in LLMs is still a topic of ongoing research and debate. Some researchers argue that these abilities are not truly emergent but rather an artifact of scale or complexity. The text also mentions an alternative explanation for these abilities being presented as a mathematical model, which aims to quantitatively reproduce the evidence offered in support of emergent abilities. Therefore, while there is evidence suggesting the existence of emergent abilities in LLMs, it's not definitively established and further research is 

In [14]:
# Load from disk: re-open the persisted Chroma collection and build an index
# directly on top of the stored vectors, without re-ingesting any documents.
db_2 = chromadb.PersistentClient(path=PERSIST_PATH)
# Use the shared constant rather than a repeated literal, so this always opens
# the same collection the notebook wrote to.
chroma_collection_2 = db_2.get_or_create_collection(COLLECTION_NAME)
vector_store_2 = ChromaVectorStore(chroma_collection=chroma_collection_2)
index_2 = VectorStoreIndex.from_vector_store(
    vector_store_2,
    service_context=service_context,
)

# Run the same sample question against the reloaded index.
query_engine = index_2.as_query_engine()
query_str = "Does emergence in LLMs really happen and when?"
response = query_engine.query(query_str)
print(str(response))

 Based on the provided context, the text suggests that there have been observations of "emergent abilities" in large language models (LLMs) such as GPT-3, PaLM, and LaMDA. These emergent abilities are defined as capabilities that are not present in smaller-scale models but appear in larger ones. The text also mentions that these abilities can transition seemingly instantaneously from not present to present. However, it's important to note that the existence of emergence in LLMs is still a topic of ongoing research and debate. The text also mentions that there are alternative explanations for the observed behaviors of LLMs that should be considered. Therefore, while the text suggests that emergence may occur in LLMs, it does not definitively prove or disprove its existence.
