## Ingestion Pipeline and Token Counting
The following topics are covered in this notebook:

1. Ingestion pipeline in llama_index
2. Transformation Caching
3. Custom Transformations
4. Tokenization and Token Counting

In [36]:
!pip install llama_index



In [37]:
import os
from getpass import getpass

# Do not store the key in the notebook: reuse a key already exported in the
# environment, otherwise prompt for it without echoing it to the output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [38]:
import nest_asyncio
# Patch asyncio so nested event loops are allowed. The pipeline runs below use
# asyncio (a tqdm_asyncio task appears in a later traceback); presumably this
# is needed because the notebook kernel already runs its own loop.
nest_asyncio.apply()

In [39]:
import os

from llama_index.llama_dataset import download_llama_dataset
from llama_index.llama_pack import download_llama_pack
from llama_index import VectorStoreIndex

# Create the download directory idempotently: a plain `mkdir` errors with
# "File exists" when the cell is re-run.
os.makedirs("./data", exist_ok=True)

# Download the benchmark dataset into ./data. The call returns the RAG dataset
# and the source documents that every pipeline below ingests.
rag_dataset, documents = download_llama_dataset(
    "PaulGrahamEssayDataset", "./data"
)

mkdir: cannot create directory ‘data’: File exists


# Ingestion Pipeline

 1. Text splitter
 2. node parser
 3. metadata extractor
 4. embedding model

In [73]:
from llama_index import Document
from llama_index.embeddings import OpenAIEmbedding
from llama_index.text_splitter import SentenceSplitter
from llama_index.extractors import TitleExtractor
from llama_index.ingestion import IngestionPipeline

In [41]:
# First pipeline: a single transformation that splits the documents into
# chunks (chunk_size=256, chunk_overlap=128).
sentence_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)
pipeline = IngestionPipeline(transformations=[sentence_splitter])
nodes = pipeline.run(documents=documents)

In [42]:
# Number of nodes returned by the pipeline run.
len(nodes)

147

In [43]:
# Display the first node in full (id, metadata, relationships, ...).
nodes[0]

TextNode(id_='beaf5cb8-1b36-44ba-b5ee-cd655435be11', embedding=None, metadata={'file_path': 'data/source_files/source.txt', 'file_name': 'source.txt', 'file_type': 'text/plain', 'file_size': 75084, 'creation_date': '2024-01-27', 'last_modified_date': '2024-01-27', 'last_accessed_date': '2024-01-27'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='f57958a6-6928-4aa2-888d-3d6cc912f282', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'data/source_files/source.txt', 'file_name': 'source.txt', 'file_type': 'text/plain', 'file_size': 75084, 'creation_date': '2024-01-27', 'last_modified_date': '2024-01-27', 'last_accessed_date': '2024-01-27'}, hash='59bda86706b709582b593684a2383c8e20b73cfba8789b5fc6

In [44]:
# Metadata dictionary attached to the first node.
nodes[0].metadata

{'file_path': 'data/source_files/source.txt',
 'file_name': 'source.txt',
 'file_type': 'text/plain',
 'file_size': 75084,
 'creation_date': '2024-01-27',
 'last_modified_date': '2024-01-27',
 'last_accessed_date': '2024-01-27'}

In [45]:
# Second pipeline: split as before, then run TitleExtractor, which adds a
# 'document_title' entry to each node's metadata (visible in the next cells).
title_transformations = [
    SentenceSplitter(chunk_size=256, chunk_overlap=128),
    TitleExtractor(),
]
pipeline = IngestionPipeline(transformations=title_transformations)
nodes = pipeline.run(documents=documents)

100%|██████████| 5/5 [00:41<00:00,  8.39s/it]


In [46]:
# First node's metadata again, to check for the 'document_title' entry.
nodes[0].metadata

{'file_path': 'data/source_files/source.txt',
 'file_name': 'source.txt',
 'file_type': 'text/plain',
 'file_size': 75084,
 'creation_date': '2024-01-27',
 'last_modified_date': '2024-01-27',
 'last_accessed_date': '2024-01-27',
 'document_title': 'Exploring the Evolution of Computing: From Mainframes to Microcomputers, Programming on the IBM 1401, and the Rise of Personal Programming'}

In [47]:
# Node count for the current `nodes` list.
len(nodes)

147

# Let's include Embeddings

In [50]:
# Third pipeline: smaller chunks (chunk_size=64, chunk_overlap=20), title
# extraction, then OpenAIEmbedding so each node also carries an embedding.
# NOTE: the recorded output of this cell shows a 429 RateLimitError for
# gpt-3.5-turbo (3 requests/min) raised inside an asyncio task, so this run
# can hit OpenAI rate limits on low-tier accounts.
embedding_transformations = [
    SentenceSplitter(chunk_size=64, chunk_overlap=20),
    TitleExtractor(),
    OpenAIEmbedding(),
]
pipeline = IngestionPipeline(transformations=embedding_transformations)
nodes = pipeline.run(documents=documents)



  0%|          | 0/5 [00:00<?, ?it/s][A[A

 20%|██        | 1/5 [00:00<00:02,  1.38it/s][A[A

 40%|████      | 2/5 [00:00<00:01,  2.20it/s][A[AERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-65' coro=<tqdm_asyncio.gather.<locals>.wrap_awaitable() done, defined at /usr/local/lib/python3.10/dist-packages/tqdm/asyncio.py:75> exception=RateLimitError("Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo in organization org-hhFLm0NHyXq0DYjZNPgBPBSY on requests per min (RPM): Limit 3, Used 3, Requested 1. Please try again in 20s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}")>
Traceback (most recent call last):
  File "/usr/lib/python3.10/asyncio/tasks.py", line 232, in __step
    result = coro.send(None)
  File

In [51]:
# Metadata of the first node in the current `nodes` list.
nodes[0].metadata

{'file_path': 'data/source_files/source.txt',
 'file_name': 'source.txt',
 'file_type': 'text/plain',
 'file_size': 75084,
 'creation_date': '2024-01-27',
 'last_modified_date': '2024-01-27',
 'last_accessed_date': '2024-01-27',
 'document_title': "Exploring the Depths: Unveiling the Journey of Novice Writers and Programmers in the Intriguing World of the School District's 1401"}

In [52]:
# Full repr of the first node; once embeddings have been computed the
# `embedding` field holds a long list of floats, so this output is large.
nodes[0]

TextNode(id_='5905412c-7d30-475f-b924-7c3f71c9d42f', embedding=[0.02508874423801899, 0.007771904580295086, -0.01519340742379427, -0.018276941031217575, -0.0027962035965174437, 0.041515566408634186, -0.02555127441883087, -0.031087618321180344, -0.02218742109835148, -0.01358856912702322, 0.04375813528895378, 0.03075123205780983, -0.0026227550115436316, -0.007344414480030537, -0.0025649387389421463, -0.0057220556773245335, 0.030246654525399208, -0.0209259744733572, -0.010673228651285172, -0.028690870851278305, -0.004726915620267391, -0.009117445908486843, -0.004775972105562687, -0.012747605331242085, -0.002086640801280737, -0.008556803688406944, 0.01711360737681389, -0.03860022500157356, 0.0005418082582764328, -0.004670851398259401, 0.01834702119231224, -0.017267784103751183, 0.005375158507376909, -0.020631637424230576, -0.01621657982468605, -0.004008592572063208, -0.014800957404077053, -0.027835892513394356, 0.008577827364206314, -0.006366794463247061, 0.028074165806174278, 0.01039991527

# Transformation Caching

In [53]:
from llama_index.ingestion import IngestionCache

In [54]:
# Pipeline whose results are persisted in the next cell:
# splitter (chunk_size=128, chunk_overlap=30) followed by title extraction.
cached_transformations = [
    SentenceSplitter(chunk_size=128, chunk_overlap=30),
    TitleExtractor(),
]
pipeline = IngestionPipeline(transformations=cached_transformations)
nodes = pipeline.run(documents=documents)

100%|██████████| 5/5 [00:42<00:00,  8.48s/it]


In [55]:
# Write the pipeline's cache to a JSON file, then load that file back into a
# separate IngestionCache object.
cache_path = "./llama_cache.json"
pipeline.cache.persist(cache_path)
new_cache = IngestionCache.from_persist_path(cache_path)

In [56]:
# Rebuild the same splitter + title-extractor pipeline on top of the cache
# that was loaded back from disk.
new_pipeline=IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=128,chunk_overlap=30),
        TitleExtractor(),
    ],
    cache=new_cache,
)
# Run it: constructing the pipeline alone never touches the reloaded cache,
# and the cells that follow would keep showing the previous run's `nodes`.
# The transformations match the cached run, so the results are expected to be
# served from `new_cache` rather than recomputed. A quick check is that the
# run finishes without the slow title-extraction progress bar.
nodes=new_pipeline.run(documents=documents)

In [57]:
# Display the first node currently bound to `nodes`.
nodes[0]

TextNode(id_='117c7157-c66f-4b09-a99d-c0570432a81f', embedding=None, metadata={'file_path': 'data/source_files/source.txt', 'file_name': 'source.txt', 'file_type': 'text/plain', 'file_size': 75084, 'creation_date': '2024-01-27', 'last_modified_date': '2024-01-27', 'last_accessed_date': '2024-01-27', 'document_title': 'Exploring Writing and Programming: My Journey Before College, Early Programming on the IBM 1401, and the Evolution of Programming'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='f57958a6-6928-4aa2-888d-3d6cc912f282', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'data/source_files/source.txt', 'file_name': 'source.txt', 'file_type': 'text/plain', 'file_size': 75084, 'creation

In [58]:
# Metadata of that first node, including its 'document_title'.
nodes[0].metadata

{'file_path': 'data/source_files/source.txt',
 'file_name': 'source.txt',
 'file_type': 'text/plain',
 'file_size': 75084,
 'creation_date': '2024-01-27',
 'last_modified_date': '2024-01-27',
 'last_accessed_date': '2024-01-27',
 'document_title': 'Exploring Writing and Programming: My Journey Before College, Early Programming on the IBM 1401, and the Evolution of Programming'}

In [59]:
# Reuse `new_cache` with the same splitter and title extractor, and add
# OpenAIEmbedding as a third step.
# NOTE(review): presumably only the embedding step needs fresh work because
# the first two steps are already cached; confirm against IngestionCache.
embedding_on_cache_steps = [
    SentenceSplitter(chunk_size=128, chunk_overlap=30),
    TitleExtractor(),
    OpenAIEmbedding(),
]
pipeline = IngestionPipeline(
    transformations=embedding_on_cache_steps,
    cache=new_cache,
)
nodes = pipeline.run(documents=documents)

In [60]:
# Metadata of the first node in the current `nodes` list.
nodes[0].metadata

{'file_path': 'data/source_files/source.txt',
 'file_name': 'source.txt',
 'file_type': 'text/plain',
 'file_size': 75084,
 'creation_date': '2024-01-27',
 'last_modified_date': '2024-01-27',
 'last_accessed_date': '2024-01-27',
 'document_title': 'Exploring Writing and Programming: My Journey Before College, Early Programming on the IBM 1401, and the Evolution of Programming'}

In [61]:
# Persist the cache of the embedding pipeline to its own JSON file and load it
# back as `nodes_embedding_cache`.
embeddings_cache_path = "./nodes_embeddings.json"
pipeline.cache.persist(embeddings_cache_path)
nodes_embedding_cache = IngestionCache.from_persist_path(embeddings_cache_path)

In [62]:
# Same three transformations as the run that populated this cache.
cached_embedding_steps = [
    SentenceSplitter(chunk_size=128, chunk_overlap=30),
    TitleExtractor(),
    OpenAIEmbedding(),
]
pipeline = IngestionPipeline(
    transformations=cached_embedding_steps,
    cache=nodes_embedding_cache,
)
# Because the transformations are unchanged, the results are expected to be
# loaded from `nodes_embedding_cache` instead of being recomputed.
nodes = pipeline.run(documents=documents)

# Custom Transformations

Let's add a custom transformation that removes special characters (everything except ASCII letters, digits, and spaces) from the node text before embeddings are generated.

In [63]:
from llama_index.schema import TransformComponent
import re

class TextCleaner(TransformComponent):
    """Transformation that keeps only ASCII letters, digits and spaces in node text."""

    def __call__(self, nodes, **kwargs):
        """Clean each node's ``text`` in place and return the same ``nodes`` object.

        Extra keyword arguments are accepted but not used.
        """
        # Everything outside 0-9, A-Z, a-z and the space character is dropped.
        # NOTE: that includes punctuation and newlines, so words separated
        # only by a line break end up joined together.
        disallowed = r'[^0-9A-Za-z ]'
        for current in nodes:
            current.text = re.sub(disallowed, "", current.text)
        return nodes
# Split the documents, then pass every chunk through the custom TextCleaner.
cleaning_steps = [
    SentenceSplitter(chunk_size=128, chunk_overlap=30),
    TextCleaner(),
]
pipeline = IngestionPipeline(transformations=cleaning_steps)
nodes = pipeline.run(documents=documents)

In [64]:
# Display the first node returned by the cleaning pipeline.
nodes[0]

TextNode(id_='8ada9aa9-4633-432b-b44c-c7e07fe905f5', embedding=None, metadata={'file_path': 'data/source_files/source.txt', 'file_name': 'source.txt', 'file_type': 'text/plain', 'file_size': 75084, 'creation_date': '2024-01-27', 'last_modified_date': '2024-01-27', 'last_accessed_date': '2024-01-27'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='f57958a6-6928-4aa2-888d-3d6cc912f282', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'data/source_files/source.txt', 'file_name': 'source.txt', 'file_type': 'text/plain', 'file_size': 75084, 'creation_date': '2024-01-27', 'last_modified_date': '2024-01-27', 'last_accessed_date': '2024-01-27'}, hash='59bda86706b709582b593684a2383c8e20b73cfba8789b5fc6

# Tokenization and Token Counting

In [65]:
import tiktoken
from llama_index.callbacks import CallbackManager, TokenCountingHandler
from llama_index import ServiceContext, set_global_tokenizer

# Register tiktoken's encoder for "gpt-3.5-turbo" as the global tokenizer.
gpt35_encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
set_global_tokenizer(gpt35_encoding.encode)

In [66]:
from llama_index import VectorStoreIndex
# Token-counting callback handler, wrapped in a CallbackManager so it can be
# passed to the service context.
token_counter = TokenCountingHandler(verbose=False)
callback_manager = CallbackManager([token_counter])

In [67]:
# Default service context, with the token-counting callback manager attached.
service_context = ServiceContext.from_defaults(callback_manager=callback_manager)

In [68]:
# Build the vector index over the current nodes. The service context carries
# the token counter, so embedding usage from this step is recorded (the next
# cell reads it back).
index = VectorStoreIndex(nodes, service_context=service_context)

# You can access the token counts directly

In [69]:
# Embedding tokens recorded by the handler so far.
token_counter.total_embedding_token_count

18949

# Reset the token counter

Reset the counts before querying so that the embedding tokens spent while building the index are not included in the query's totals.

In [70]:
# Clear the counts accumulated so far (the next cell reads 0).
token_counter.reset_counts()

In [71]:
# Check the embedding token count after the reset.
token_counter.total_embedding_token_count

0

In [72]:
# Run one query through the default query engine; the token counter attached
# to the index's service context records the usage.
query_engine = index.as_query_engine()
response = query_engine.query("what did the author do growing up")

# print() joins the pieces with single spaces, so each "\n" entry starts a
# new line in the report.
usage_report = [
    "Embedding Tokens",
    token_counter.total_embedding_token_count,
    "\n",
    "LLM prompt Tokens:",
    token_counter.prompt_llm_token_count,
    "\n",
    "LLM completion task:",
    token_counter.completion_llm_token_count,
    "\n",
    "Total LLM Token count:",
    token_counter.total_llm_token_count,
]
print(*usage_report)

Embedding Tokens 7 
 LLM prompt Tokens: 341 
 LLM completion task: 54 
 Total LLM Token count: 395
