In [2]:
#!pip install langchain langchain-community 

In [4]:
from langchain.text_splitter import (
  RecursiveCharacterTextSplitter ,
  MarkdownHeaderTextSplitter

)

# Small markdown document with three header levels (#, ##, ###). It is the
# shared input for both the recursive and the markdown-aware splitter demos.
# The literal opens with a newline, so the first header starts at offset 1.
sample_text = """
# Intro
LangChain helps build LLM apps. Chunking is step one.

## What is chunking?
We split long docs into smaller pieces, often with overlap, to help retrieval.

### Example
This paragraph should stay near its header when possible.
"""


In [8]:
# Recursive character chunking.
# Separators are listed from coarsest (blank line) to finest (single space).
# add_start_index=True records each chunk's offset in the source text under
# the "start_index" metadata key, which is printed below.
rec_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=200,
    separators=["\n\n", "\n", ". ", " "],
    add_start_index=True,
)

rec_chunks = rec_splitter.create_documents([sample_text])
print("Recursion", len(rec_chunks), "chunks")

# Preview at most the first two chunks: position, length, start offset.
for idx, doc in enumerate(rec_chunks[:2]):
    start = doc.metadata.get("start_index")
    print(idx, len(doc.page_content), start)

Recursion 1 chunks
0 233 1


In [11]:
# Markdown-aware splitting (headers are kept as metadata).
# Each (marker, key) pair maps a header level to the metadata key under
# which that header's text is stored on the resulting documents.
headers = [
    ("#", "h1"),
    ("##", "h2"),
    ("###", "h3"),
]

md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers)
md_docs = md_splitter.split_text(sample_text)


In [18]:
# Pass the header-split sections through the recursive splitter as well, so
# every section is also subject to the chunk-size limit configured above.
md_chunks = rec_splitter.split_documents(md_docs)
print('markdown-aware', len(md_chunks), 'chunks')

# Show the metadata (header keys plus start_index) of the first two chunks.
for idx, chunk in enumerate(md_chunks[:2]):
    print(idx, chunk.metadata)

markdown-aware 3 chunks
0 {'h1': 'Intro', 'start_index': 0}
1 {'h1': 'Intro', 'h2': 'What is chunking?', 'start_index': 0}


Chunk 0: Dogs are loyal animals. They love playing fetch.
Chunk 1: Quantum mechanics studies subatomic particles. It is very complex.


In [27]:
# Embedding chunks with Gemini
#
# Setup (run once): %pip install google-genai langchain

import os
from getpass import getpass

import numpy as np
from google import genai
from google.genai import types

# SECURITY: never hardcode an API key in a notebook -- it ends up in version
# control and in every shared copy. Read it from the GEMINI_API_KEY
# environment variable and fall back to an interactive prompt.
# The key that was previously committed in this cell must be treated as
# leaked and revoked in the Google AI Studio / Cloud console.
gemini_api_key = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")

client = genai.Client(api_key=gemini_api_key)

# Two short example passages to embed.
chunks = [
    "LangChain helps build LLM apps. Chunking is step one.",
    "We split long docs into smaller pieces, often with overlap, to help retrieval."
]

# Generate one embedding per chunk in a single request.
result = client.models.embed_content(
    model='gemini-embedding-001',
    contents=chunks,
    config=types.EmbedContentConfig(
        task_type="RETRIEVAL_DOCUMENT",  # these are documents to index, not queries
        output_dimensionality=768
    )
)

# Convert each returned embedding's value list into a NumPy vector.
# NOTE(review): the Gemini embeddings guide appears to say that vectors
# requested at a reduced output_dimensionality are not unit-normalized;
# confirm and normalize before using them for cosine/dot-product similarity.
embeddings = [np.array(e.values) for e in result.embeddings]

print('got' ,len(embeddings),"embeddings")

print('each embedding length',len(embeddings[0]))

print("first 5 dims of first embedding ",embeddings[0][:5])



got 2 embeddings
each embedding length 768
first 5 dims of first embedding  [-0.00644841  0.01784425  0.00010048 -0.0708848  -0.00857143]
