In [1]:
# Ex. 2 : Chunk size Impact on Retrieval

from sentence_transformers import SentenceTransformer , util
import numpy as np
print("Libraries imported successfully!")




  from .autonotebook import tqdm as notebook_tqdm


Libraries imported successfully!


In [2]:
def cosine_similarity(vec1, vec2):
    """
    Calculate cosine similarity between two vectors.
    
    Returns a score between -1 and 1 (higher = more similar)
    """
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    return dot_product / (norm1 * norm2)

print("Similarity function ready!")

Similarity function ready!


In [9]:
document = """
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to
the natural intelligence displayed by humans and animals. Leading AI textbooks define
the field as the study of intelligent agents: any device that perceives its environment
and takes actions that maximize its chance of successfully achieving its goals.

Machine learning is a subset of artificial intelligence that focuses on the use of data
and algorithms to imitate the way that humans learn, gradually improving its accuracy.
Machine learning is an important component of the growing field of data science.

Deep learning is part of a broader family of machine learning methods based on artificial
neural networks with representation learning. Learning can be supervised, semi-supervised
or unsupervised. Deep learning architectures such as deep neural networks, deep belief
networks, recurrent neural networks and convolutional neural networks have been applied
to fields including computer vision, speech recognition, natural language processing,
machine translation, and bio informatics.

Natural language processing is a subfield of linguistics, computer science, and artificial
intelligence concerned with the interactions between computers and human language, in
particular how to program computers to process and analyze large amounts of natural
language data. Challenges in natural language processing frequently involve speech
recognition, natural language understanding, and natural language generation.
"""


In [6]:

# Function to split text into chunks

def chunk_text(text, chunk_size=100):
    """
    Split text into chunks of fixed word length.

    Args:
        text (str): Input text
        chunk_size (int): Number of words per chunk

    Returns:
        list: List of text chunks
    """
    words = text.split()
    chunks = []

    for i in range(0, len(words), chunk_size):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)

    return chunks

# or

# Function to split text into chunks
def chunk_text(text, chunk_size):
    chunks = []
    start = 0
    while start < len(text):
        chunk = text[start:start+chunk_size].strip()
        if chunk:
            chunks.append(chunk)
        start += chunk_size
    return chunks

In [10]:
# Query
query = "What is machine learning?"


# Load model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Different chunk sizes
chunk_sizes = [100, 200, 400]

for size in chunk_sizes:
    print(f"\n{'='*10} Chunk Size: {size} {'='*10}")
    
    # Chunk document
    chunks = chunk_text(document, size)
    print(f"Number of chunks: {len(chunks)}")


Number of chunks: 16

Number of chunks: 8

Number of chunks: 4


In [11]:
 
    # Create embeddings for chunks
chunk_embeddings = model.encode(chunks)
    
    # Create embedding for query
query_embedding = model.encode([query])[0]

In [12]:
  # Calculate similarity scores
similarities = []
for i, emb in enumerate(chunk_embeddings):
        score = cosine_similarity(query_embedding, emb)
        similarities.append((chunks[i], score))

In [13]:
 # Sort by similarity (descending)
similarities_sorted = sorted(similarities, key=lambda x: x[1], reverse=True)
    

In [15]:
# Print top 3 results
print("Top 3 chunks:")
for chunk, score in similarities_sorted[:3]:
        print(f"- Score: {score:.4f}, Chunk: \"{chunk}\"")

Top 3 chunks:
- Score: 0.6687, Chunk: "at focuses on the use of data
and algorithms to imitate the way that humans learn, gradually improving its accuracy.
Machine learning is an important component of the growing field of data science.

Deep learning is part of a broader family of machine learning methods based on artificial
neural networks with representation learning. Learning can be supervised, semi-supervised
or unsupervised. Deep"
- Score: 0.6539, Chunk: "Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to
the natural intelligence displayed by humans and animals. Leading AI textbooks define
the field as the study of intelligent agents: any device that perceives its environment
and takes actions that maximize its chance of successfully achieving its goals.

Machine learning is a subset of artificial intelligence th"
- Score: 0.4513, Chunk: "learning architectures such as deep neural networks, deep belief
networks, recurrent neural networks and con