In [1]:
!pip install faiss-cpu sentence-transformers



Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

In [18]:
import json
import os

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

class Retriever:
    """Split documents into fixed-size character chunks, embed them, and search them with FAISS.

    The FAISS index and ``doc_chunks`` are kept position-aligned: the vector
    stored at position ``i`` of the index was computed from ``doc_chunks[i]``.
    """

    def __init__(self, embedding_model='all-MiniLM-L6-v2', chunk_size=512):
        """Load the embedding model and start with no index.

        Args:
            embedding_model: Model name or path passed to ``SentenceTransformer``.
            chunk_size: Maximum number of characters per chunk; must be >= 1.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

        self.chunk_size = chunk_size
        self.model = SentenceTransformer(embedding_model)
        # Stays None until add_documents() or load() provides an index.
        self.index = None
        # Chunk texts, in the same order as the vectors in the index.
        self.doc_chunks = []

    def _chunk_document(self, document):
        """Return consecutive, non-overlapping slices of ``document``.

        Every slice is ``self.chunk_size`` characters long except possibly the
        last one. An empty document yields an empty list.
        """
        return [document[i:i + self.chunk_size] for i in range(0, len(document), self.chunk_size)]

    def add_documents(self, documents):
        """Chunk and embed ``documents`` and build a fresh L2 index from them.

        Any previously built or loaded index and chunk list is replaced.

        Args:
            documents: Iterable of document strings. A single string is treated
                as one document instead of being iterated character by character.

        Raises:
            ValueError: If the documents contain no text to index.
        """
        if isinstance(documents, str):
            documents = [documents]

        all_chunks = [chunk for doc in documents for chunk in self._chunk_document(doc)]
        if not all_chunks:
            # Without this guard the failure surfaces later as an obscure shape error.
            raise ValueError("No text to index: the documents are empty.")

        # FAISS expects a C-contiguous float32 matrix.
        embeddings = np.ascontiguousarray(
            self.model.encode(all_chunks, convert_to_tensor=False), dtype=np.float32
        )

        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)

        # Publish both pieces only after everything succeeded so they stay aligned.
        self.index = index
        self.doc_chunks = all_chunks

    def query(self, query, top_k=1):
        """Return up to ``top_k`` chunks closest to ``query``, nearest first.

        Args:
            query: Query text. A blank query returns an empty list.
            top_k: Number of neighbours to request from the index.

        Returns:
            A list of chunk strings. It is shorter than ``top_k`` when the index
            holds fewer vectors than requested.

        Raises:
            RuntimeError: If no index has been built or loaded yet.
        """
        if self.index is None:
            raise RuntimeError("The index is empty; call add_documents() or load() first.")
        if not query or not query.strip():
            return []

        query_embedding = np.ascontiguousarray(
            self.model.encode([query], convert_to_tensor=False), dtype=np.float32
        )
        _distances, labels = self.index.search(query_embedding, top_k)

        # FAISS pads missing neighbours with label -1; as a Python index that
        # would silently return the last chunk, so drop those entries.
        return [self.doc_chunks[i] for i in labels[0] if i >= 0]

    def save(self, index_path='index.faiss', chunk_file='chunks.txt'):
        """Write the FAISS index to ``index_path`` and the chunks to ``chunk_file``.

        Raises:
            RuntimeError: If no index has been built or loaded yet.
        """
        if self.index is None:
            raise RuntimeError("Nothing to save; call add_documents() or load() first.")

        faiss.write_index(self.index, index_path)
        self._write_chunks(chunk_file, self.doc_chunks)

    def load(self, index_path='index.faiss', chunk_file='chunks.txt'):
        """Restore the index and chunk list previously written by ``save``.

        Chunk files in the older one-chunk-per-line layout are still accepted,
        but chunks that contained newlines cannot be recovered from that layout.
        """
        index = faiss.read_index(index_path)
        chunks = self._read_chunks(chunk_file)

        self.index = index
        self.doc_chunks = chunks

    @staticmethod
    def _write_chunks(chunk_file, chunks):
        """Store ``chunks`` as a single JSON array so embedded newlines survive."""
        with open(chunk_file, 'w', encoding='utf-8') as f:
            json.dump(list(chunks), f)

    @staticmethod
    def _read_chunks(chunk_file):
        """Read chunks written by ``_write_chunks``, falling back to the legacy line layout."""
        with open(chunk_file, 'r', encoding='utf-8') as f:
            raw = f.read()

        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, list) and all(isinstance(item, str) for item in parsed):
            return parsed

        # Legacy layout: every chunk followed by "\n". Strip the terminator that
        # readlines() used to leave attached to each chunk.
        lines = raw.split('\n')
        if lines and lines[-1] == '':
            lines.pop()
        return lines


In [19]:
from google.colab import files

def upload_files():
    """Call ``google.colab.files.upload()`` and return its result unchanged."""
    return files.upload()

def load_document(file_path, encoding='utf-8'):
    """Read a text file and return its entire contents as one string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's default encoding.

    Returns:
        The decoded file contents.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()


uploaded_files = upload_files()
if not uploaded_files:
    raise ValueError("No file was uploaded; re-run this cell and choose a text file.")

# files.upload() keys its result by the name the browser sent, but Colab stores
# the file under a different name when that one is already taken (the recorded
# output shows "sample.txt" saved as "sample (1).txt"). Opening the key as a
# path would read the older file, so decode the returned bytes directly.
uploaded_name = next(iter(uploaded_files))
document = uploaded_files[uploaded_name].decode('utf-8')


Saving sample.txt to sample (1).txt


In [20]:

def show_results(retriever, query, top_k=3):
    """Run ``query`` against ``retriever``, print the ranked chunks, and return them.

    Args:
        retriever: Object exposing ``query(query, top_k=...)``.
        query: Query text.
        top_k: Number of chunks to request.

    Returns:
        The list returned by ``retriever.query``.
    """
    results = retriever.query(query, top_k=top_k)
    print(f"Results for '{query}':")
    for rank, chunk in enumerate(results, 1):
        print(f"Result {rank}: {chunk}\n")
    return results


retriever = Retriever(chunk_size=256)
retriever.add_documents([document])

query_1 = "What is machine learning?"
results_1 = show_results(retriever, query_1)

query_2 = "What is Natural Language Processing?"
results_2 = show_results(retriever, query_2)


Results for 'What is machine learning?':
Result 1: Machine Learning and Natural Language Processing

Machine learning (ML) is a branch of artificial intelligence (AI) that focuses on building systems that can learn from data and improve their performance over time. ML has gained significant attention due t

Result 2: o its wide-ranging applications in fields like healthcare, finance, marketing, and more.

One of the key areas of machine learning is supervised learning, where algorithms learn from labeled data to make predictions. In contrast, unsupervised learning deal

Result 3: s with finding patterns in data without labeled examples.

Natural Language Processing (NLP) is a subfield of AI that focuses on enabling computers to understand and process human language. NLP is used in applications like language translation, sentiment a

Results for 'What is Natural Language Processing?':
Result 1: s with finding patterns in data without labeled examples.

Natural Language Processing (NLP) 

In [21]:

# Write the index and chunk list to disk, then read them back from the same
# files. The paths are spelled out; they equal the defaults of save()/load().
retriever.save(index_path='index.faiss', chunk_file='chunks.txt')
retriever.load(index_path='index.faiss', chunk_file='chunks.txt')


In [25]:
import string

def test_retriever():
    """Smoke-test ``Retriever`` on a three-sentence corpus.

    Builds an index with ``chunk_size=128``, runs a series of queries, prints
    each query with its results, and asserts on result counts and on expected
    keywords. Raises ``AssertionError`` at the first failed check.
    """
    corpus = [
        "Machine learning (ML) is a branch of artificial intelligence that allows systems to learn from data.",
        "Natural language processing (NLP) enables computers to understand and generate human language.",
        "Deep learning is a subset of machine learning, using neural networks to model complex patterns in data."
    ]

    engine = Retriever(chunk_size=128)
    engine.add_documents(corpus)

    def ask(question, top_k=1, lead="\n", label="Results"):
        # Query first, then print, so the output order matches a plain inline call.
        hits = engine.query(question, top_k=top_k)
        print(f"{lead}Query: {question}")
        print(f"{label}: {hits}")
        return hits

    # Single best match for a basic question.
    first_hits = ask("What is machine learning?", lead="")
    assert len(first_hits) == 1, "Expected 1 result"
    assert "machine learning" in first_hits[0].lower(), "Expected the result to mention 'machine learning'"

    # For each topic, the last word of the question (punctuation removed) must
    # appear in the top result.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    for question in (
        "What is machine learning?",
        "What is natural language processing?",
        "What is deep learning?",
    ):
        keyword = question.translate(strip_punctuation).split(' ')[-1]
        hits = ask(question)
        assert len(hits) == 1, f"Expected 1 result for query '{question}'"
        assert keyword.lower() in hits[0].lower(), f"Expected result to mention '{keyword}'"

    # Asking for two neighbours returns two chunks.
    top_two = ask("What is machine learning?", top_k=2, label="Top 2 Results")
    assert len(top_two) == 2, "Expected 2 results"

    # Upper-case query.
    shouted_hits = ask("WHAT IS MACHINE LEARNING?")
    assert "machine learning" in shouted_hits[0].lower(), "Expected the result to mention 'machine learning'"

    # Single-word query.
    short_hits = ask("Machine")
    assert "machine learning" in short_hits[0].lower(), "Expected result to mention 'machine learning'"

    # A topic absent from the corpus still yields one chunk, which must not mention it.
    unrelated_hits = ask("What is quantum computing?")
    assert len(unrelated_hits) == 1, "Expected 1 result"
    assert "quantum computing" not in unrelated_hits[0].lower(), "Expected result not to mention 'quantum computing'"

    # Empty query: the blank case is short-circuited here, so the retriever
    # itself is never called for it.
    blank = ""
    blank_hits = [] if blank.strip() == "" else engine.query(blank, top_k=1)
    print(f"\nQuery: {blank}")
    print(f"Results: {blank_hits}")
    assert len(blank_hits) == 0, "Expected no results for an empty query"

    print("\nAll tests passed successfully!")

# Run the smoke test defined above; the first failing assert raises AssertionError.
test_retriever()


Query: What is machine learning?
Results: ['Machine learning (ML) is a branch of artificial intelligence that allows systems to learn from data.']

Query: What is machine learning?
Results: ['Machine learning (ML) is a branch of artificial intelligence that allows systems to learn from data.']

Query: What is natural language processing?
Results: ['Natural language processing (NLP) enables computers to understand and generate human language.']

Query: What is deep learning?
Results: ['Deep learning is a subset of machine learning, using neural networks to model complex patterns in data.']

Query: What is machine learning?
Top 2 Results: ['Machine learning (ML) is a branch of artificial intelligence that allows systems to learn from data.', 'Deep learning is a subset of machine learning, using neural networks to model complex patterns in data.']

Query: WHAT IS MACHINE LEARNING?
Results: ['Machine learning (ML) is a branch of artificial intelligence that allows systems to learn from dat