In [47]:
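# RAG pipeline (retrieval step): split a document into chunks, embed them,
# and rank the chunks by cosine similarity to a query.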

In [48]:
# Sample document to be chunked and embedded: a reviewer "letter of
# understanding", stored as a single multi-line string.
text = """
Dear Sir/Ma’am,
Thank you for agreeing to provide a technical review of our book “Ultimate Python coding”.
This letter of understanding summarizes our agreement.
As a thank you, we will recognize your contribution by
1. Crediting you as a reviewer in relevant published materials.
2. Sending you a complimentary print copy of the work
We request that you help us promote the book by writing an honest review, and
acknowledging your involvement in the project, on Amazon.com.
In exchange, we request that, for each chapter, you review the materials for accuracy,
relevance, and clarity by inserting comments directly into the documents. There may be
multiple iterations of certain chapters, based on your feedback.
While performing the review, we ask that you:
1. Adhere to the deadlines agreed upon with us
2. Insert detailed, self-explanatory comments (adopt an evidence-based approach)
3. Check for content accuracy, relevance, flow, gaps
4. Provide constructive, practical solutions, where possible
5. Test all instructions and code snippets to ensure they work as described and that the
instructions themselves are clear and direct. Provide screenshots of the final output,
and if the codes don’t work expected, please explain the issue and, if possible, help to
diagnose the problem and recommend a solution.
6. Check that the Q&A questions are accurate and can be answered using the
information in the chapter.
7. Review and sign off all content shared for review within 3 working days, to ensure
that the technical accuracy of the final product meets industry standards and best
practices.
8. Write a comprehensive Summary Comment for each chapter.
9. You need to also ensure that the unpublished manuscript is not accidentally/or
otherwise shared with anyone online or outside the book project team.
10. The Reviewer agrees to enter into a Non-Disclosure agreement till the time of
completing the project as mentioned in the contract or as decided by the publisher.
Reviewer will not share the work, draft manuscript, project idea or details of the
associated members on the project till the work is published with any member who is
not part of the publishing company Green Education Private Limited.
Please sign and date this letter to indicate that you’ve read and understood our terms and
conditions. 
"""

In [49]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [50]:
# Cut the letter into short pieces that will each be embedded separately.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,   # target maximum size of a single chunk
    chunk_overlap=2,  # amount of text shared by neighbouring chunks
)
chunks = splitter.split_text(text)


In [51]:
# Show the list of text chunks produced by the splitter.
print(chunks)

['Dear Sir/Ma’am,', 'Thank you for agreeing to provide a technical review of our book “Ultimate Python coding”.', 'This letter of understanding summarizes our agreement.', 'As a thank you, we will recognize your contribution by', '1. Crediting you as a reviewer in relevant published materials.', '2. Sending you a complimentary print copy of the work', 'We request that you help us promote the book by writing an honest review, and', 'acknowledging your involvement in the project, on Amazon.com.', 'In exchange, we request that, for each chapter, you review the materials for accuracy,', 'relevance, and clarity by inserting comments directly into the documents. There may be', 'multiple iterations of certain chapters, based on your feedback.', 'While performing the review, we ask that you:\n1. Adhere to the deadlines agreed upon with us', '2. Insert detailed, self-explanatory comments (adopt an evidence-based approach)', '3. Check for content accuracy, relevance, flow, gaps', '4. Provide con

In [52]:
from langchain_huggingface import HuggingFaceEmbeddings

# Load the sentence-transformers model, then embed every chunk of the letter
# (one embedding per entry in `chunks`).
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
embeddings = embedding_model.embed_documents(chunks)

In [53]:
# Question to look up in the letter.
query = "what is timeline for Review?"

# Embed the question with the same model that embedded the chunks.
query_embedding = embedding_model.embed_query(query)

In [54]:
import numpy as np

# Turn the flat query vector into a single-row 2-D array so it can be passed
# to cosine_similarity alongside the chunk embeddings.
query_embedding = np.reshape(np.array(query_embedding), (1, -1))

In [55]:
from sklearn.metrics.pairwise import cosine_similarity

# Score the query against every chunk embedding. The result has a single row
# (the query) with one similarity value per chunk.
similarity_scores = cosine_similarity(X=query_embedding, Y=embeddings)
print(similarity_scores)

[[ 0.03266935  0.13393293  0.15201281  0.1086706   0.40118347  0.22066552
   0.37674001  0.22177393  0.37278278  0.20967309  0.30508238  0.57256416
   0.17492386  0.31051151  0.10461181  0.09613794  0.07856338 -0.00109648
   0.10392228  0.20007482  0.24017078  0.42282005  0.23736496  0.26912292
   0.2545294   0.09558153  0.48765342  0.25978767  0.33585961  0.14699468
   0.07840793  0.22755124  0.19748078]]


In [56]:
# How many of the best-matching chunks to display.
top_k = 3

# Chunk indices ordered from most to least similar to the query
# (ascending argsort, reversed).
top_k_indices = np.argsort(similarity_scores[0])[::-1]

# Print the leading matches together with their similarity scores.
for i in top_k_indices[:top_k]:
    print(f"Score: {similarity_scores[0][i]:.4f} -> {chunks[i]}")


Score: 0.5726 -> While performing the review, we ask that you:
1. Adhere to the deadlines agreed upon with us
Score: 0.4877 -> 10. The Reviewer agrees to enter into a Non-Disclosure agreement till the time of
Score: 0.4228 -> 7. Review and sign off all content shared for review within 3 working days, to ensure
