In [1]:
import os
from dotenv import load_dotenv
from huggingface_hub import login, InferenceClient
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from datasets import load_dataset

In [2]:
# Create a token Huggin Face and save it in your own .env.local file

load_dotenv('.env.local')
token = os.getenv('HF_TOKEN')
login(token=token)

client = InferenceClient(model="meta-llama/Llama-3.2-1B")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Load the dataset from Hugging Face

ds = load_dataset("Falah/story44kids_1_prompts") # You can replace this with any compatible dataset

In [4]:
# Print the first few examples

print(ds['train'][:5])

{'prompts': ['Once upon a time, in a small village nestled on the outskirts of a mystical forest, there lived a poor but content farmer named Ethan. He had a modest cottage and a small plot of land where he grew vegetables to sustain himself. Despite the hardships that came his way, he always wore a smile and greeted everyone with warmth.', "One sunny morning, as Ethan was tending to his crops, he heard a rustling in the bushes nearby. Curiosity piqued, he cautiously approached the sound and discovered a beautiful fox trapped in a hunter's snare. The fox looked at Ethan with pleading eyes, silently asking for help.", 'Without a second thought, Ethan rushed over to free the fox. Using his trusted pocket knife, he carefully cut through the tangled mess until the fox was liberated. Grateful for being saved, the fox introduced herself as Fiona. She explained that she had gotten lost while exploring the depths of the mysterious forest.', "Ethan, being a gentle soul, couldn't leave Fiona alo

In [5]:
# Wrap the text in Document objects

train_texts = [item["prompts"] for item in ds['train']]
documents = [Document(page_content=f"Prompt: {text}") for text in train_texts]


In [6]:
# Split large documents into chunks

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_documents = text_splitter.split_documents(documents)

In [7]:
# Verify the number of document chunks

num_chunks = len(split_documents)
print(f"Number of document chunks: {num_chunks}")

# Print content of the chunks 
for i, doc in enumerate(split_documents):
    print(f"Chunk {i+1}:\n{doc.page_content}\n{'-'*50}")


Number of document chunks: 10
Chunk 1:
Prompt: Once upon a time, in a small village nestled on the outskirts of a mystical forest, there lived a poor but content farmer named Ethan. He had a modest cottage and a small plot of land where he grew vegetables to sustain himself. Despite the hardships that came his way, he always wore a smile and greeted everyone with warmth.
--------------------------------------------------
Chunk 2:
Prompt: One sunny morning, as Ethan was tending to his crops, he heard a rustling in the bushes nearby. Curiosity piqued, he cautiously approached the sound and discovered a beautiful fox trapped in a hunter's snare. The fox looked at Ethan with pleading eyes, silently asking for help.
--------------------------------------------------
Chunk 3:
Prompt: Without a second thought, Ethan rushed over to free the fox. Using his trusted pocket knife, he carefully cut through the tangled mess until the fox was liberated. Grateful for being saved, the fox introduced he

In [8]:
# Embed the documents and initialize Chroma vector store

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = Chroma(embedding_function=embedding_model, persist_directory="./vector_base")
vector_store.add_documents(split_documents)
vector_store.persist()

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  vector_store = Chroma(embedding_function=embedding_model, persist_directory="./vector_base")
  vector_store.persist()


In [10]:
# Check the number of documents stored in the vector store

stored_embeddings = vector_store._collection.count()
print(f"Number of embeddings in the vector store: {stored_embeddings}")


Number of embeddings in the vector store: 10


In [44]:
# Choose the the number of documents you want to retrieve based on your query

def retrieve_documents(query, num_docs=2): # num_docs specifies how many results to return

    retrieved_docs = vector_store.similarity_search(query)  
    retrieved_texts = [doc.page_content for doc in retrieved_docs[:num_docs]]
    print("Retrieved Documents:", retrieved_texts)  
    return retrieved_texts

In [47]:
def ask_query(query):

    retrieved_docs = retrieve_documents(query)
    combined_input = (
        f"Answer the question based on the context provided.\n\n"
        f"Question: {query}\n"
        f"Context:\n"
        + "\n".join(retrieved_docs)  
    )

    response = client.text_generation(combined_input, max_new_tokens=300, temperature=0.7)
    return response

  

In [48]:
# Modify this query to ask different questions

query = "Who is Ethan"
response = ask_query(query)
print(response)

Retrieved Documents: ['Prompt: Overwhelmed with joy, Fiona thanked Ethan for his unwavering support and promised to always cherish their friendship. With newfound confidence and a sense of purpose, she bid farewell to Ethan, disappearing into the embrace of her family.', 'Prompt: Ethan returned to his humble cottage with a heart full of memories and a smile that shone brighter than ever before. He knew that the adventure he embarked upon with Fiona had transformed his perspective on life. The lost forest had gifted him with more than he could have ever imagined – courage, friendship, and a bond that transcended boundaries.']
 Ethan was grateful for the lessons he had learned from Fiona and would always cherish their friendship.
