In [2]:
!pip install -q chromadb langchain sentence-transformers langchain-google-genai

In [4]:
# !pip show chromadb

In [5]:
!wget -q https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip

In [6]:
!unzip -q new_articles.zip -d new_articles

In [7]:
import os
from getpass import getpass

# Never hardcode the Gemini/Google API key in the notebook. Reuse a key that
# is already present in the environment and prompt (without echoing) only
# when it is missing, so a blank value never overwrites a configured key.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")


In [9]:
# !pip install -U langchain-community

In [10]:
# 3. Load documents from directory (text files)
from langchain.document_loaders import DirectoryLoader, TextLoader

In [11]:
### Load the data
# Match the .txt files directly inside the extracted folder and read each one
# with TextLoader.
loader = DirectoryLoader(
    "/content/new_articles/",
    loader_cls=TextLoader,
    glob="./*.txt",
)

In [13]:
# Run the loader configured above; the result is handed to the text splitter
# further down via `split_documents(document)`.
document = loader.load()

In [15]:
# document

In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [17]:
# Split the loaded documents into chunks of at most 1000 characters, with a
# 200-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(document)
print(f"Split into {len(chunks)} chunks")

Split into 233 chunks


In [18]:
# 5. Create embeddings using a Hugging Face model (open-source)
from sentence_transformers import SentenceTransformer
from langchain.embeddings.base import Embeddings  # base class
import numpy as np

In [19]:
class HFEmbeddings(Embeddings):
    """LangChain `Embeddings` implementation backed by a SentenceTransformer model."""

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        """Load the SentenceTransformer model identified by `model_name`."""
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Encode `texts` and return one list of floats per input text."""
        vectors = self.model.encode(texts, convert_to_numpy=True)
        return vectors.tolist()

    def embed_query(self, text):
        """Encode a single query string and return its vector as a list of floats."""
        # A query is embedded as a one-element batch; unwrap the only row.
        (vector,) = self.embed_documents([text])
        return vector

In [20]:
# Instantiate the embedding wrapper defined above; its constructor loads the
# SentenceTransformer model named by `model_name`.
hf_embed = HFEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [21]:
# 6. Build vector store using Chroma
from langchain.vectorstores import Chroma

persist_directory = "chroma_db"

# Embed every chunk and store the collection under `persist_directory`.
# No explicit `vectordb.persist()` call is made: that method is deprecated
# (it emitted a deprecation warning here) because Chroma 0.4+ writes to disk
# automatically when a persist directory is configured.
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=hf_embed,
    persist_directory=persist_directory
)

# Drop the in-memory handle; the store is reopened from disk afterwards.
vectordb = None

  vectordb.persist()


In [22]:
# Load back
# Reopen the persisted collection, embedding queries with the same model that
# was used to build it.
vectordb = Chroma(
    embedding_function=hf_embed,
    persist_directory=persist_directory,
)

  vectordb = Chroma(


In [23]:
# 7. Create retriever
# Wrap the vector store as a retriever configured with k=3.
retriever = vectordb.as_retriever(search_kwargs=dict(k=3))

In [24]:
# 8. Instantiate Gemini LLM via LangChain Google GenAI integration
from langchain_google_genai import ChatGoogleGenerativeAI

In [27]:
# Gemini chat model; no key is passed here because the google_api_key env var
# is already set earlier in the notebook.
llm = ChatGoogleGenerativeAI(
    max_retries=3,
    temperature=0.3,
    model="gemini-2.0-flash",
)


In [29]:
# !pip install langchain_core

In [32]:
# Check how many documents/chunks are stored.
# NOTE(review): `_collection` is a private attribute of the LangChain wrapper;
# it is used here only for ad-hoc inspection.
collection = vectordb._collection
print("Number of stored chunks:", collection.count())

# Fetch the first three records (optional) and preview each one.
data = collection.get(limit=3)
for i, doc in enumerate(data["documents"]):
    print(f"\n📄 Document {i+1}:")
    print(doc[:300])  # show only the first 300 characters


Number of stored chunks: 233

📄 Document 1:
AI startup Hugging Face and ServiceNow Research, ServiceNow’s R&D division, have released StarCoder, a free alternative to code-generating AI systems along the lines of GitHub’s Copilot.

Code-generating systems like DeepMind’s AlphaCode; Amazon’s CodeWhisperer; and OpenAI’s Codex, which powers Copi

📄 Document 2:
According to a study from the University of Cambridge, at least half of developers’ efforts are spent debugging and not actively programming, which costs the software industry an estimated $312 billion per year. But so far, only a handful of code-generating AI systems have been made freely available

📄 Document 3:
Congratulations to all the @BigCodeProject contributors that worked tirelessly over the last 6+ months to bring the vision of releasing a responsibly developed 15B parameter Code LLM to fruition. We cannot thank you enough for the collaboration & contributions to the community. https://t.co/282sCRJq


In [33]:
### Add new data to the database


from langchain_core.documents import Document

# Create a Document object (illustration only: this one is printed below but
# is NOT added to the vector store).
doc = Document(
    page_content="This is the content of my document.",
    metadata={"source": "my_file.txt", "date": "2023-10-26"}
)

# You can then work with this document, for example, printing its content and metadata
print(f"Page content: {doc.page_content}")
print(f"Metadata: {doc.metadata}")

# Example: adding a new piece of text dynamically
new_text = """
Apple Inc. announced new products and a $2 billion investment in AI research this quarter.
"""
new_doc = Document(page_content=new_text, metadata={"source": "apple_news.txt"})

# The deprecated `vectordb.persist()` call is intentionally omitted: Chroma
# 0.4+ persists automatically when the store has a persist directory.
vectordb.add_documents([new_doc])

print("✅ New document added successfully!")


Page content: This is the content of my document.
Metadata: {'source': 'my_file.txt', 'date': '2023-10-26'}
✅ New document added successfully!


In [34]:
# Recreate the retriever (same k=3 setting) after the new document was added.
retriever = vectordb.as_retriever(search_kwargs={"k": 3})


In [35]:
# --- Imports ---
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

# (Assuming you already have llm and retriever defined)
# llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.0)
# retriever = vectordb.as_retriever(search_kwargs={"k": 2})

# --- Function to Ask Questions from Chroma using Gemini ---
def ask_question_with_context(question):
    """Answer `question` with Gemini, using chunks retrieved from Chroma as context.

    Relies on the notebook-level `retriever` and `llm` objects. Prints the
    model's answer followed by the `source` metadata of every retrieved
    chunk ("unknown" when a chunk has none). Returns nothing.
    """
    # Fetch the chunks the retriever returns for this question.
    hits = retriever.invoke(question)

    # Chunk texts, separated by blank lines, form the context for the prompt.
    context = "\n\n".join(hit.page_content for hit in hits)

    prompt = f"""
    Use the provided context to answer the user's question concisely and accurately.

    Context:
    {context}

    Question:
    {question}

    Answer:
    """

    # Send the assembled prompt to the Gemini model.
    reply = llm.invoke(prompt)

    # Report the answer, then one source line per retrieved chunk.
    print("🧠 Answer:\n", reply.content)
    print("\n📄 Sources:")
    for hit in hits:
        origin = hit.metadata.get("source", "unknown")
        print("-", origin)

# --- Try it ---
# Example query; the function prints the answer and the sources of the
# retrieved chunks.
ask_question_with_context("How much money did Microsoft raise?")


🧠 Answer:
 Microsoft invested around $10 billion in OpenAI, not raised.

📄 Sources:
- /content/new_articles/05-03-chatgpt-everything-you-need-to-know-about-the-ai-powered-chatbot.txt
- /content/new_articles/05-04-microsoft-doubles-down-on-ai-with-new-bing-features.txt
- /content/new_articles/05-07-fintech-space-continues-to-be-competitive-and-drama-filled.txt


In [36]:
# Delete the Entire Collection

In [37]:
# Drop the whole collection from the store. The deprecated
# `vectordb.persist()` call is omitted: Chroma 0.4+ persists changes
# automatically, so nothing needs to be flushed manually after the deletion.
vectordb.delete_collection()
print("🗑️ Database cleared successfully!")


🗑️ Database cleared successfully!


In [38]:
import shutil

# Remove the on-disk store via `persist_directory` instead of a second
# hardcoded "chroma_db/" path, so the cleanup always targets the directory
# the store was created in. ignore_errors=True mirrors `rm -rf`: a missing
# directory is not an error.
shutil.rmtree(persist_directory, ignore_errors=True)
print("🧾 Removed Chroma database folder.")


🧾 Removed Chroma database folder.


In [39]:
# 🧠 5. Check Available Collections

# If you ever store multiple collections (e.g., for different topics):

In [40]:
from chromadb import PersistentClient

# Open a client on the persist directory.
client = PersistentClient(path=persist_directory)

# Depending on the chromadb release, list_collections() yields either
# Collection objects or plain collection-name strings; accept both so the
# cell does not fail on releases that return names only.
collection_names = [
    entry if isinstance(entry, str) else entry.name
    for entry in client.list_collections()
]
print("Available collections:", collection_names)


Available collections: []
