In [1]:
import os
import gradio as gr
from dotenv import load_dotenv
import glob
import numpy as np


In [None]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE  #algorithm for dimensionality reduction
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory     
from langchain.chains import ConversationalRetrievalChain

In [3]:
# Price is a factor for our company, so we use a low-cost Gemini model.
# MODEL must be a valid Gemini model id; it matches the id passed to the chat LLM below.

MODEL = "gemini-2.5-flash"
# Directory where the Chroma vector store is persisted
db_name = "vector_db"


In [4]:
# Load variables from a .env file, letting them override values already in the environment
load_dotenv(override=True)

# Fail fast with a clear message: os.environ only accepts strings, so assigning a
# missing key (None) would otherwise raise an opaque TypeError.
gemini_api_key = os.getenv('GEMINI_API_KEY')
if not gemini_api_key:
    raise RuntimeError("GEMINI_API_KEY is not set; add it to your .env file or environment")

os.environ['GEMINI_API_KEY'] = gemini_api_key
# The LangChain Google GenAI integration looks up GOOGLE_API_KEY, so mirror the key there
os.environ['GOOGLE_API_KEY'] = gemini_api_key

In [5]:
# Read every Markdown file in cricket_matches/ as UTF-8 text.
# Alternative when encodings vary: text_loader_kwargs = {'autodetect_encoding': True}
text_loader_kwargs = {'encoding': 'utf-8'}
loader = DirectoryLoader(
    "cricket_matches",
    glob="*.md",
    loader_cls=TextLoader,
    loader_kwargs=text_loader_kwargs,
)
documents = loader.load()

# One line per loaded file: number of characters, then the loader's metadata
for loaded_doc in documents:
    print(len(loaded_doc.page_content), loaded_doc.metadata)

26971 {'source': 'cricket_matches\\match1.md'}
27165 {'source': 'cricket_matches\\match2.md'}
26482 {'source': 'cricket_matches\\match3.md'}


In [6]:
# Split the loaded documents into overlapping chunks.
# chunk_size is a soft limit: not every chunk ends up at or below 1000 characters
# (see the warnings printed by this cell and the statistics in the next one).
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(documents)


Created a chunk of size 1304, which is longer than the specified 1000
Created a chunk of size 1530, which is longer than the specified 1000
Created a chunk of size 1137, which is longer than the specified 1000
Created a chunk of size 1062, which is longer than the specified 1000


In [7]:
# Summarise the split: how many chunks there are and how their sizes (in characters) are spread
print(len(chunks))   # total number of chunks
chunk_lengths = [len(chunk.page_content) for chunk in chunks]
print(np.mean(chunk_lengths),  np.min(chunk_lengths), np.max(chunk_lengths))  # average, min, max

122
666.4344262295082 403 1530


In [9]:
# from google import genai
# import getpass

# # client = genai.Client()

# embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")
# # embeddings.embed_query("Hello, world!")   ##how to use the embedding function


# if os.path.exists(db_name):
#     Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# # Create vectorstore
# x=[]

# for i in range(50):
#     x.append(chunks[i])


# vectorstore = Chroma.from_documents(documents=x, embedding=embeddings, persist_directory=db_name)
# print(f"Vectorstore created with {vectorstore._collection.count()} documents")

In [10]:
# Embed every chunk with Gemini and persist the vectors in a Chroma store.

import time

# Embedding client; no key is passed here, GOOGLE_API_KEY was exported in an earlier cell
embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")
# embeddings.embed_query("Hello, world!")   # example of calling the embedding function directly

# Start from a clean slate: drop the collection left behind by a previous run
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# The Gemini free tier allows only 100 requests per minute, so chunks are embedded
# in batches of 80 (safety margin) with a pause between batches.
batch_size = 80
rate_limit_pause_s = 70
total_batches = (len(chunks) + batch_size - 1) // batch_size

# Without this guard an empty `chunks` list would skip the loop entirely and the
# final print would fail with a NameError on `vectorstore`.
if total_batches == 0:
    raise ValueError("No chunks to embed; check that the documents were loaded and split")

for batch_num in range(total_batches):
    start_idx = batch_num * batch_size
    batch_chunks = chunks[start_idx:start_idx + batch_size]

    if batch_num == 0:
        # First batch - create the collection
        vectorstore = Chroma.from_documents(
            documents=batch_chunks,
            embedding=embeddings,
            persist_directory=db_name
        )
    else:
        # Subsequent batches - add to the existing collection
        vectorstore.add_documents(documents=batch_chunks)

    print(f"✅ Processed batch {batch_num + 1}/{total_batches}")

    # Rate limiting: only wait when another batch follows; sleeping after the
    # last batch just delays the notebook.
    if batch_num < total_batches - 1:
        time.sleep(rate_limit_pause_s)

print(f"Vectorstore created with {vectorstore._collection.count()} documents")

✅ Processed batch 1/2
✅ Processed batch 2/2
Vectorstore created with 122 documents


In [36]:
# Fetch a single stored vector and report its dimensionality

collection = vectorstore._collection
first_record = collection.get(limit=1, include=['embeddings'])
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")


The vectors have 3,072 dimensions


In [39]:
# Prework: pull everything out of the collection that the visualisation needs

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
# NOTE: this rebinds `documents` (previously the output of loader.load()) to the
# chunk texts returned by the collection; re-run the loader cell before re-splitting.
documents = result['documents']
doc_types = [metadata['source'] for metadata in result['metadatas']]

# Assign one colour per source file. The mapping is derived from the sources that
# are actually present instead of hard-coded Windows-style paths, so it works with
# either path separator and with any set of files (the palette cycles if there are
# more sources than colours).
palette = ['blue', 'green', 'red']
source_to_color = {
    source: palette[position % len(palette)]
    for position, source in enumerate(sorted(set(doc_types)))
}
colors = [source_to_color[t] for t in doc_types]


In [None]:
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding)

# random_state is fixed so the layout is reproducible between runs
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot: one marker per chunk, coloured by source file,
# with the source and the first 100 characters of the chunk shown on hover
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

# Axis titles are set directly on the layout: `scene` only configures 3D plots,
# so titles placed under it never show up on a 2D scatter.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [None]:
# Create the Gemini chat model that will answer questions in the RAG chain
from langchain_google_genai import ChatGoogleGenerativeAI

# max_tokens and timeout are left at the library defaults; pass them here if needed
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=1, max_retries=2)



{'chat_history': []}
chat_memory=InMemoryChatMessageHistory(messages=[]) return_messages=True memory_key='chat_history'


In [68]:
from langchain_core.callbacks import StdOutCallbackHandler

# Fresh conversation memory for this chat session, stored under the 'chat_history' key
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

# The retriever is an abstraction over the vector store used during RAG;
# it is configured with k=20 for each query
retriever = vectorstore.as_retriever(search_kwargs={"k": 20})

# Put it together: the Gemini LLM, the retriever and the memory form the conversation chain.
# The stdout callback handler is attached so the chain's activity is printed while it runs.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    callbacks=[StdOutCallbackHandler()],
)

In [71]:
# Gradio-compatible wrapper around the conversation chain

def chat(message, history):
    """Answer a single user message with the conversation chain.

    Args:
        message: The user's question, forwarded to the chain as "question".
        history: Chat history supplied by the caller; not used here because
            the conversation memory lives inside conversation_chain.

    Returns:
        The value stored under "answer" in the chain's result.
    """
    return conversation_chain.invoke({"question": message})["answer"]


# Quick check without the UI: chat("Who won the final match?", [])
# Build the Gradio chat UI around chat() and open it in the browser
chat_ui = gr.ChatInterface(chat, type="messages")
view = chat_ui.launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.
