In [1]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [2]:
# imports for langchain

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [3]:
# price is a factor for our company, so we're going to use a low cost model

MODEL = "gpt-4o-mini"
db_name = "vector_db"

In [4]:
# Load environment variables in a file called .env

load_dotenv()
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [5]:
# Read in documents using LangChain's loaders
# Take everything in all the sub-folders of our knowledgebase

folders = glob.glob("DONT LOOK/*")

# With thanks to CG and Jon R, students on the course, for this fix needed for some users 
text_loader_kwargs = {'encoding': 'utf-8'}
# If that doesn't work, some Windows users might need to uncomment the next line instead
# text_loader_kwargs={'autodetect_encoding': True}

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

In [6]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

In [7]:
doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)
print(f"Document types found: {', '.join(doc_types)}")

Document types found: TOP SECRET DEFENCE STRATEGY


In [8]:
# Put the chunks of data into a Vector Store that associates a Vector Embedding with each chunk
# Chroma is a popular open source Vector Database based on SQLLite

embeddings = OpenAIEmbeddings()

# Delete if already exists

if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Create vectorstore

vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 1 documents


In [9]:
# Get one vector and find how many dimensions it has

collection = vectorstore._collection
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 1,536 dimensions


In [10]:
# create a new Chat with OpenAI
llm = ChatOpenAI(temperature=0.5, model_name=MODEL)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever()

# putting it together: set up the conversation chain with the GPT 4o-mini LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)

In [11]:
data = {
    "Friend1": {
        "Wall1": "100px",
        "Tank1": "280px",
        "Enemy1": "217px",
        "Enemy2": "206px",
        "Wall2": "300px",
        "Friend2": "200px"
    }
}
query=""

for key, value in data["Friend1"].items():
    query+=(f"- The distance from friend1 to {key} is {value}.")
query+="What is the next best move?"
result = conversation_chain.invoke({"question":query, "chat_history": []})
print(result["answer"])

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


Since the distance from friend1 to Wall1 is 100px (which is less than 80px), friend1 should use Wall1 as cover, peeking only when necessary to fire or scout. Additionally, since the tank is 280px away (more than 200px), friend1 should avoid direct confrontation and can use Wall1 for protection while being aware of the enemies at a greater distance.


In [12]:
query

'- The distance from friend1 to Wall1 is 100px.- The distance from friend1 to Tank1 is 280px.- The distance from friend1 to Enemy1 is 217px.- The distance from friend1 to Enemy2 is 206px.- The distance from friend1 to Wall2 is 300px.- The distance from friend1 to Friend2 is 200px.What is the next best move?'