In [1]:
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pymongo import MongoClient



In [3]:
# Initialize the MongoDB client
client = MongoClient('mongodb+srv://jana:jr12345@cluster0.2hzth74.mongodb.net/?retryWrites=true&w=majority')
db = client['PSUTBOT']

In [4]:
collection_text_fields = {
    'Clubs Information': ['Club Name', 'Club Description'],
    'Events': ['Event Name', 'Event Description'],
    'Masters Programs': ['Master Program Name', 'Master Pogram Description'],
    'Staff Information': ['Name', 'Position', 'Telephone', 'Email'],
    'Bus Schedule': ['route', 'round'], 
    'Office Hours': ['doctor_name', 'office_hours'],  
    'FAQ': ['Question', 'Answer'],
    'Departments': ['Department Name'],
    'Programs' : ['Program Name', 'Program Description'],
    'Academic Calendar': ['Academic Calendar Information'],
    'study plan': ['major','study plan link']
}

In [5]:
# Define the Document class (if not defined already)
class Document:
    def __init__(self, page_content, metadata, embedding):
        self.page_content = page_content
        self.metadata = metadata
        self.embedding = embedding

    def get_text(self):
        return self.page_content

In [6]:


openai_api_key = ""
openai_embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)


In [7]:
# Fetch documents (text) from MongoDB collection
documents = []
for collection_name, text_fields in collection_text_fields.items():
    for db_document in db[collection_name].find():
        text_parts = [str(db_document.get(field, "")) for field in text_fields]  # Ensure each element is a string
        text = " ".join(text_parts)  # Combine text from multiple fields
        documents.append(Document(text, {}, None))  # Create Document object with text and empty metadata

# Split documents into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(documents)

# Extract text from documents
document_texts = [doc.get_text() for doc in documents]

# Embed documents using OpenAIEmbeddings
embeddings = openai_embeddings.embed_documents(document_texts)

# Append embeddings to the documents
for doc, embedding in zip(documents, embeddings):
    doc.embedding = embedding


In [8]:
# Create vectorstore from split documents
vectorstore = Chroma.from_documents(documents=all_splits, embedding=openai_embeddings)

# Create Conversational Retrieval Chain
openai_instance = OpenAI(temperature=0.7, openai_api_key=openai_api_key)
qa = ConversationalRetrievalChain.from_llm(openai_instance, vectorstore.as_retriever())

In [9]:
db2 = Chroma.from_documents(documents=all_splits, embedding=openai_embeddings, persist_directory="./chroma_db")

In [5]:
db3 = Chroma(persist_directory="./chroma_db", embedding_function=openai_embeddings)

In [9]:
docs = db3.similarity_search("what are dr serin atiani's office hours", k=5)
print(docs[0].page_content)

Dr. Serin Atiani Assistant professor  +96265359949 Ex: 5310   s.atiani@psut.edu.jo


In [10]:
print(docs)

[Document(page_content='Dr. Serin Atiani Assistant professor  +96265359949 Ex: 5310   s.atiani@psut.edu.jo'), Document(page_content='Dr. Haitham AL-ani Assistant professor  (06) 535 9949 Ext.5553   h.ani@psut.edu.jo'), Document(page_content='Dr. Hani Ahmad Associate  professor  (06) 535 9949 Ext. 5505 / 5536   h.ahmad@psut.edu.jo'), Document(page_content='Dr. Awos Kanan Head of Computer Engineering DepartmentAssistant professor  (06) 535 9949 Ext.5551   a.kanan@psut.edu.jo'), Document(page_content='Dr. Ahmad A. Tawayha Associate professor  (06) 535 9949 Ext.5523   atawayha@psut.edu.jo')]
