In [12]:
import urllib.parse
import logging
from pymongo import MongoClient
from dotenv import load_dotenv
import os
import google.generativeai as genai
from langchain_google_vertexai import VertexAI
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
import re
import base64
from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document
from PyPDF2 import PdfReader
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_cohere import CohereEmbeddings
from io import BytesIO

# Run before any os.getenv() lookup: the DB credentials and API keys read in the
# following cells are expected to be supplied through a local .env file.
load_dotenv()


True

In [3]:

# Keep the HTTP client and MongoDB driver loggers at INFO so their
# DEBUG-level records do not flood the cell output.
logging.getLogger("httpx").setLevel(logging.INFO)
logging.getLogger("pymongo").setLevel(logging.INFO)

# Connection settings are read from the environment.
username = os.getenv('DB_USERNAME')
password = os.getenv('DB_PASSWORD')
db_name=os.getenv('DB_NAME')

# Fail early with a readable message: os.getenv returns None for an unset
# variable, and quote_plus(None) would otherwise raise an opaque TypeError.
_missing_settings = [
    env_name
    for env_name, env_value in (
        ('DB_USERNAME', username),
        ('DB_PASSWORD', password),
        ('DB_NAME', db_name),
    )
    if not env_value
]
if _missing_settings:
    raise RuntimeError(
        "Missing or empty environment variable(s): " + ", ".join(_missing_settings)
    )

# Credentials may contain characters that are reserved in a URI, so they are
# percent-encoded before being embedded in the connection string.
encoded_username = urllib.parse.quote_plus(username)
encoded_password = urllib.parse.quote_plus(password)

# MongoDB Atlas connection details
mongo_uri = f"mongodb+srv://{encoded_username}:{encoded_password}@cluster0.w7vw3w2.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"

# Initialize the MongoDB client
client = MongoClient(mongo_uri)
db = client[db_name]

# Listing the collections is the first call here that needs an answer from the
# server, so it runs before the success banner: a bad URI or bad credentials
# now fail without first printing a misleading "initialized successfully".
collection_names = db.list_collection_names()
print("MongoDB initialized successfully!")

# Print the collection names
print("Collections in the database:")
for collection_name in collection_names:
    print(collection_name)

MongoDB initialized successfully!
Collections in the database:
semester_6
semester_5
semester_1
semester_4
semester_2
semester_3


In [4]:
# --- API credentials, read from the environment ---
COH_API_KEY = os.getenv('COHERE_API_KEY')
google_api_key = os.getenv("GOOGLE_API_KEY2")

# --- Gemini through the google.generativeai SDK ---
genai.configure(api_key=google_api_key)
model = genai.GenerativeModel("gemini-1.5-pro")

# --- Gemini through the LangChain Vertex AI wrapper ---
# NOTE(review): get_conversational_chain() constructs its own VertexAI instance,
# so none of the visible cells read this module-level `llm` (or `model`).
llm = VertexAI(model_name='gemini-1.5-flash')

In [5]:
class BinaryDataLoader(BaseLoader):
    """Wrap an in-memory binary file as a single LangChain ``Document``.

    The bytes are carried as base64 text in ``page_content`` so that a consumer
    can decode them back to the original binary later; the file name is kept in
    the document metadata under ``"source"``.
    """

    def __init__(self, binary_data: bytes, file_name: str):
        """Remember the raw bytes and the name they should be attributed to."""
        self.file_name = file_name
        self.binary_data = binary_data

    def load(self) -> Document:
        """Return one ``Document`` holding the base64-encoded bytes.

        NOTE(review): this returns a single Document rather than a list; the
        callers in this notebook append the result to their own list.
        """
        return Document(
            page_content=base64.b64encode(self.binary_data).decode('utf-8'),
            metadata={"source": self.file_name},
        )

In [6]:
def parse_query(query):
    """Pull the semester number and subject out of a natural-language query.

    The query has to contain the phrase ``about <subject> from semester <digits>``
    (case-insensitive), for example
    "Tell me about Quantum Mechanics from semester 3".

    Returns:
        A ``(semester, subject)`` tuple of whitespace-stripped strings, or
        ``(None, None)`` when the phrase is not present.
    """
    found = re.search(r'about (.+?) from semester (\d+)', query, re.IGNORECASE)
    if found is None:
        return None, None

    # Groups come back in pattern order (subject first), but the function
    # reports the semester first.
    subject_text, semester_text = (part.strip() for part in found.groups())
    return semester_text, subject_text

def generate_mongo_query(semester, subject):
    """Build the collection name and filter used to find a subject's files.

    Args:
        semester: Semester identifier; it is appended to ``"semester_"`` to form
            the collection name.
        subject: Free-text subject name taken from the user's query.

    Returns:
        ``(collection_name, mongo_query)`` where ``mongo_query`` matches documents
        whose ``file_name`` contains ``subject`` as a case-insensitive substring.
    """
    collection_name = f"semester_{semester}"
    mongo_query = {
        # The subject is user text, not a pattern: escape it so characters such
        # as "+", "." or "(" are matched literally instead of being interpreted
        # as regex syntax (e.g. "C++" would otherwise be an invalid pattern).
        "file_name": {"$regex": re.escape(subject), "$options": "i"}
    }
    return collection_name, mongo_query

def process_query(query):
    """Resolve a natural-language query to the stored files it refers to.

    The query is parsed into a semester and subject, the matching semester
    collection of the module-level ``db`` is searched by file name, and every
    hit is wrapped in a ``Document`` via ``BinaryDataLoader``.

    Returns:
        A list of Documents on success. When the query cannot be parsed or no
        file matches, a human-readable message string is returned instead, so
        callers distinguish the two cases with ``isinstance(result, list)``.
    """
    semester, subject = parse_query(query)
    if not (semester and subject):
        return "Could not parse the query. Please ensure it contains both a semester number and a subject name."

    collection_name, mongo_query = generate_mongo_query(semester, subject)

    # Only the two fields used below are requested from the server.
    projection = {"file_name": 1, "file_data": 1}
    matches = list(db[collection_name].find(mongo_query, projection))
    if not matches:
        return f"No documents found for subject '{subject}' in semester {semester}."

    relevant_info = []
    for record in matches:
        source_name = record.get('file_name', 'Unknown')
        document = BinaryDataLoader(record.get('file_data', b''), source_name).load()

        # Progress/debug output; the reported length is that of the base64 text.
        print(f"Loaded document from {source_name} with content length {len(document.page_content)} bytes")

        relevant_info.append(document)

    return relevant_info


In [7]:

def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every given PDF.

    Args:
        pdf_docs: Iterable of Documents whose ``page_content`` is a
            base64-encoded PDF file.

    Returns:
        One string with the page texts joined in document and page order, with
        no separator; an empty string when ``pdf_docs`` is empty.
    """
    page_texts = []
    for pdf_doc in pdf_docs:
        # Undo the base64 wrapping and hand the raw bytes to the PDF parser.
        raw_pdf = base64.b64decode(pdf_doc.page_content)
        reader = PdfReader(BytesIO(raw_pdf))
        page_texts.extend(page.extract_text() for page in reader.pages)
    return "".join(page_texts)


def get_text_chunks(text):
    """Split ``text`` into overlapping chunks for embedding.

    Uses a ``RecursiveCharacterTextSplitter`` configured with a chunk size of
    10000 and an overlap of 1000, and returns whatever ``split_text`` yields.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_overlap=1000, chunk_size=10000)
    return splitter.split_text(text)

def get_vector_store(text_chunks):
    """Embed the chunks, persist a FAISS index, and return the store.

    Args:
        text_chunks: Non-empty list of text chunks to index.

    Returns:
        The FAISS vector store that was built. It is also saved to the local
        ``"faiss_index"`` directory, which is where ``user_input`` loads it from.

    Raises:
        ValueError: If ``text_chunks`` is empty, i.e. no text was extracted and
            there is nothing to build an index from.
    """
    # Checked before any embedding request is made, so an empty extraction is
    # reported clearly instead of failing somewhere inside the embedding/FAISS code.
    if not text_chunks:
        raise ValueError(
            "text_chunks is empty: no text was extracted, so there is nothing to index."
        )

    embeddings = CohereEmbeddings(model="embed-english-light-v3.0", cohere_api_key=COH_API_KEY)
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    vector_store.save_local("faiss_index")
    # Previously nothing was returned, so callers that assigned the result got None.
    return vector_store

def get_conversational_chain():
    """Create the LLM chain that answers a question from supplied context.

    The chain pairs a Vertex AI ``gemini-1.5-flash`` model with a prompt that
    takes two inputs, ``context`` and ``question``, and instructs the model to
    answer only from the context.

    Returns:
        An ``LLMChain`` ready to be invoked with those two keys.
    """
    # The template is sent to the model as-is, so its text is kept verbatim.
    qa_template = """
    Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
    provided context just say, "answer is not available in the context", don't provide the wrong answer\n\n
    Context:\n {context}?\n
    Question: \n{question}\n
    Answer:
    """
    vertex_llm = VertexAI(model_name='gemini-1.5-flash')
    return LLMChain(
        llm=vertex_llm,
        prompt=PromptTemplate(template=qa_template, input_variables=["context", "question"]),
    )

In [8]:
def user_input(user_question):
    """Answer ``user_question`` from the FAISS index saved on disk.

    Loads the ``"faiss_index"`` directory, retrieves the chunks most similar to
    the question, joins them into one context string, and runs the
    question-answering chain on it. The raw chain response is printed.

    Returns:
        The ``"text"`` entry of the chain response, or the fallback string
        ``"No output text found"`` when that key is absent.
    """
    embedder = CohereEmbeddings(model="embed-english-light-v3.0",cohere_api_key=COH_API_KEY)

    # NOTE(review): allow_dangerous_deserialization=True permits unpickling the
    # stored index; only point this at index files this notebook wrote itself.
    index = FAISS.load_local("faiss_index", embedder, allow_dangerous_deserialization=True)

    hits = index.similarity_search(user_question)
    context = "\n".join(hit.page_content for hit in hits)

    response = get_conversational_chain().invoke(
        {"context": context, "question": user_question},
        return_only_outputs=True,
    )
    print("\nResponse:", response)

    # Adjust the key if the chain's response structure changes.
    return response.get("text", "No output text found")

# End-to-end demo: find the semester 3 "ratio" material, index it, ask a question.
query = "Tell me about ratio from semester 3"
relevant_info = process_query(query)

# process_query hands back a list of Documents on success and a message string
# otherwise, so the failure message is handled first.
if not isinstance(relevant_info, list):
    print(relevant_info)
else:
    pdf_text = get_pdf_text(relevant_info)
    text_chunks = get_text_chunks(pdf_text)
    get_vector_store(text_chunks)
    user_question = "How to Divide a given number in the given ratio?"
    response = user_input(user_question)
    print("\nReply: ", response)

Loaded document from UNIT 4 ratio.pdf with content length 1166576 bytes

Response: {'text': 'Let ‘A’ be the given number. The given ratio is a1:a2\nHere ‘A’ is to be divided in the ratio a1: a2.\nIt implies that A is divided in two parts such that value of the first part: value of the second part = a1: a2.\nTherefore, \nFirst part = (a1/ a1+ a2)*A = first term of ratio *(sum of parts / sum of terms of ratio )\nSecond part = (a2/ a1+ a2)*A \n= Second term of ratio *(sum of parts / sum of terms of ratio) \nSince, A has been divided into two parts, so, first part + second part = A. \n'}

Reply:  Let ‘A’ be the given number. The given ratio is a1:a2
Here ‘A’ is to be divided in the ratio a1: a2.
It implies that A is divided in two parts such that value of the first part: value of the second part = a1: a2.
Therefore, 
First part = (a1/ a1+ a2)*A = first term of ratio *(sum of parts / sum of terms of ratio )
Second part = (a2/ a1+ a2)*A 
= Second term of ratio *(sum of parts / sum of terms o

In [9]:
def query_and_ask(query, user_question):
    """Look up the files named by ``query`` and answer ``user_question`` from them.

    Args:
        query: Natural-language lookup of the form understood by
            ``process_query`` (e.g. "Tell me about devops from semester 1").
        user_question: The question to answer from the retrieved material.

    Returns:
        None. The reply, or the lookup failure message, is printed.
    """
    relevant_info = process_query(query)

    # A non-list result is the failure message from process_query.
    if not isinstance(relevant_info, list):
        print(relevant_info)
        return

    pdf_text = get_pdf_text(relevant_info)
    text_chunks = get_text_chunks(pdf_text)

    # The index is persisted to disk here and reloaded from disk by user_input,
    # so the call's return value is not needed (the old unused local is dropped).
    get_vector_store(text_chunks)

    response = user_input(user_question)
    print("\nReply: ", response)

# Example: ask about cloud models using the semester 6 "Cloud computing" files.
query = "Tell me about Cloud computing from semester 6"
user_question = "What are the various cloud models?"
query_and_ask(query=query, user_question=user_question)

Loaded document from Cloud Computing 2.pdf with content length 784528 bytes
Loaded document from Cloud Computing.pdf with content length 3053528 bytes

Response: {'text': 'The various cloud models are:\n\n* **Public Cloud:** Resources and services are owned and operated by a cloud service provider (CSP) and made available to the general public over the internet. Infrastructure, platforms, and applications are shared among multiple organizations or users.\n* **Private Cloud:** Cloud resources are used exclusively by a single organization. The infrastructure and services can be owned, managed, and operated by the organization itself (on-premises private cloud) or by a third-party service provider (hosted private cloud).\n* **Community Cloud:** A shared cloud infrastructure and services are tailored to meet the specific requirements of a particular community of organizations. The community may have shared interests, compliance needs, or security concerns. Multiple organizations with simil

In [10]:
# Example: summarise DevOps goals using the semester 1 "devops" files.
query = "Tell me about devops from semester 1"
user_question = "Summarize the keygoals and benefits of devops"
query_and_ask(query=query, user_question=user_question)

Loaded document from Unit 1 DevOps.pdf with content length 1931048 bytes

Response: {'text': '## Key Goals of DevOps:\n\n1. **Effective Collaboration:** DevOps emphasizes shared ownership and streamlined collaboration between development, testing, and deployment teams. This breaks down barriers and accelerates development.\n2. **Scalable Infrastructure:** DevOps aims to create sustainable, highly scalable infrastructure for applications. This ensures apps can handle large traffic volumes and provide a great user experience, adapting to changing demands.\n3. **On-Demand Release Capabilities:** Continuous delivery is crucial, allowing companies to release new features and updates quickly and efficiently. DevOps automates release management for consistency, speed, and predictability.\n4. **Faster Feedback:** Automating tasks like testing and reporting accelerates feedback loops, allowing developers to understand the impact of changes and quickly roll out updates. This improves decision-ma

In [11]:
# Follow-up question on the same material: `query` is not reassigned here, so
# this reuses the DevOps lookup defined in the previous cell.
qn = "Agile vs Devops"
query_and_ask(query, user_question=qn)

Loaded document from Unit 1 DevOps.pdf with content length 1931048 bytes

Response: {'text': '## Agile vs DevOps\n\nThis document provides a detailed comparison between Agile and DevOps methodologies:\n\n**Agile:**\n\n* **Focus:** Agile is a software development methodology emphasizing iterative development and continuous improvement through collaboration between self-organizing, cross-functional teams and customers.\n* **Origin:** Invented in 2001 by John Kern and Martin Fowler.\n* **Application:** Agile is a method for creating software.\n* **Approach:** An advancement and administration approach focused on consistent changes.\n* **Best Practices:** \n    * Backlog Building\n    * Sprint advancement\n* **Team Structure:** All team members have a wide range of similar skill sets.\n* **Team Size:** Smaller teams are preferred to deliver with fewer complexities.\n* **Time Management:** Short, predetermined timeframes (sprints), typically a week long. \n* **Automation:** Not a primary fo