In [8]:
from langchain_community.document_loaders import PyPDFLoader

In [12]:
# Point the PDF loader at the employee handbook and read it into `pages`.
HANDBOOK_PDF = "Employee_Handbook.pdf"
loader = PyPDFLoader(HANDBOOK_PDF)
pages = loader.load_and_split()

In [13]:
# Re-read the handbook and drop the first four entries, which are not required.
pages = loader.load_and_split()[4:]
# Concatenate the remaining page texts, separated by newlines.
text = "\n".join(doc.page_content for doc in pages)

In [15]:
# Split the text into smaller chunks to make it easier to handle in the chatbot.
# RecursiveCharacterTextSplitter splits the text into chunks of up to 500 characters each, with an overlap of 150 characters between consecutive chunks to ensure continuity.
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Character-based splitter: chunk size 500, overlap 150, plain (non-regex) separators.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=150, length_function=len, is_separator_regex=False
)
docs = text_splitter.create_documents([text])
# Replace each chunk's metadata with its position in the chunk list.
for i, d in enumerate(docs):
    d.metadata = {"doc_id": i}

In [17]:
import google.generativeai as genai
import pandas as pd
import os
from dotenv import load_dotenv

load_dotenv()

True

In [29]:
# Read the Gemini API key from the environment (populated by load_dotenv)
# and hand it to the google.generativeai client.
api_key = os.environ.get('GEMINI_API_KEY')
genai.configure(api_key=api_key)

In [30]:
# Generating Embeddings
def get_embeddings(text, task_type="retrieval_document", model='models/embedding-001'):
    """Return the embedding vector for a piece of text.

    Args:
        text: The content to embed (a sentence or a document chunk).
        task_type: Task type forwarded to the embedding API. Defaults to
            "retrieval_document", which is what this function always used;
            pass "retrieval_query" when embedding a user's search query.
        model: Name of the embedding model to call.

    Returns:
        The value stored under the 'embedding' key of the API response.
    """
    response = genai.embed_content(model=model,
                                   content=text,
                                   task_type=task_type)
    return response['embedding']

# Get the page_content from the documents and create a new list
content_list = [doc.page_content for doc in docs]
# Embed one chunk at a time (one API call per chunk)
embeddings = [get_embeddings(content) for content in content_list]

# Create a dataframe to ingest into the database.
# The target table is keyed and ordered by `id`, so give every chunk an
# explicit sequential id instead of letting all rows fall back to the
# column default.
dataframe = pd.DataFrame({
    'id': range(len(content_list)),
    'page_content': content_list,
    'embeddings': embeddings
})

I0000 00:00:1724280654.591679 74289931 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


In [31]:
# storing data in MyScaleDB
import clickhouse_connect

# Connection settings are read from the environment (e.g. the same .env file
# loaded by load_dotenv) so that no credentials live in the notebook.
# NOTE(review): the password previously committed here should be rotated.
client = clickhouse_connect.get_client(
    host=os.environ['MYSCALE_HOST'],
    port=int(os.environ.get('MYSCALE_PORT', '443')),
    username=os.environ['MYSCALE_USERNAME'],
    password=os.environ['MYSCALE_PASSWORD']
)

In [32]:
# Create a table and insert the data
# Create a table with the name 'handbook'
# NOTE(review): this statement has no IF NOT EXISTS, so re-running the cell
# against an existing table will raise; drop the table first when re-ingesting.
client.command("""
    CREATE TABLE default.handbook (
        id Int64,
        page_content String,
        embeddings Array(Float32),
        CONSTRAINT check_data_length CHECK length(embeddings) = 768
    ) ENGINE = MergeTree()
    ORDER BY id
""")

# The CONSTRAINT will ensure that the length of each embedding vector is 768

# Insert the data in batches
batch_size = 10
# Ceiling division: a trailing partial batch must be inserted too. Plain
# floor division silently dropped the last len(dataframe) % batch_size rows.
num_batches = (len(dataframe) + batch_size - 1) // batch_size
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = start_idx + batch_size
    # Positional slice; the final batch may be shorter than batch_size
    batch_data = dataframe.iloc[start_idx:end_idx]
    # Insert the data
    client.insert("default.handbook", batch_data.to_records(index=False).tolist(), column_names=batch_data.columns.tolist())
    print(f"Batch {i+1}/{num_batches} inserted.")
# Create a vector index for a quick retrieval of data
client.command("""
ALTER TABLE default.handbook
    ADD VECTOR INDEX vector_index embeddings
    TYPE MSTG
""")

Batch 1/27 inserted.
Batch 2/27 inserted.
Batch 3/27 inserted.
Batch 4/27 inserted.
Batch 5/27 inserted.
Batch 6/27 inserted.
Batch 7/27 inserted.
Batch 8/27 inserted.
Batch 9/27 inserted.
Batch 10/27 inserted.
Batch 11/27 inserted.
Batch 12/27 inserted.
Batch 13/27 inserted.
Batch 14/27 inserted.
Batch 15/27 inserted.
Batch 16/27 inserted.
Batch 17/27 inserted.
Batch 18/27 inserted.
Batch 19/27 inserted.
Batch 20/27 inserted.
Batch 21/27 inserted.
Batch 22/27 inserted.
Batch 23/27 inserted.
Batch 24/27 inserted.
Batch 25/27 inserted.
Batch 26/27 inserted.
Batch 27/27 inserted.


['0', 'chi-msc-fa078581-msc-fa078581-0-0', 'OK', '0', '0']

In [34]:
# retrieving relevant docs
def get_relevant_docs(user_query, top_k=3):
    """Return the page_content of the stored chunks closest to the query.

    Args:
        user_query: The user's question as plain text.
        top_k: Maximum number of chunks to return. Defaults to 3, the value
            that used to be hard-coded in the SQL.

    Returns:
        A list of page_content strings ordered by increasing distance.
    """
    # Convert the user query into a vector embedding
    query_embeddings = get_embeddings(user_query)
    # Coerce to int so only a number can ever be interpolated into the SQL
    limit = int(top_k)
    results = client.query(f"""
        SELECT page_content,
        distance(embeddings, {query_embeddings}) as dist FROM default.handbook ORDER BY dist LIMIT {limit}
    """)
    return [row['page_content'] for row in results.named_results()]

In [36]:
# generating a response
# using the retrieved docs to generate a response to the user's query
def make_rag_prompt(query, relevant_passage):
    """Build the prompt sent to the chat model.

    Args:
        query: The user's question.
        relevant_passage: Either a single passage string or an iterable of
            passage strings; an iterable is joined with single spaces.

    Returns:
        The full prompt text, ending with "ANSWER:".
    """
    # A plain string is itself iterable: joining it would insert a space
    # between every character, so only join real collections of passages.
    if not isinstance(relevant_passage, str):
        relevant_passage = ' '.join(relevant_passage)
    prompt = (
        "You are a helpful and informative chatbot that answers questions using text from the reference passage included below. "
        "Respond in a complete sentence and make sure that your response is easy to understand for everyone. "
        "Maintain a friendly and conversational tone. If the passage is irrelevant, feel free to ignore it.\n\n"
        f"QUESTION: '{query}'\n"
        f"PASSAGE: '{relevant_passage}'\n\n"
        "ANSWER:"
    )
    return prompt

import google.generativeai as genai

def generate_response(user_prompt):
    """Send the prompt to the 'gemini-pro' model and return the reply text."""
    gemini = genai.GenerativeModel('gemini-pro')
    return gemini.generate_content(user_prompt).text

def generate_answer(query):
    """Answer a question from the stored handbook chunks.

    Retrieves the closest chunks for the query, builds the prompt from them
    and returns the model's reply text.
    """
    relevant_text = get_relevant_docs(query)
    # make_rag_prompt joins the passages itself, so no pre-joining is needed
    prompt = make_rag_prompt(query, relevant_passage=relevant_text)
    return generate_response(prompt)
# Example: ask one question end to end and show the model's reply.
answer = generate_answer(
    query="what are the office working hours?"
)
print(answer)

Unfortunately, I cannot answer that question because the office working hours are not mentioned in the provided passage.
