In [1]:
print("Hi")

Hi


In [2]:
%pwd

'c:\\Users\\PC\\OneDrive - KU Leuven\\Project\\Search-Engine\\research'

In [3]:
import os
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\PC\\OneDrive - KU Leuven\\Project\\Search-Engine'

In [5]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [6]:
#Extract Data 
def load_pdf_file(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader 
    )
    documents = loader.load()
    return documents

In [7]:
extracted_data = load_pdf_file(data="Data/")


In [9]:
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [10]:
text_chunks = text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 411


In [12]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

In [23]:
#Download Embedding Model from Hugging Face
def download_hugging_face_embeddings():
    embeddings = HuggingFaceBgeEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings


In [24]:
embeddings = download_hugging_face_embeddings()

In [25]:
print(embeddings)

client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
) model_name='sentence-transformers/all-MiniLM-L6-v2' cache_folder=None model_kwargs={} encode_kwargs={} query_instruction='Represent this question for searching relevant passages: ' embed_instruction='' show_progress=False


In [26]:
querry_result = embeddings.embed_query("Hello World")
print("Length", len(querry_result))

Length 384


Query Result

In [29]:
from dotenv import load_dotenv
load_dotenv()

True

In [30]:
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

In [33]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "chatbot"

pc.create_index(
    name=index_name,
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

{
    "name": "chatbot",
    "metric": "cosine",
    "host": "chatbot-i3qy2zb.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [34]:
import os
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [None]:
#Embed each chunk and upsert the embeddings into Pineconde index
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings
)

In [None]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x25fbcfffa50>

In [None]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [40]:
retriever_docs = retriever.invoke("What is the role of the human resource function")

In [41]:
retriever_docs

[Document(id='ce45d552-ee1c-4733-981a-ec9890f91959', metadata={'author': 'Jerome Morrissey', 'creationdate': '2018-06-21T11:54:40+03:00', 'creator': 'Microsoft® Word 2016', 'moddate': '2018-06-21T12:00:45+03:00', 'page': 4.0, 'page_label': '5', 'producer': 'Microsoft® Word 2016', 'source': 'Data\\HUMAN_RESOURCE_POLICIES_-_GESCI__June_2018.pdf', 'total_pages': 77.0}, page_content='4. The Human Resource Function \nThe human resource functions shall be managed, on a day-to-day basis by the manager: \nFinance and Operations or another person so designated by the CEO. Such a person will \nalso be known as the Human Resource (HR) Manager. Some human resource functions are \nalso carried out by Managers as, from time to time, delegated to them by the CEO. The CEO \nretains overall responsibility over the human resource functions. \n \n5. The role of the human resource function'),
 Document(id='ce269d3a-b3ff-4bae-ad71-46735597fa37', metadata={'author': 'Jerome Morrissey', 'creationdate': '2018

In [42]:
from langchain_ollama import ChatOllama
llm = ChatOllama(model="gemma3:4b")

In [43]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are an assistant for question-answering tasks."
    "Use the following pieces of retrieved context to answer the question."
    "Use three sentences maximum and keep the answer concise."
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}")
    ]
)

In [44]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [47]:
response = rag_chain.invoke({"input": "What is the role of the human resource function"})
print(response["answer"])

The human resource function is responsible for initiating and developing HR policies and procedures for GESCI. It also facilitates recruitment, hiring, and establishing the organization’s structure, all while monitored by the CEO. The HR Manager, designated by the CEO, oversees these activities.


In [48]:
response = rag_chain.invoke({"input": "What is the car"})
print(response["answer"])

I’m sorry, but the provided context does not contain any information about a “car.” It discusses inventions, travel proposals, and data protection measures within GESCI.
