In [38]:
# Step 1: Define Sample Documents
# Every record carries the same three keys (doc_id, section, content) so that
# code reading doc["doc_id"] works for every entry, not only the first one.
documents = [
    {"doc_id": "1", "section": "Pay Policies", "content": "Employees are paid bi-weekly via direct deposit."},
    {"doc_id": "2", "section": "Leave of Absence", "content": "Employees must submit a leave request for approval."},
    {"doc_id": "3", "section": "Internet Use", "content": "Company internet must be used for work-related tasks only."},
    {"doc_id": "4", "section": "Internet Use", "content": "Company internet is a broadband internet."},
    {"doc_id": "5", "section": "Break at Work", "content": "Employees can take an hour break."},
    {"doc_id": "6", "section": "Harassment", "content": "Interact with each employee with Respect"}
]


In [39]:
# Keep only the text of each document, in the same order as `documents`.
content_corpus = list(map(lambda record: record["content"], documents))
content_corpus

['Employees are paid bi-weekly via direct deposit.',
 'Employees must submit a leave request for approval.',
 'Company internet must be used for work-related tasks only.',
 'Company internet is a broadband internet.',
 'Employees can take an hour break.',
 'Interact with each employee with Respect']

In [40]:
%pip install -q sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [6]:
from sentence_transformers import SentenceTransformer

# Load the "all-MiniLM-L6-v2" sentence-embedding model.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Embed every document text; the recorded output of the next cell shows one
# row per document with shape (6, 384).
doc_vectors = model.encode(content_corpus)

In [41]:
# Display the document embeddings computed from content_corpus.
doc_vectors

array([[ 0.02472514, -0.00908146,  0.0388713 , ...,  0.0196564 ,
         0.04260007, -0.02707142],
       [ 0.03315507,  0.04853379,  0.04736274, ...,  0.10182011,
         0.0915928 ,  0.00358368],
       [-0.07135908, -0.03066469,  0.03183772, ..., -0.04109802,
         0.06524781, -0.00688534],
       [-0.0038374 , -0.0233675 ,  0.0295868 , ..., -0.04415294,
         0.1255909 , -0.03139856],
       [-0.01790445,  0.01495852,  0.08163831, ..., -0.03217232,
        -0.0051365 ,  0.05279535],
       [-0.00240885,  0.03361142, -0.06162645, ...,  0.04830882,
         0.0370764 , -0.01683048]], shape=(6, 384), dtype=float32)

In [42]:
import numpy as np

# The natural-language question to answer from the documents.
query = "what is pay policy"
# encode() is given a one-item list here, so element 0 is this query's vector.
query_vec = model.encode([query])[0]
query_vec

array([-9.93142575e-02,  7.83917159e-02, -1.91631932e-02, -1.35013433e-02,
        3.35368328e-02,  7.50690773e-02,  1.25523984e-01, -8.12248141e-02,
       -2.87164766e-02,  9.72976014e-02,  3.89284007e-02, -5.21367532e-04,
       -5.80282090e-03, -2.61153523e-02, -5.44103794e-03, -1.84842981e-02,
       -5.57704158e-02, -9.45026055e-04,  8.38830695e-03, -1.20196566e-02,
       -1.50373951e-02,  2.08471958e-02, -2.42591724e-02,  1.67974737e-03,
        3.31865102e-02,  1.49716083e-02,  3.65174487e-02,  6.63311481e-02,
       -7.08122626e-02,  4.37518507e-02, -3.65411956e-03, -5.33398688e-02,
       -2.53989100e-02, -4.34408449e-02, -5.87593392e-02,  1.73844658e-02,
       -6.19511716e-02, -1.00720316e-01, -1.25066623e-01,  2.36004163e-02,
       -1.74311642e-02, -2.10653879e-02, -4.01067361e-03,  1.59821510e-02,
        1.29315862e-02,  3.23585607e-02,  1.00352988e-01,  3.93128656e-02,
       -5.74018806e-02,  1.14079220e-02,  6.57656640e-02, -1.16041247e-02,
        1.28015622e-01,  

In [43]:
# Score the query against every document embedding, then convert the result to
# a NumPy array and drop length-1 axes so it is a flat 1D array of scores.
similarities = np.squeeze(np.asarray(model.similarity(query_vec, doc_vectors)))
similarities

array([0.41270226, 0.12613022, 0.12804194, 0.15716514, 0.14682405,
       0.15005696], dtype=float32)

In [44]:
# argsort orders ascending; flipping it puts the highest score first, and the
# slice keeps the three best-matching document positions.
top_3_indices = np.flip(np.argsort(similarities))[:3]
print(top_3_indices)
# Scores of those three documents, in the same best-first order.
top_scores = similarities[top_3_indices]
top_scores

[0 3 5]


array([0.41270226, 0.15716514, 0.15005696], dtype=float32)

In [45]:
# Redisplay the scores of the three best-matching documents.
top_scores

array([0.41270226, 0.15716514, 0.15005696], dtype=float32)

In [46]:
# Look up the text of each top-ranked document, best match first.
top_docs = list(map(lambda position: documents[position]["content"], top_3_indices))

print(top_docs)
# Join the retrieved passages with a visible separator to build the context
# string that is handed to the language model.
context = "\n---\n".join(top_docs)
context

['Employees are paid bi-weekly via direct deposit.', 'Company internet is a broadband internet.', 'Interact with each employee with Respect']


'Employees are paid bi-weekly via direct deposit.\n---\nCompany internet is a broadband internet.\n---\nInteract with each employee with Respect'

In [47]:
import os
from dotenv import load_dotenv
from openai import OpenAI

# Load variables from the .env file one directory up; override=True is passed
# so values from the file take precedence over ones already in the environment.
load_dotenv(dotenv_path="../.env", override=True)
my_api_key = os.environ.get("OPEN_AI_API_KEY")

# Client used by ask_question_open_ai to call the chat completions API.
my_client = OpenAI(api_key=my_api_key)

def ask_question_open_ai(prompt, retrieved_context=None):
    """Ask the chat model a question, grounded in retrieved context.

    Args:
        prompt: The user's question. It is sent to the model as the
            "User Question" part of the user message.
        retrieved_context: Text the model should base its answer on. When
            omitted (None), the notebook-level ``context`` variable is used.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    # Fall back to the notebook-level context so existing one-argument calls
    # keep working unchanged.
    if retrieved_context is None:
        retrieved_context = context

    llm_response = my_client.chat.completions.create(
        model="gpt-5-nano",
        messages=[
            {"role": "system", "content": '''
             You are an assistant who answers only based on the given context.
             '''},
            # Send the caller's question (prompt) rather than the global
            # `query`, so the function answers what it is actually asked.
            {"role": "user", "content": f"Context: {retrieved_context}\n\n User Question: {prompt}"}
        ]
    )
    return llm_response.choices[0].message.content

In [48]:
# Echo the question, then send it to the chat model and keep the answer.
print(query)
response = ask_question_open_ai(query)

what is pay policy


In [None]:
# Report the question, the retrieved context and the model's answer, one item
# per line (the answer is preceded by two blank lines).
print(
    f"User query: {query}",
    f"Context: {context}",
    f"\n\nOpen AI Response: {response}",
    sep="\n",
)

User query: what is pay policy
Context: Employees are paid bi-weekly via direct deposit.
---
Company internet is a broadband internet.
---
Interact with each employee with Respect


Open AI Response: Pay policy: Employees are paid bi-weekly via direct deposit. If you need more specifics, please consult HR or payroll.
