# Imports

In [2]:
import numpy as np
import pandas as pd
import fitz  
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, pipeline
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import torch
import os
import json
from tqdm.notebook import tqdm
import urllib3


In [3]:
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
config = json.load(open('config.json'))

# Elasticsearch Initialization

In [4]:
es = Elasticsearch(
    hosts=[{
        "host": "localhost",
        "port": 9200,
        "scheme": "https"  # This should be included here as part of the host dictionary
    }],
    http_auth=("elastic", "BUzQ9CnGPb-lHUxtotuA"),  # Use your username and password
    verify_certs=False  # Disable SSL certificate verification (for self-signed certs)
)

# Test the connection
try:
    if es.ping():
        print("Elasticsearch cluster is up!")
    else:
        print("Elasticsearch cluster is down!")
except Exception as e:
    print(f"Error connecting to Elasticsearch: {e}")

  _transport = transport_class(
  es = Elasticsearch(


Elasticsearch cluster is up!


In [5]:
es.indices.exists(index="pdf_text_chunks")

HeadApiResponse(True)

## Raw Data Files

In [5]:
def absoluteFilePaths(directory):
    for dirpath,_,filenames in os.walk(directory):
        for f in filenames:
            yield os.path.abspath(os.path.join(dirpath, f))

In [6]:
raw_data_paths = absoluteFilePaths('data')

# QA

## Storage

In [9]:
es

<Elasticsearch(['https://localhost:9200'])>

## Initialize Models

In [6]:
# Storage and Retrieval
embedder_model = AutoModel.from_pretrained(config['MODEL']['embedder_model'])
embedder_tokenizer = AutoTokenizer.from_pretrained(config['MODEL']['embedder_model'])


# qa_tokenizer = AutoTokenizer.from_pretrained(config['MODEL']['qa_pipeline_model'])
# qa_model = AutoModelForQuestionAnswering.from_pretrained(
#         config['MODEL']['qa_pipeline_model'],
#         torch_dtype=torch.float32,  # Use half precision
#         low_cpu_mem_usage=True
#     )
qa_pipeline = pipeline(
        "question-answering",
        model=config['MODEL']['qa_pipeline_model'],
        tokenizer=config['MODEL']['qa_pipeline_model']
    )



## Embedding Utils

In [7]:
import fitz  # PyMuPDF
from transformers import AutoTokenizer, AutoModel, pipeline
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import torch


index_mapping = {
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "embedding": {"type": "dense_vector", "dims": config['MODEL']['embedding_dim']} 
        }
    }
}

# es.indices.create(index="pdf_text_chunks", body=index_mapping)

def extract_text_from_pdf(pdf_path):
    """Extracts text from each page in a PDF."""
    doc = fitz.open(pdf_path)
    text_data = []
    for page_num in range(doc.page_count):
        page = doc[page_num]
        text_data.append(page.get_text("text"))
    return text_data

def chunk_text(text, chunk_size=300):
    """Splits text into smaller chunks for embedding."""
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

def embed_text(text):
    """Generate embeddings for a text chunk."""
    inputs = embedder_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    embeddings = embedder_model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().tolist()[0]

def index_text_chunks(text_chunks, doc_id):
    """Index text chunks in Elasticsearch with embeddings."""
    actions = [
        {
            "_index": "pdf_text_chunks",
            "_id": f"{doc_id}_{i}",
            "_source": {
                "text": chunk,
                "embedding": embed_text(chunk)
            }
        }
        for i, chunk in enumerate(text_chunks)
    ]
    bulk(es, actions)

# Load PDFs and process

# pdf_files = ["path/to/pdf1.pdf", "path/to/pdf2.pdf"]

# for pdf_path in pdf_files:
# for pdf_path in raw_data_paths:
#     print(pdf_path)
#     text_data = extract_text_from_pdf(pdf_path)
#     for page_num, page_text in tqdm(enumerate(text_data)):
#         chunks = chunk_text(page_text)
#         index_text_chunks(chunks, doc_id=f"{pdf_path}_page_{page_num}")



## Storage

In [None]:


for pdf_path in raw_data_paths:
    print(pdf_path)
    text_data = extract_text_from_pdf(pdf_path)
    for page_num, page_text in tqdm(enumerate(text_data)):
        chunks = chunk_text(page_text)
        index_text_chunks(chunks, doc_id=f"{pdf_path}_page_{page_num}")

In [11]:
es.indices.get_mapping(index='pdf_text_chunks')

ObjectApiResponse({'pdf_text_chunks': {'mappings': {'properties': {'embedding': {'type': 'float'}, 'text': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}}}}})

In [12]:
es.indices.delete(index="pdf_text_chunks")

ObjectApiResponse({'acknowledged': True})

## Retrieval

In [9]:
def retrieve_relevant_chunks(question, top_k=3):
    """Retrieve top K most relevant chunks based on the question."""
    # Embed the question
    question_embedding = embed_text(question)
    query_body = {
        "size": top_k,
        "_source": ["text"],
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                    "params": {"query_vector": question_embedding}
                }
            }
        }
    }
    res = es.search(index="pdf_text_chunks", body=query_body)
    return [hit["_source"]["text"] for hit in res["hits"]["hits"]]

# def generate_answer(question, context):
#     """Generate answer based on context using a generative model."""
#     input_text = f"Question: {question}\nContext: {' '.join(context)}"
#     return qa_pipeline(input_text)[0]["generated_text"]

def generate_answer(question, context):
    """Generate answer based on context using a generative model."""
    
    context = ' '.join(context)
    result = qa_pipeline(question=question, context=context)
    return result['answer']
    
# Main QA Function
def answer_question(question):
    context_chunks = retrieve_relevant_chunks(question)
    for context_chunk in context_chunks:
        print(f"chunk : {context_chunk}")
    answer = generate_answer(question, context_chunks)
    return answer

# Example Usage
# question = "What are the key findings in the report?"
# print("Answer:", answer_question(question))


In [10]:
question = "What are different types of leaves available to Simpplr employees ?"

In [11]:
print("Answer:", answer_question(question))

  res = es.search(index="pdf_text_chunks", body=query_body)


chunk : Employee Stock Options Policy 1. Introduction 1.1 Purpose This policy outlines the guidelines and procedures for granting and managing employee stock options at Simpplr. The purpose is to provide employees with an additional incentive and reward for their loyalty, commitment, and contribution towards the growth and success of the organization. 1.2 Scope This policy applies to all full-time employees of Simpplr who meet the eligibility criteria and are selected to participate in the employee stock option program. 2. Eligibility 2.1 Criteria Employees who have completed a minimum of one year of continuous service with Simpplr and are in good standing are eligible to participate in the employee stock option program. 2.2 Participation Participation in the employee stock option program is voluntary, and eligible employees must indicate their interest within a specified deadline communicated by the Human Resources department. 3. Stock Option Granting 3.1 Granting Process The granting

In [31]:
# question = "When is an employee eligible for remote work at Simpplr ?"
# print("Answer:", answer_question(question))

In [12]:
question = "According to the company policy, answer the following question : When is an employee eligible for remote work at Simpplr ?"
print("Answer:", answer_question(question))
# answer_question(question)

  res = es.search(index="pdf_text_chunks", body=query_body)


chunk : Title: Remote Work Policy Introduction: Simpplr recognizes the growing need for flexibility in the workplace and understands the benefits of remote work for both employees and the organization. The Remote Work Policy outlines the provisions and guidelines for employees who wish to work remotely, either on a full-time or ad-hoc basis. This policy aims to promote a healthy work-life balance, increase productivity, and create a positive work environment for our employees. Policy Statement: Simpplr is committed to providing employees with the opportunity to work remotely, subject to business needs and the nature of their roles. This policy applies to all eligible employees, regardless of their tenure or position within the organization. Remote work may be approved when it benefits both the employee and the company while maintaining the highest level of productivity and collaboration. Eligibility for Remote Work: 1. Employment Eligibility: a. All employees who have completed at leas

In [32]:
import transformers
transformers.__version__

'4.33.1'