# Import the required libraries 

In [None]:
# Imports and process-wide setup for the notebook.
# NOTE(review): `path`, `math` and `pandas` are not referenced anywhere in the
# visible code - confirm they are still needed.
import os
import path
import math
import numpy as np
import pandas as pd
from threading import Thread   # runs the Flask server next to the notebook code

import torch

import fitz  # PyMuPDF, used to read the PDF files

import faiss  # vector index; nearest chunks are found by L2 (Euclidean) distance

from transformers import AutoTokenizer, AutoModel  # tokenizer/model for chunk embeddings

from transformers import GPTNeoForCausalLM, GPT2Tokenizer  # pretrained model used for generation (synthesis)


import logging
from flask import Flask, request, jsonify, render_template  # web server for the UI

import warnings
# Hide UserWarning messages raised from IPython's interactive shell module.
warnings.filterwarnings("ignore", category=UserWarning, module="IPython.core.interactiveshell")

# Prefer CUDA when available.
# NOTE(review): in the visible code `device` is only printed; neither model is
# moved to it.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Used Machine for the assignment = {device}')

# Work around problems caused by multiple instances of the OpenMP runtime
# being loaded in one Python process. This is commonly set to avoid runtime
# errors with libraries that use OpenMP for parallelism, such as NumPy or FAISS.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'  


In [None]:
# Configuration parameters
chunk_size = 16  # words per chunk (passed to chunk_text_with_overlap)
chunk_overlap = 4  # words shared by two consecutive chunks
K = 3  # number of nearest chunks to retrieve per question
index_file = 'vector_index.faiss'  # path the FAISS index is written to by do_indexing

# UI web server settings
host_name = '127.0.0.1'  # currently bound to the local machine only
port_no = 9000

# Data Acquisition 

In [None]:
# Input PDF files are read from the "./Data" directory (the doc_dir set in the main cell).
def get_doc_list(file_dir):
    """Return the paths of the ``.pdf`` entries directly inside *file_dir*.

    Entries are visited in sorted name order and only names ending with the
    exact (case-sensitive) suffix ``.pdf`` are kept. Each result is
    *file_dir* joined with the entry name.
    """
    pdf_paths = []
    for entry in sorted(os.listdir(file_dir)):
        if entry.endswith('.pdf'):
            pdf_paths.append(os.path.join(file_dir, entry))
    return pdf_paths
   
def read_pdf(file_path):
    """Return the text of every page of the PDF at *file_path*, concatenated.

    Pages are read in document order and their text is joined without any
    separator. The document is closed before returning, including when text
    extraction fails part-way through.
    """
    document = fitz.open(file_path)
    try:
        # join() avoids rebuilding the accumulated string once per page.
        return "".join(page.get_text() for page in document)
    finally:
        document.close()




#  Ingestion 


In [None]:
#Ingestion

def chunk_text_with_overlap(text, chunksize, overlap):
    """Split *text* into chunks of whitespace-separated words.

    Each chunk holds up to *chunksize* words and starts
    ``chunksize - overlap`` words after the previous one, so consecutive
    chunks share *overlap* words. The last chunk may be shorter. Empty or
    whitespace-only text yields an empty list.

    Raises:
        ValueError: if *chunksize* is not positive, or *overlap* is not
            smaller than *chunksize* (the window would never advance).
    """
    if chunksize <= 0:
        raise ValueError(f"chunksize must be positive, got {chunksize}")
    if overlap >= chunksize:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunksize ({chunksize})"
        )

    words = text.split()
    step = chunksize - overlap

    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunksize]))
        # This window already reached the end of the text; a further window
        # would only repeat the tail words.
        if start + chunksize >= len(words):
            break

    return chunks


# Load the tokenizer and model used for chunk embeddings.
# Both are module-level objects that get_embeddings() reads directly.
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # embedding model identifier
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embeddings(chunks):
    """Embed each text in *chunks* with the module-level tokenizer and model.

    Every chunk is tokenized and passed through the model on its own. Its
    vector is the mean of the last hidden states over dimension 1, squeezed
    and converted to a NumPy array. Returns the vectors as a list in input
    order.
    """
    def embed(text):
        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        hidden_states = model(**encoded).last_hidden_state
        return hidden_states.mean(dim=1).squeeze().numpy()

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        return [embed(text) for text in chunks]

def display_chunks_embeddings(chunks,chunk_embeddings):
    """Print every chunk that has a matching embedding, numbered from 1.

    Chunks and embeddings are paired positionally, so output stops at the
    shorter of the two sequences. The embedding values are not printed.
    """
    paired = zip(chunks, chunk_embeddings)
    for number, (chunk_text, _vector) in enumerate(paired, start=1):
        print(f"Chunk {number}: {chunk_text}")

# Vector store: the chunk embeddings are stored in a FAISS index.
def do_indexing(chunk_embeddings):
    """Build a flat L2 FAISS index over *chunk_embeddings* and save it.

    The vector size is taken from the first embedding. The embeddings are
    stacked into one float32 array, added to the index, and the index is
    written to the module-level ``index_file`` path before being returned.
    The vector size and the array shape are printed along the way.
    """
    vector_size = chunk_embeddings[0].shape[0]
    print(vector_size)

    # FAISS takes a 2-D float32 array: one row per chunk.
    embedding_matrix = np.array(chunk_embeddings).astype('float32')
    print("Shape of chunk_embeddings:", embedding_matrix.shape)  # Should be (n, d)

    flat_index = faiss.IndexFlatL2(vector_size)
    flat_index.add(embedding_matrix)

    # Persist the index so it can be reloaded from disk later.
    faiss.write_index(flat_index, index_file)
    return flat_index

def get_stored_index(index_file):
    """Read the FAISS index stored at *index_file* and return it."""
    return faiss.read_index(index_file)


#  Retrieval

In [None]:
#Retrieval 
def retrieve_chunks(index,query, k):
    """Return the text of the chunks nearest to *query*, closest first.

    The query is embedded with get_embeddings() and searched in *index*.
    The returned neighbour positions are looked up in the module-level
    ``chunks`` list, so ``chunks`` must be the list the index was built from.

    Args:
        index: FAISS index to search.
        query: question text.
        k: number of neighbours to request.

    Returns:
        A list of at most *k* chunk strings. It is shorter than *k* when the
        index holds fewer than *k* vectors.
    """
    query_embedding = np.array(get_embeddings([query])).astype('float32')
    print("Shape of query_embedding:", query_embedding.shape)
    _distances, neighbour_ids = index.search(query_embedding, k)
    # FAISS fills missing neighbours with -1. Indexing chunks[-1] would
    # silently return the last chunk, so those placeholders are dropped.
    return [chunks[chunk_id] for chunk_id in neighbour_ids[0] if chunk_id >= 0]

#  Synthesis

In [None]:
# Synthesis (generation)
# Load the tokenizer and causal language model that generate_response() uses.
# Both are loaded once at module level.
gpt_model_name = "EleutherAI/gpt-neo-2.7B"
gpt_tokenizer = GPT2Tokenizer.from_pretrained(gpt_model_name)
gpt_model = GPTNeoForCausalLM.from_pretrained(gpt_model_name)

def generate_response(index ,query,k):
    """Answer *query* using the *k* chunks nearest to it as context.

    The retrieved chunks are joined with spaces and placed in a
    ``Context / Query / Response`` prompt, which is passed to the
    module-level GPT-Neo model.

    Args:
        index: FAISS index to retrieve context chunks from.
        query: question text.
        k: number of context chunks to retrieve.

    Returns:
        The decoded output sequence with special tokens removed. For this
        causal model the sequence starts with the prompt itself.
    """
    retrieved_chunks = retrieve_chunks(index, query, k)
    context = " ".join(retrieved_chunks)
    input_text = f"Context: {context}\nQuery: {query}\nResponse:"
    inputs = gpt_tokenizer(input_text, return_tensors="pt")
    # max_length would count the prompt tokens too, leaving little or no room
    # for an answer once the prompt nears 100 tokens. max_new_tokens limits
    # only the generated continuation.
    outputs = gpt_model.generate(**inputs, max_new_tokens=100)
    return gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
   



#  Web Server  (User Interface)

In [None]:
# Flask application; templates are loaded from the "templates" directory,
# resolved to an absolute path from the current working directory.
app = Flask(__name__, template_folder=os.path.abspath('templates'))

# Set up logging at DEBUG level for the root logger
logging.basicConfig(level=logging.DEBUG)

@app.route('/')
def index():
    """Serve the landing page by rendering the ``index.html`` template."""
    # NOTE(review): the main cell later rebinds the module-level name `index`
    # to the FAISS index, which process_question() then reads. The route is
    # already registered by then, but the shared name is easy to trip over.
    return render_template('index.html')

@app.route('/ask', methods=['POST'])
def process_question():
    """Answer the question posted as JSON to ``/ask``.

    Expects a JSON object with a non-empty string under ``question``.

    Returns:
        ``{"answer": ...}`` on success;
        ``{"error": ...}`` with status 400 when the body is not a JSON object
        or has no usable ``question``;
        ``{"error": ...}`` with status 500 when generating the answer fails.
    """
    # silent=True returns None instead of raising on a missing or malformed
    # JSON body, so a client mistake is reported as 400 rather than 500.
    payload = request.get_json(silent=True)
    question = payload.get('question') if isinstance(payload, dict) else None
    if not isinstance(question, str) or not question.strip():
        return jsonify({'error': "Request body must be JSON with a non-empty 'question' string."}), 400

    app.logger.info("Received question: %s", question)
    try:
        # `index` and `K` are module-level: the FAISS index built in the main
        # cell and the configured number of chunks to retrieve.
        answer = generate_response(index, question, K)
    except Exception as e:
        # Top-level request boundary: log the traceback and report the failure.
        app.logger.error(f"An error occurred: {str(e)}", exc_info=True)
        return jsonify({'error': str(e)}), 500

    app.logger.info("Generated answer: %s", answer)
    return jsonify({'answer': answer})

def run_app():
    """Start the Flask development server on ``host_name``:``port_no``.

    Runs with debug mode on and the auto-reloader off. The main cell calls
    this from a separate thread.
    """
    # Pass the port as the configured integer, not a formatted string.
    app.run(debug=True, use_reloader=False, host=host_name, port=port_no)

#  Main Application

In [None]:
if __name__=="__main__":
    # Pipeline: list PDFs -> read text -> chunk -> embed -> index -> serve UI.
    # `chunks` and `index` are deliberately module-level names here:
    # retrieve_chunks() and process_question() read them as globals.
    doc_dir = './Data'
    doc_list = get_doc_list(doc_dir)
    print(doc_list)
    if not doc_list:
        raise FileNotFoundError(f"No .pdf files found in {doc_dir!r}; nothing to index.")

    # Only the first PDF (in sorted order) is ingested.
    doc_content = read_pdf(doc_list[0])

    # Create chunks
    chunks = chunk_text_with_overlap(doc_content, chunk_size, chunk_overlap)
    if not chunks:
        raise ValueError(f"No text could be extracted from {doc_list[0]!r}; nothing to index.")

    # Create embeddings for the chunks
    chunk_embeddings = get_embeddings(chunks)

    # Indexing
    index = do_indexing(chunk_embeddings)

    # Run the web server in its own thread so this cell returns.
    thread = Thread(target=run_app)
    thread.start()

    print(f"Flask server is running on http://{host_name}:{port_no}")

    # Example usage without the web UI:
    #   query = "what is natural language processing"
    #   response = generate_response(index, query, K)
    #   print(response)
    
    





