In [3]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import multiprocessing
import torch
import fitz

In [4]:
# Checkpoint names live in one place so the generation model and its tokenizer
# can never be loaded from two different checkpoints by accident.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
GENERATION_MODEL_NAME = 'gpt2'

# Sentence encoder used for both document chunks and queries.
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Causal language model and matching tokenizer used to generate answers.
generation_model = GPT2LMHeadModel.from_pretrained(GENERATION_MODEL_NAME)
tokenizer = GPT2Tokenizer.from_pretrained(GENERATION_MODEL_NAME)

: 

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, in page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        str: The "text" extraction of each page, concatenated with no
        separator between pages.
    """
    # The context manager closes the document (and its underlying file handle)
    # even when extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # join() avoids rebuilding the growing string once per page.
        return "".join(page.get_text("text") for page in doc)

In [None]:
def chunk_document(text, chunk_size=500):
    """Split ``text`` into chunks of whole sentences of roughly ``chunk_size`` characters.

    Sentences are delimited by ``"."``. Consecutive sentences are packed into
    the same chunk until adding the next one would push the chunk past
    ``chunk_size`` characters; a single sentence longer than ``chunk_size`` is
    kept intact as its own chunk rather than being cut.

    Args:
        text: The document text to split.
        chunk_size: Target maximum chunk length in characters.

    Returns:
        list[str]: Non-empty chunks with surrounding whitespace stripped.
        Concatenating the chunks reproduces the original punctuation: no
        period is invented for text that does not end with one, and no
        extra period is added after the final sentence.
    """
    fragments = text.split(".")

    # Every fragment except the last was followed by a "." in the source text,
    # so only those get their period restored.
    pieces = [fragment + "." for fragment in fragments[:-1]]
    # The last fragment is whatever follows the final period; it is empty when
    # the text ends with "." and must not produce a stray period of its own.
    if fragments[-1].strip():
        pieces.append(fragments[-1])

    chunks = []
    current_chunk = ""

    for piece in pieces:
        # Only flush when there is real content to flush; otherwise an
        # oversized first sentence would emit an empty chunk.
        if current_chunk.strip() and len(current_chunk) + len(piece) > chunk_size:
            chunks.append(current_chunk.strip())
            current_chunk = piece
        else:
            current_chunk += piece

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks

In [None]:
def embed_document(documents):
    """Encode ``documents`` with the module-level ``embedding_model``.

    Args:
        documents: The texts to embed; handed unchanged to
            ``embedding_model.encode``.

    Returns:
        numpy.ndarray: The encoder output wrapped with ``np.array``.
    """
    return np.array(embedding_model.encode(documents))

In [None]:
def create_faiss_index(embeddings):
    """Build a flat (exact) L2 FAISS index over ``embeddings``.

    Args:
        embeddings: Array-like of shape ``(n_vectors, dim)``.

    Returns:
        faiss.IndexFlatL2: An index of dimensionality ``dim`` containing every
        row of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional or has a zero
            vector dimension.
    """
    # FAISS operates on C-contiguous float32 matrices; coercing here makes the
    # function accept float64 arrays, array slices and nested lists as well.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)

    # Fail with a clear message instead of an opaque IndexError on shape[1].
    if vectors.ndim != 2 or vectors.shape[1] == 0:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dim); "
            f"got shape {vectors.shape}"
        )

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

In [None]:
def retrieve_relevant_documents(query, documents, faiss_index, top_k=3):
    """Return the documents whose embeddings are closest to ``query``.

    Args:
        query: The query text; encoded with the module-level ``embedding_model``.
        documents: Sequence of texts. ``documents[i]`` is looked up for every
            label ``i`` returned by the index, so it must line up with the
            vectors stored in ``faiss_index``.
        faiss_index: Index object exposing ``search(vectors, k)``.
        top_k: Maximum number of documents to return.

    Returns:
        list: At most ``top_k`` documents in the order returned by the index
        search. Empty when there are no documents or ``top_k`` is not positive.
    """
    # Nothing to search for; also avoids asking the index for k == 0 results.
    if not documents or top_k <= 0:
        return []

    query_embedding = embedding_model.encode([query])

    # Never request more neighbours than there are documents.
    k = min(top_k, len(documents))
    _, indices = faiss_index.search(query_embedding, k)

    # FAISS pads missing neighbours with label -1, which as a Python index
    # would silently select the *last* document; drop such labels, together
    # with any label that falls outside ``documents``.
    return [documents[i] for i in indices[0] if 0 <= i < len(documents)]

In [None]:
def generate_response(query, relevant_docs, max_new_tokens=200):
    """Generate an answer to ``query`` using ``relevant_docs`` as context.

    The prompt is the retrieved documents joined by spaces, followed by
    ``"\\nQuestion: <query>\\nAnswer:"``. It is tokenized with the module-level
    ``tokenizer`` and completed by the module-level ``generation_model``.

    Args:
        query: The user's question.
        relevant_docs: Iterable of context strings placed before the question.
        max_new_tokens: Number of tokens to generate *in addition to* the
            prompt.

    Returns:
        str: The decoded output sequence (prompt followed by the generated
        continuation), with special tokens removed.
    """
    prompt = " ".join(relevant_docs) + "\nQuestion: " + query + "\nAnswer:"

    inputs = tokenizer.encode(prompt, return_tensors="pt")

    # Reserve room for the answer inside the model's context window. When the
    # prompt is too long, keep its tail so the question and the "Answer:" cue
    # survive and only the oldest context is dropped.
    context_window = generation_model.config.n_positions
    max_prompt_tokens = max(1, context_window - max_new_tokens)
    if inputs.shape[1] > max_prompt_tokens:
        inputs = inputs[:, -max_prompt_tokens:]

    outputs = generation_model.generate(
        inputs,
        # Single unpadded sequence: every position is a real token.
        attention_mask=torch.ones_like(inputs),
        # ``max_length`` would count the prompt tokens too, leaving no budget
        # for the answer once the retrieved context exceeds that length.
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        # Uses the tokenizer's end-of-sequence id for padding.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
def process_user_query(user_id, query, documents, faiss_index):
    """Answer one user's query and tag the answer with the user's id.

    Retrieves context for ``query`` via ``retrieve_relevant_documents`` and
    feeds it to ``generate_response``.

    Args:
        user_id: Identifier returned unchanged alongside the answer.
        query: The user's question.
        documents: Texts forwarded to ``retrieve_relevant_documents``.
        faiss_index: Index forwarded to ``retrieve_relevant_documents``.

    Returns:
        tuple: ``(user_id, response)``.
    """
    context_docs = retrieve_relevant_documents(query, documents, faiss_index)
    return user_id, generate_response(query, context_docs)

In [6]:
# Standalone smoke test of the OpenAI chat client; it does not use any of the
# retrieval helpers defined in the cells above.
from dotenv import load_dotenv

# Runs before the client is constructed.
# NOTE(review): presumably this loads the OpenAI API key from a local .env
# file — confirm where the key is expected to come from.
load_dotenv()

# NOTE(review): newer LangChain releases expose ChatOpenAI from the separate
# `langchain_openai` package — confirm against the installed version.
from langchain.chat_models import ChatOpenAI
# HumanMessage is imported but not referenced anywhere in this cell.
from langchain.schema import HumanMessage

# Chat client for the "gpt-3.5-turbo" model with temperature 0.7.
chat = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.7)

# The question is passed to the client as a plain string.
prompt = "What is the capital of France?"

response = chat.invoke(prompt)

# Only the `content` attribute of the returned object is printed.
print("ChatGPT Response:", response.content)

ChatGPT Response: The capital of France is Paris.
