In [None]:
!pip install sentence_transformers PyPDF2

In [None]:
# Authenticate with the Hugging Face Hub from the shell before any model is loaded.
# NOTE(review): presumably required because the meta-llama checkpoint loaded below
# needs an authorised account - confirm.
!huggingface-cli login



In [None]:
# Load model directly
# NOTE(review): `model` is rebound to a SentenceTransformer in the next cell and
# `tokenizer` is assigned again from the same checkpoint further down, so the
# objects created here are not reachable by these names afterwards - confirm this
# cell is still needed.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

In [None]:
# Quick check of the embedding model: encode four sample sentences and compare
# every sentence with every other one.
from sentence_transformers import SentenceTransformer

# Rebinds the name `model`, which the previous cell assigned to the causal LM.
model = SentenceTransformer("hkunlp/instructor-large")

sentences = [
    "That is a happy person",
    "That is a happy dog",
    "That is a very happy person",
    "Today is a sunny day"
]
embeddings = model.encode(sentences)

# Pairwise scores of the sentence embeddings against themselves.
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# Expected shape: 4 x 4 (one row and one column per input sentence)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
import torch
import numpy as np

# Setup device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running on: {device}")

# Load embedding and language models
embedding_model = SentenceTransformer('hkunlp/instructor-large', device=device)
language_model_name = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(language_model_name)

# Half precision is only worthwhile on a GPU; on CPU float16 is slow and, depending
# on the PyTorch build, some operators are not implemented for it at all.
language_model_dtype = torch.float16 if device.type == "cuda" else torch.float32

# Place the model on exactly one device with `.to(device)`. This must not be
# combined with `device_map="auto"`: a model dispatched that way is managed by
# accelerate hooks and should not be moved afterwards, and the rest of this
# notebook sends its inputs to `device`, so the whole model has to live there.
language_model = AutoModelForCausalLM.from_pretrained(
    language_model_name,
    torch_dtype=language_model_dtype
).to(device)

# Extract text from PDF with improved cleaning
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one whitespace-normalised string.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``PdfReader``.

    Returns:
        str: The cleaned text of each page followed by a single space, in page
        order. Within a page, every run of whitespace (newlines included) is
        collapsed to one space. A page without extractable text contributes
        only its trailing space. An empty document yields ``""``.
    """
    reader = PdfReader(pdf_path)
    page_texts = []
    for page in reader.pages:
        # A page with no text layer may give back None or ""; treat both as empty
        # instead of failing on the string methods below.
        raw_text = page.extract_text() or ""
        # split() + join collapses newlines, tabs and repeated spaces in one pass,
        # so no separate replace() calls are needed.
        page_texts.append(' '.join(raw_text.split()))

    # Build the result in one go rather than growing a string page by page.
    return ''.join(page_text + " " for page_text in page_texts)

# Improved text splitting function
def split_text(text, chunk_size=1000):
    """Split text into chunks of whole words, each at most ``chunk_size`` characters.

    Words are never broken. A single word longer than ``chunk_size`` becomes a
    chunk of its own (and is therefore the only way a chunk can exceed the limit).

    Args:
        text: The text to split; words are separated on any whitespace.
        chunk_size: Maximum length, in characters, of a chunk once its words are
            joined with single spaces.

    Returns:
        list[str]: The chunks in reading order. Empty when ``text`` contains no
        words; never contains an empty string.
    """
    chunks = []
    current_words = []
    current_length = 0  # always equals len(' '.join(current_words))

    for word in text.split():
        # Length the chunk would have with `word` appended; the separating space
        # is only needed when the chunk already holds a word.
        candidate_length = current_length + len(word) + (1 if current_words else 0)

        # Only flush a non-empty chunk, otherwise an oversized first word would
        # produce an empty chunk in front of it.
        if current_words and candidate_length > chunk_size:
            chunks.append(' '.join(current_words))
            current_words = [word]
            current_length = len(word)
        else:
            current_words.append(word)
            current_length = candidate_length

    if current_words:
        chunks.append(' '.join(current_words))

    return chunks

# Create embeddings
def create_embeddings(chunks):
    """Embed text chunks with the module-level ``embedding_model``.

    Args:
        chunks: The texts to embed; forwarded unchanged to
            ``embedding_model.encode``.

    Returns:
        Whatever ``embedding_model.encode`` returns when asked for tensor output
        (``convert_to_tensor=True``).
    """
    encode_options = {"convert_to_tensor": True}
    chunk_embeddings = embedding_model.encode(chunks, **encode_options)
    return chunk_embeddings

# Improved search function with debugging
def search_relevant_chunks(question, chunks, embeddings, top_k=3, verbose=True):
    """Return the chunks whose embeddings are most similar to the question.

    Similarity is the cosine of the angle between the question embedding and each
    chunk embedding, so a chunk is not favoured merely because its vector has a
    large norm.

    Args:
        question: The user's question; embedded with the module-level
            ``embedding_model``.
        chunks: The text chunks, in the same order as the rows of ``embeddings``.
        embeddings: Tensor with one row per chunk, as produced by
            ``create_embeddings``.
        top_k: Maximum number of chunks to return.
        verbose: When true (the default), print the first 200 characters of each
            selected chunk for debugging.

    Returns:
        list[str]: Up to ``top_k`` chunks, most similar first. Empty when there
        are no chunks or ``top_k`` is not positive.
    """
    # Nothing to rank. This also avoids the `[-0:]` slicing trap where asking for
    # zero results would return every chunk.
    if not chunks or top_k <= 0:
        return []

    question_embedding = embedding_model.encode(question, convert_to_tensor=True)
    # Bring the query onto the same device/dtype as the chunk matrix and flatten
    # it to a plain vector (no `.T` on a 1-D tensor, which PyTorch deprecates).
    query_vector = question_embedding.to(
        device=embeddings.device, dtype=embeddings.dtype
    ).reshape(-1)

    # Unit-length vectors turn the dot product into cosine similarity.
    chunk_vectors = torch.nn.functional.normalize(embeddings, p=2, dim=-1)
    query_vector = torch.nn.functional.normalize(query_vector, p=2, dim=0)
    similarities = chunk_vectors @ query_vector  # one score per chunk

    result_count = min(top_k, similarities.shape[0])
    top_indices = torch.topk(similarities, k=result_count).indices.tolist()
    selected_chunks = [chunks[idx] for idx in top_indices]

    if verbose:
        # Debug print
        print("\nRelevant excerpts found:")
        for i, chunk in enumerate(selected_chunks, 1):
            print(f"\nExcerpt {i}:\n{chunk[:200]}...")

    return selected_chunks

# Improved prompt creation
def create_prompt(relevant_chunks, question, book_title="The Lightning Thief"):
    """Build the question-answering prompt sent to the language model.

    Args:
        relevant_chunks: Excerpts to show the model; joined with blank lines.
        question: The user's question.
        book_title: Title quoted in the instruction sentence. Defaults to
            'The Lightning Thief', so existing callers get the same prompt.

    Returns:
        str: The instruction, the excerpts, the question and an answer cue that
        ends with a trailing space.
    """
    context = "\n\n".join(relevant_chunks)
    return f"""Based on the following excerpt from '{book_title}', please answer the question. If the information is not directly stated in the excerpt, please say so.

Excerpt:
{context}

Question: {question}
Answer (based only on the excerpt above): """

# Improved chat function
def chat_with_books(pdf_path, max_new_tokens=256):
    """Run an interactive question-answering session over the text of a PDF.

    The PDF is extracted, split into chunks and embedded once. Each question is
    then answered by retrieving the most relevant chunks, building a prompt from
    them and generating a reply with the module-level ``language_model``.

    Typing ``exit`` ends the session; typing ``debug`` prints the start of the
    first chunk. Blank input is ignored.

    Args:
        pdf_path: Location of the PDF to chat about.
        max_new_tokens: Upper bound on the number of tokens generated per answer.

    Returns:
        None. Any exception is reported on stdout together with its traceback
        instead of being propagated.
    """
    print("Loading and analyzing book...")
    try:
        book_text = extract_text_from_pdf(pdf_path)
        print(f"Successfully extracted {len(book_text)} characters of text")

        chunks = split_text(book_text)
        print(f"Split into {len(chunks)} chunks")

        # A scanned or empty PDF yields no chunks; without this guard the
        # 'debug' command and the retrieval step would fail later on.
        if not chunks:
            print("No extractable text was found in the PDF; nothing to chat about.")
            return

        print("Creating embeddings...")
        embeddings = create_embeddings(chunks)
        print("Embeddings created successfully!")

        print("\nWelcome! Type 'exit' to end chat.")
        print("Type 'debug' to see the first chunk of text.")

        while True:
            try:
                user_input = input("\nYou: ").strip()
            except (EOFError, KeyboardInterrupt):
                # Closed stdin or an interrupted kernel: leave the chat cleanly.
                print("\nGoodbye!")
                break

            if not user_input:
                continue

            command = user_input.lower()
            if command == "exit":
                print("Goodbye!")
                break

            if command == "debug":
                print("\nFirst chunk of text:")
                print(chunks[0][:500])
                continue

            relevant_chunks = search_relevant_chunks(user_input, chunks, embeddings)
            prompt = create_prompt(relevant_chunks, user_input)

            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=1024
            ).to(device)

            # `max_new_tokens` bounds only the answer. The previous
            # `max_length=1024` also counted the prompt, so a prompt near the
            # truncation limit left little or no room to generate anything.
            outputs = language_model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=0.1,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                num_return_sequences=1
            )

            # The output starts with the prompt tokens; decode only what follows.
            prompt_token_count = inputs["input_ids"].shape[-1]
            response = tokenizer.decode(
                outputs[0, prompt_token_count:],
                skip_special_tokens=True
            )
            print(f"\nModel: {response}")

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        import traceback
        print(traceback.format_exc())

# Run program
# Entry point: start the interactive chat over the PDF at `pdf_path`.
# NOTE(review): the path is absolute and looks environment-specific (presumably a
# hosted-notebook upload directory) - confirm before running elsewhere.
if __name__ == "__main__":
   pdf_path = "/content/The_Lightning_Thief_-_Percy_Jackson_1-10.pdf"  # Replace with actual book path
   chat_with_books(pdf_path)