In [2]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m77.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.3


In [5]:
import fitz  # PyMuPDF
import os
import re

def clean_text(text):
    """Applies basic cleaning rules to the extracted text."""
    text = text.replace('-\n', '')
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# --- KEY CHANGE HERE ---
# Specify the exact PDF file you want to process.
# 🔽 REPLACE "your_document.pdf" WITH THE NAME OF YOUR FILE 🔽
pdf_filename = "/content/drive/MyDrive/PathologyRobbins7ed.pdf"

print(f"--- Attempting to process: {pdf_filename} ---")

# Check if the specified file exists before trying to open it.
if os.path.exists(pdf_filename):
    try:
        # Open the specified PDF document
        doc = fitz.open(pdf_filename)
        full_raw_text = ""
        for page in doc:
            full_raw_text += page.get_text()
        doc.close()

        # Clean the extracted text
        cleaned_text = clean_text(full_raw_text)

        # Create the output filename (e.g., "your_document.txt")
        txt_filename = os.path.splitext(pdf_filename)[0] + ".txt"

        # Save the cleaned text to the new .txt file
        with open(txt_filename, "w", encoding="utf-8") as f:
            f.write(cleaned_text)

        print(f"✅ Successfully extracted and saved to: {txt_filename}")

    except Exception as e:
        print(f"❌ An error occurred while processing {pdf_filename}: {e}")
else:
    # This message will show if the file isn't found in the same directory.
    print(f"❌ ERROR: The file '{pdf_filename}' was not found.")

--- Attempting to process: /content/drive/MyDrive/PathologyRobbins7ed.pdf ---
✅ Successfully extracted and saved to: /content/drive/MyDrive/PathologyRobbins7ed.txt


In [6]:
!pip install langchain langchain_community langchain-google-genai sentence-transformers faiss-cpu



In [8]:
import os
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# --- KEY CHANGE HERE ---
# Specify the exact .txt file you want to process.
# 🔽 REPLACE "your_document.txt" WITH THE NAME OF YOUR FILE 🔽
txt_filename = "/content/drive/MyDrive/PathologyRobbins7ed.txt"

print(f"--- Attempting to process: {txt_filename} ---")

# 1. Check if the specified file exists and then load it
if os.path.exists(txt_filename):
    loader = TextLoader(txt_filename, encoding='utf-8')
    documents = loader.load()

    # 2. Chunk the document
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = text_splitter.split_documents(documents)
    print(f"Split the document into {len(docs)} chunks.")

    # Check if there are any chunks to process
    if not docs:
        print("❌ ERROR: The text file was loaded but resulted in zero chunks. The file might be empty.")
    else:
        # 3. Create embeddings and store them in a FAISS vector database
        print("Creating embeddings... (This may take a moment)")
        embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        db = FAISS.from_documents(docs, embeddings)

        # 4. Save the vector database locally
        db.save_local("my_faiss_index")
        print("✅ Vector database created successfully from the specified file!")

else:
    # This message will show if the file isn't found
    print(f"❌ ERROR: The file '{txt_filename}' was not found.")

--- Attempting to process: /content/drive/MyDrive/PathologyRobbins7ed.txt ---
Split the document into 2682 chunks.
Creating embeddings... (This may take a moment)


  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Vector database created successfully from the specified file!


In [3]:
# Install the required library for Together AI
!pip install langchain-together -q

import os
from google.colab import userdata

# Import the new LLM class
from langchain_together import Together
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# --- 1. GET YOUR TOGETHER AI API KEY FROM COLAB SECRETS ---
try:
    TOGETHER_API_KEY = userdata.get('TOGETHER_API_KEY')
    # Set it as an environment variable for the library to use
    os.environ['TOGETHER_API_KEY'] = TOGETHER_API_KEY
    print("✅ Successfully retrieved API key from Colab Secrets.")
except Exception as e:
    print(f"❌ ERROR: Could not retrieve TOGETHER_API_KEY. Please complete Step 2 above.")
    # Stop execution if the key is not found
    raise e

# --- 2. LOAD THE VECTOR DATABASE ---
db_path = "/content/drive/MyDrive/my_faiss_index"

if not os.path.exists(db_path):
    print(f"❌ ERROR: The database folder '{db_path}' was not found.")
    print("Please make sure you have run the previous script to create the database.")
else:
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    db = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
    print("✅ Vector database loaded successfully.")

    # --- 3. CREATE THE QUESTION-ANSWERING CHAIN WITH TOGETHER AI ---
    # We are replacing GoogleGenerativeAI with Together
    llm = Together(
        model="meta-llama/Llama-3-70b-chat-hf", # A powerful and stable model
        temperature=0.1,
        max_tokens=1024
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=db.as_retriever(search_kwargs={"k": 3})
    )

    # --- 4. ASK QUESTIONS INTERACTIVELY ---
    print("\n✅ Setup complete! You can now ask questions about your document using Together AI.")
    print("Type 'exit' to quit.")

    while True:
        try:
            question = input("\n-> Ask a question: ")
            if question.lower() == 'exit':
                print("Exiting program. Goodbye!")
                break

            result = qa_chain.invoke({"query": question})
            print("\n--- Answer ---")
            print(result['result'])

        except KeyboardInterrupt:
            print("\n\nProgram interrupted by user. Goodbye!")
            break

✅ Successfully retrieved API key from Colab Secrets.
✅ Vector database loaded successfully.

✅ Setup complete! You can now ask questions about your document using Together AI.
Type 'exit' to quit.

-> Ask a question: "What is the difference between apoptosis and necrosis according to the text?"

--- Answer ---
 According to the text, apoptosis is a type of cell death characterized by nuclear dissolution without complete loss of membrane integrity, whereas necrosis is a type of cell death characterized by disruption of the plasma membrane and leakage of cellular components. Additionally, apoptosis serves normal functions and is not necessarily associated with cell injury, whereas necrosis is always a pathologic process.


Program interrupted by user. Goodbye!
