<a href="https://colab.research.google.com/github/jayw20230711/COLAB/blob/main/kg-rag-Ch02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import requests
import os

# --- Configuration ---
# Source document and the local filename the later cells read from.
remote_pdf_url = "https://arxiv.org/pdf/1709.00666.pdf"
pdf_filename = "ch02-downloaded.pdf"

print(f"Attempting to download PDF from: {remote_pdf_url}")

try:
    # Without a timeout, requests waits forever on an unresponsive server.
    response = requests.get(remote_pdf_url, timeout=15)

    if response.status_code != 200:
        # Any non-200 status (e.g., 404 Not Found, 500 Server Error) is a failure.
        print(f"❌ Failed to download the PDF. HTTP Status Code: {response.status_code}")
    elif b"%PDF-" not in response.content[:1024]:
        # A 200 response is not necessarily a PDF: servers can answer with an
        # HTML error or challenge page. Saving that under a .pdf name would
        # report success here and only fail later inside the PDF parser, so
        # check for the PDF signature near the start of the body first.
        content_type = response.headers.get("Content-Type", "unknown")
        print(
            "❌ The server responded, but the body is not a PDF "
            f"(Content-Type: {content_type}). Nothing was saved."
        )
    else:
        # Write the raw bytes of the response in binary mode ('wb').
        with open(pdf_filename, "wb") as pdf_file:
            pdf_file.write(response.content)

        # Confirmation message with the absolute path of the saved file.
        print(f"✅ Success! PDF saved as: {os.path.abspath(pdf_filename)}")

except requests.exceptions.RequestException as e:
    # Handle connection errors, DNS errors, timeouts, etc.
    print(f"❌ An error occurred during the request: {e}")

except IOError as e:
    # Handle file writing errors
    print(f"❌ An error occurred while writing the file: {e}")

Attempting to download PDF from: https://arxiv.org/pdf/1709.00666.pdf
✅ Success! PDF saved as: /content/ch02-downloaded.pdf


In [7]:
!pip install pdfplumber
import pdfplumber
import os

# --- Configuration (Must match the name used in the download script) ---
pdf_filename = "ch02-downloaded.pdf"
text = ""

# Check if the PDF file exists before trying to open it
if not os.path.exists(pdf_filename):
    print(f"❌ Error: File not found. Please ensure '{pdf_filename}' was downloaded successfully in the previous step.")
else:
    try:
        print(f"Reading text from: {pdf_filename}...")

        with pdfplumber.open(pdf_filename) as pdf:
            for i, page in enumerate(pdf.pages):
                # Using += might be memory-intensive for huge files,
                # but is fine for this example.
                extracted_page_text = page.extract_text()
                if extracted_page_text:
                    text += extracted_page_text + "\n"

        # Check if any text was extracted
        if text:
            print(f"✅ Success! Extracted {len(text)} characters.")
            print("\n--- First 20 characters of the extracted text ---")
            print(text[0:20])
        else:
            print("⚠️ Warning: Successfully opened the PDF, but no text could be extracted.")

    except Exception as e:
        print(f"❌ An unexpected error occurred while processing the PDF: {e}")

Collecting pdfplumber
  Downloading pdfplumber-0.11.8-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20251107 (from pdfplumber)
  Downloading pdfminer_six-20251107-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.1.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.7/67.7 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.8-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20251107-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [9]:
# If you run into ModuleNotFoundError, uncomment the line below and run it first
!pip install google-genai neo4j python-dotenv

# Use IPython/Jupyter magic commands to load environment variables from .env
# This requires a file named .env in the same directory as the notebook
%load_ext dotenv
%dotenv

import os
from google import genai
from neo4j import GraphDatabase

# --- 1. Set up the Gemini (GenAI) client ---

# Retrieve API key from environment variables (loaded by %dotenv)
gemini_api_key = os.getenv("GEMINI_API_KEY")

if not gemini_api_key:
    # Print a clear error if the key is missing
    print("❌ ERROR: GEMINI_API_KEY not found in environment variables.")
    print("Please ensure you have a .env file with GEMINI_API_KEY='YOUR_API_KEY'.")
    genai_client = None
else:
    print("✅ GEMINI_API_KEY loaded successfully.")
    # Initialize the client
    genai_client = genai.Client(api_key=gemini_api_key)
    print("✅ Gemini Client initialized.")

# NOTE: You would typically add the Neo4j setup here as well,
# but for now, we focus on fixing the Gemini client part.

# Example of how you would initialize Neo4j (if needed later)
# NEO4J_URI = os.getenv("NEO4J_URI")
# NEO4J_USER = os.getenv("NEO4J_USER")
# NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
# neo4j_driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

Collecting neo4j
  Downloading neo4j-6.0.3-py3-none-any.whl.metadata (5.2 kB)
Downloading neo4j-6.0.3-py3-none-any.whl (325 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.4/325.4 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neo4j
Successfully installed neo4j-6.0.3
The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv
cannot find .env file
❌ ERROR: GEMINI_API_KEY not found in environment variables.
Please ensure you have a .env file with GEMINI_API_KEY='YOUR_API_KEY'.
