In [None]:
# core libs - chỉ cài nếu chưa có
import importlib.util
import subprocess
import sys

def install_if_missing(package_name, pip_name=None):
    """Install a package with pip unless it is already importable.

    Args:
        package_name: Import name checked with ``importlib.util.find_spec``
            (e.g. ``"cv2"``).
        pip_name: Requirement string handed to ``pip install``
            (e.g. ``"opencv-python-headless"``). Defaults to ``package_name``.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command fails.
    """
    if pip_name is None:
        pip_name = package_name

    # find_spec() raises instead of returning None in two situations:
    #  - ModuleNotFoundError: dotted name whose parent package is missing;
    #  - ValueError: the module is already imported but has __spec__ = None.
    try:
        is_installed = importlib.util.find_spec(package_name) is not None
    except ModuleNotFoundError:
        is_installed = False
    except ValueError:
        is_installed = package_name in sys.modules

    if is_installed:
        print(f"{package_name} already installed, skipping...")
        return

    print(f"Installing {pip_name}...")
    # Use the kernel's own interpreter so pip targets the active environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pip_name])
    # Make the freshly installed package visible to the import system of the
    # already-running interpreter.
    importlib.invalidate_caches()

# (import name, pip requirement) pairs for every third-party package the
# notebook imports. The import name is what find_spec() checks; the pip
# requirement is what gets installed when that check fails.
packages = [
    ("fitz", "PyMuPDF==1.23.8"),
    ("pdf2image", "pdf2image"),
    ("cv2", "opencv-python-headless"),
    ("numpy", "numpy"),
    ("tqdm", "tqdm"),
    ("requests", "requests"),
    ("paddleocr", "paddleocr"),
    # Python wrapper only; the Tesseract binary must be installed separately.
    ("pytesseract", "pytesseract"),
    ("paddle", "paddlepaddle"),
    # Imported by the "Libraries" cell but previously never checked here.
    ("PIL", "Pillow"),
    ("matplotlib", "matplotlib"),
]

for package, pip_name in packages:
    install_if_missing(package, pip_name)

print("\nAll packages ready!")

fitz already installed, skipping...
pdf2image already installed, skipping...
cv2 already installed, skipping...
numpy already installed, skipping...
tqdm already installed, skipping...
requests already installed, skipping...
paddleocr already installed, skipping...
pytesseract already installed, skipping...
paddle already installed, skipping...

All packages ready!


# Libraries

In [2]:
import requests
import fitz  # PyMuPDF
import io
import os
import json
import re
from tqdm.auto import tqdm
import numpy as np
import pytesseract
# Locate the Tesseract executable without assuming one specific machine.
# Resolution order:
#   1. TESSERACT_CMD environment variable (explicit override),
#   2. the legacy Windows install location, when that file exists,
#   3. whatever `tesseract` is found on PATH,
#   4. the legacy path again, so a missing binary still fails with the
#      same path in pytesseract's error message as before.
import shutil

_DEFAULT_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
pytesseract.pytesseract.tesseract_cmd = (
    os.environ.get("TESSERACT_CMD")
    or (_DEFAULT_TESSERACT_CMD if os.path.isfile(_DEFAULT_TESSERACT_CMD) else None)
    or shutil.which("tesseract")
    or _DEFAULT_TESSERACT_CMD
)



import cv2
import matplotlib.pyplot as plt
from PIL import Image

# Config

In [None]:
# Rendering resolution (dots per inch) for page images.
DPI = 200
# Area threshold for figures; not referenced by the cells shown here.
FIGURE_MIN_AREA = 2000
# Lower-case substrings that mark a page as geometry-related. Matching is a
# plain `keyword in text` test against lower-cased page text, so every entry
# must be lower-case. Entries are unique so that a keyword is reported at
# most once per page in the matched-keyword list.
GEOMETRY_KEYWORDS = [
    # Original keywords
    "tam giác", "đường tròn", "đoạn thẳng", "góc", "vuông góc", "song song", "trung điểm",
    "tiếp tuyến", "bán kính", "hình chữ nhật", "hình vuông", "hình thang", "chu vi",
    "diện tích", "đỉnh", "chân", "đường cao", "phân giác", "điểm", "giao điểm", "tâm",

    # New keywords - taken from the table of contents
    # ("hình vuông" / "hình chữ nhật" are already listed above)
    "hình", "hình chóp", "tứ giác", "pythagore", "pitago", "định lí", "định lý",
    "hình bình hành", "hình thoi",
    "thể tích", "xung quanh", "đáy", "cạnh bên",

    # English
    "triangle", "circle", "rectangle", "square", "parallelogram", "trapezoid",
    "pythagoras", "theorem", "volume", "area", "perimeter",

    # Abbreviations and symbols
    "abc", "abcd", "∆", "∠", "⊥", "//", "°",
]

In [20]:
# Source textbook PDF and the name of the JSON results file.
PDF_URL = "https://8486fef5bc.vws.vegacdn.vn/data/doc/2025/thcschuvananq1/2025_2/3/sach-giao-khoa-toan-8-tap-1-chan-troi-sang-tao_3220251.pdf"
OUTPUT_JSON = "geometry_extracted.json"

# Download the whole PDF into memory. The timeout stops the cell from hanging
# forever on a stalled connection, and raise_for_status() turns an HTTP error
# (404, 500, ...) into an exception here instead of handing an error page to
# the PDF parser in the next cell.
response = requests.get(PDF_URL, timeout=60)
response.raise_for_status()
pdf_bytes = response.content

In [21]:
# Parse the downloaded bytes as an in-memory PDF document.
doc = fitz.open("pdf", pdf_bytes)
total_pages = len(doc)
print(f"PDF loaded. Total pages: {total_pages}")

PDF loaded. Total pages: 130


# Extract text from PDF

In [7]:
def pdf_to_images(doc, dpi=200):
    """Render every page of ``doc`` to a PIL image.

    Each page is rasterised with ``page.get_pixmap(dpi=dpi)``, encoded as PNG
    bytes, and reopened through ``Image.open``.

    Args:
        doc: Iterable of pages exposing ``get_pixmap`` (the opened PDF).
        dpi: Resolution passed to ``get_pixmap``.

    Returns:
        A list of ``(page_index, image)`` tuples in page order, where
        ``page_index`` is zero-based.
    """
    def _render(page):
        png_bytes = page.get_pixmap(dpi=dpi).tobytes("png")
        return Image.open(io.BytesIO(png_bytes))

    return [(page_index, _render(page)) for page_index, page in enumerate(doc)]

# Render with the resolution from the config cell instead of repeating the
# literal, so changing DPI there actually changes the rendered images.
images = pdf_to_images(doc, dpi=DPI)
print(f"Total pages converted to images: {len(images)}")

Total pages converted to images: 130


In [10]:
# Directory (relative to the working directory) where images of matching
# pages and the detail JSON are written.
geometry_pages_dir = "geometry_pages"
# exist_ok=True keeps this cell safe to re-run when the directory already exists.
os.makedirs(geometry_pages_dir, exist_ok=True)

In [22]:
# Scan every rendered page for geometry keywords, using both the PDF's
# embedded text layer and OCR of the page image. Matching pages are saved as
# PNG files and summarised in a JSON file.
geometry_pages = []      # zero-based indices of matching pages
geometry_results = []    # one summary dict per matching page
ocr_failures = 0         # pages where OCR raised and only embedded text was used

num_pages_to_check = len(images)
print(f"Checking {num_pages_to_check} pages\n")

for page_num, img in tqdm(images, desc="Processing pages"):
    # Method 1: Extract text directly from PDF
    text_pdf = doc[page_num].get_text().lower()

    # Method 2: OCR to extract text from image.
    # OCR stays best-effort (a failing page falls back to the embedded text),
    # but the failure is reported rather than silently swallowed, and
    # KeyboardInterrupt is no longer caught so the loop can be stopped.
    try:
        text_ocr = pytesseract.image_to_string(img, lang='vie+eng').lower()
    except Exception as exc:
        ocr_failures += 1
        if ocr_failures == 1:
            # Report only the first failure in detail to avoid flooding the output.
            print(f"\nOCR failed on page {page_num+1}: {exc!r} - using PDF text only")
        text_ocr = ""

    # Combine both methods
    text = text_pdf + " " + text_ocr

    # Check for geometry keywords (computed once, reused for the test and the report)
    matched_kw = [kw for kw in GEOMETRY_KEYWORDS if kw in text]
    if not matched_kw:
        continue

    geometry_pages.append(page_num)

    # Save page image (file names use one-based page numbers)
    img_path = os.path.join(geometry_pages_dir, f"page_{page_num+1}.png")
    img.save(img_path)

    # Save page information; text previews are truncated to 250 characters
    geometry_results.append({
        "page": page_num + 1,
        "image_path": img_path,
        "detected_text_pdf": text_pdf[:250],
        "detected_text_ocr": text_ocr[:250],
        "matched_keywords": matched_kw
    })

    print(f"\nPage {page_num+1}: FOUND - {matched_kw[:3]}...")

if ocr_failures:
    print(f"\nWARNING: OCR failed on {ocr_failures} of {num_pages_to_check} pages")

print(f"RESULTS: Found {len(geometry_pages)} geometry pages")
print(f"Pages: {[p+1 for p in geometry_pages]}")

if geometry_pages:
    print(f"\nImages saved at: {os.path.abspath(geometry_pages_dir)}")

    # Save JSON file (ensure_ascii=False keeps Vietnamese text readable)
    detail_json = os.path.join(geometry_pages_dir, "detail.json")
    with open(detail_json, "w", encoding="utf-8") as f:
        json.dump(geometry_results, f, ensure_ascii=False, indent=2)
    print(f"Details saved at: {detail_json}")
else:
    print(f"\nNo geometry pages found in {num_pages_to_check} pages")


Checking 130 pages



Processing pages:   0%|          | 0/130 [00:00<?, ?it/s]


Page 4: FOUND - ['pythagore']...

Page 5: FOUND - ['pythagore', 'xung quanh']...

Page 5: FOUND - ['pythagore', 'xung quanh']...

Page 7: FOUND - ['°']...

Page 7: FOUND - ['°']...

Page 8: FOUND - ['°']...

Page 8: FOUND - ['°']...

Page 9: FOUND - ['°']...

Page 9: FOUND - ['°']...

Page 10: FOUND - ['°']...

Page 10: FOUND - ['°']...

Page 11: FOUND - ['°']...

Page 11: FOUND - ['°']...

Page 12: FOUND - ['xung quanh']...

Page 12: FOUND - ['xung quanh']...

Page 13: FOUND - ['°']...

Page 13: FOUND - ['°']...

Page 14: FOUND - ['°']...

Page 14: FOUND - ['°']...

Page 15: FOUND - ['°']...

Page 15: FOUND - ['°']...

Page 16: FOUND - ['°']...

Page 16: FOUND - ['°']...

Page 17: FOUND - ['°']...

Page 17: FOUND - ['°']...

Page 18: FOUND - ['chu vi', '°']...

Page 18: FOUND - ['chu vi', '°']...

Page 20: FOUND - ['°']...

Page 20: FOUND - ['°']...

Page 21: FOUND - ['°']...

Page 21: FOUND - ['°']...

Page 22: FOUND - ['°']...

Page 22: FOUND - ['°']...

Page 23: FOUND - ['°']...

