In [None]:
!pip install -q pymupdf pytesseract
# Note: on Windows the Tesseract OCR engine must be installed separately.
# Download it from: https://github.com/UB-Mannheim/tesseract/wiki

# Libraries

In [8]:
import os
import fitz  # PyMuPDF
import cv2
import pytesseract
import re
import requests
from urllib.parse import urlparse
# Path to the Tesseract executable used by pytesseract. The default is the
# standard Windows install location; set the TESSERACT_CMD environment variable
# to point at a different binary (e.g. "tesseract" on Linux/macOS) without
# editing the notebook.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)
# Tesseract language code passed to every OCR call ('vie' = Vietnamese).
OCR_LANG = 'vie'

# Utils

In [None]:
def download_pdf_from_url(url, save_dir="input", chunk_size=8192, timeout=30):
    """Download a PDF to ``save_dir`` and return the local file path.

    The local file name is the last path component of ``url``; when that
    component does not end in ``.pdf`` (compared case-insensitively) the
    file is stored as ``document.pdf``. If the target file already exists
    nothing is downloaded and the existing path is returned.

    Args:
        url: Address of the PDF to fetch.
        save_dir: Directory to store the file in; created if missing.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Seconds to wait for the server before giving up, passed
            to ``requests.get``.

    Returns:
        Path of the downloaded (or previously cached) file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    os.makedirs(save_dir, exist_ok=True)
    filename = os.path.basename(urlparse(url).path)
    if not filename.lower().endswith(".pdf"):
        filename = "document.pdf"

    save_path = os.path.join(save_dir, filename)
    if os.path.exists(save_path):
        print(f"PDF already exists: {save_path}")
        return save_path

    print("Download PDF...")
    # Stream into a temporary sibling file and rename only on success, so an
    # interrupted download can never be mistaken for a cached PDF next run.
    partial_path = save_path + ".part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, save_path)
    finally:
        # After a successful rename the partial file no longer exists.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print(f"Saved: {save_path}")
    return save_path

In [None]:
def pdf_to_images(pdf_path, out_dir="pdf_pages", dpi=200):
    """Render every page of a PDF to a PNG file and return the image paths.

    Args:
        pdf_path: Path of the PDF to render.
        out_dir: Directory that receives the images; created if missing.
        dpi: Resolution passed to ``page.get_pixmap``.

    Returns:
        List of image paths in page order, named ``page_001.png``,
        ``page_002.png``, ... inside ``out_dir``.
    """
    os.makedirs(out_dir, exist_ok=True)

    image_paths = []
    # Open the document as a context manager so the file handle is released
    # even if rendering a page fails.
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            pix = page.get_pixmap(dpi=dpi)
            img_path = f"{out_dir}/page_{i+1:03d}.png"
            pix.save(img_path)
            image_paths.append(img_path)

    print(f"Converted {len(image_paths)} pages to images")
    return image_paths

In [None]:
def ocr_page_text(image_path):
    """Run Tesseract OCR on one image and return its text on a single line.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognised text with every run of whitespace collapsed to one
        space.

    Raises:
        OSError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with the offending path rather than deep inside pytesseract.
    if img is None:
        raise OSError(f"Could not read image: {image_path}")
    text = pytesseract.image_to_string(img, lang=OCR_LANG)
    return re.sub(r'\s+', ' ', text)  # Clean up whitespace

In [None]:
# Source PDF to process.
PDF_URL = "https://84864e12bc.vws.vegacdn.vn//data/doc/2025/thcslienninh/2025_2/26/sach-bai-tap-toan-8-tap-1-ket-noi-tri-thuc-voi-cuoc-song_262202515.pdf"

# Fetch the PDF into ./input, then render its pages into output/pages at 200 DPI.
pdf_file_path = download_pdf_from_url(PDF_URL, save_dir="./input")
page_images = pdf_to_images(pdf_file_path, out_dir="output/pages", dpi=200)

PDF already exists: ./input\sach-bai-tap-toan-8-tap-1-ket-noi-tri-thuc-voi-cuoc-song_262202515.pdf
Converted 113 pages to images


In [None]:
# Regexes that recognise a reference to a figure. They are deliberately loose
# so that they tolerate OCR errors, and all are matched case-insensitively.
_FIGURE_REGEXES = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'Hình\s*\d+[.\s]*\d+',          # "Hình 5.1", "Hình 53"
        r'\(H\.?\d+(?:\.\d+)?\)',        # "(H.3.4)", "(H3.5)"
        r'trong\s+(?:các\s+)?hình\s+(?:vẽ\s+)?(?:sau|trên)',  # "trong hình vẽ sau"
    )
)
# Marker that introduces a worked solution.
_SOLUTION_REGEX = re.compile(r'Giải', re.IGNORECASE)
# Exercise number followed by two dots, e.g. "1.12..".
_PROBLEM_NUMBER_REGEX = re.compile(r'(\d+\.\d+)\.\.')
# Bullet/punctuation noise at the start of a question.
_LEADING_JUNK_REGEX = re.compile(r'^[-@•()\s=]+')
# Characters of context kept on each side of a figure reference in the fallback.
_CONTEXT_CHARS = 200


def _find_figure_refs(fragment):
    """Return all figure references in ``fragment``, pattern by pattern, duplicates included."""
    refs = []
    for regex in _FIGURE_REGEXES:
        refs.extend(regex.findall(fragment))
    return refs


def _unique_in_order(items):
    """Drop duplicates while keeping first-occurrence order (deterministic, unlike ``set``)."""
    return list(dict.fromkeys(items))


def _split_by_problem_number(text, problem_starts):
    """Split ``text`` at each exercise number and keep the segments that mention a figure."""
    results = []
    for i, match in enumerate(problem_starts):
        end_pos = problem_starts[i + 1].start() if i + 1 < len(problem_starts) else len(text)
        segment = text[match.start():end_pos].strip()

        found_figures = _find_figure_refs(segment)
        if not found_figures:
            continue

        # A solution marker inside the segment separates question from solution.
        marker = _SOLUTION_REGEX.search(segment)
        if marker:
            question_text = segment[:marker.start()].strip()
            solution_text = segment[marker.end():].strip()
        else:
            question_text = segment
            solution_text = ""

        # Strip leading noise, then the exercise number itself.
        question_text = _LEADING_JUNK_REGEX.sub('', question_text).strip()
        question_text = re.sub(r'^\d+\.\d+\.\.', '', question_text).strip()

        results.append({
            'problem_number': match.group(1),
            'content': segment,
            'question': question_text,
            'solution': solution_text,
            'figures': _unique_in_order(found_figures),
            'has_solution': bool(marker)
        })
    return results


def _split_by_solution_marker(text, solution_markers):
    """Pair the text before each solution marker (question) with the text after it (solution)."""
    results = []
    problem_num = 0
    for idx, marker in enumerate(solution_markers):
        # The question starts where the previous marker ended.
        search_start = solution_markers[idx - 1].end() if idx > 0 else 0
        question_segment = text[search_start:marker.start()].strip()

        found_figures = _find_figure_refs(question_segment)
        if not found_figures:
            continue
        problem_num += 1

        # The solution runs up to the next marker, or to the end of the text.
        if idx + 1 < len(solution_markers):
            solution_end = solution_markers[idx + 1].start()
        else:
            solution_end = len(text)
        solution_text = text[marker.end():solution_end].strip()

        question_text = _LEADING_JUNK_REGEX.sub('', question_segment).strip()

        # Keep the trailing sentences back to (and including) the last one that
        # mentions a figure. Splitting on '.' also cuts dotted references such
        # as "(H.3)", so for those no fragment matches and the whole question
        # is kept.
        kept = []
        for sentence in reversed(question_text.split('.')):
            kept.insert(0, sentence)
            if any(regex.search(sentence) for regex in _FIGURE_REGEXES):
                break
        question_text = '.'.join(kept).strip()

        results.append({
            'problem_number': str(problem_num),
            'content': text[search_start:solution_end].strip(),
            'question': question_text,
            'solution': solution_text,
            'figures': _unique_in_order(found_figures),
            'has_solution': True
        })
    return results


def _split_by_figure_context(text):
    """Emit one entry per figure reference, using a fixed window of surrounding text."""
    hits = sorted(
        (
            (m.start(), m.end(), m.group())
            for regex in _FIGURE_REGEXES
            for m in regex.finditer(text)
        ),
        key=lambda hit: hit[0],
    )

    results = []
    for number, (start, end, fig_ref) in enumerate(hits, 1):
        context_start = max(0, start - _CONTEXT_CHARS)
        context_end = min(len(text), end + _CONTEXT_CHARS)
        segment = text[context_start:context_end].strip()
        results.append({
            'problem_number': str(number),
            'content': segment,
            'question': segment,
            'solution': "",
            'figures': [fig_ref],
            'has_solution': False
        })
    return results


def extract_problems_with_figures(text):
    """Extract the problems in ``text`` that refer to a figure.

    Exactly one splitting strategy is used, chosen in this order:

    1. If exercise numbers of the form ``X.Y..`` occur, the text is split at
       each number.
    2. Otherwise, if the solution marker "Giải" occurs, the text before each
       marker is the question and the text after it the solution.
    3. Otherwise every figure reference yields one entry holding up to 200
       characters of context on each side.

    Args:
        text: Text of one page.

    Returns:
        A list of dicts with the keys ``problem_number`` (str), ``content``,
        ``question``, ``solution``, ``figures`` (figure references without
        duplicates, in order of discovery) and ``has_solution`` (bool).
        The list is empty when no figure reference is found.
    """
    problem_starts = list(_PROBLEM_NUMBER_REGEX.finditer(text))
    if problem_starts:
        return _split_by_problem_number(text, problem_starts)

    solution_markers = list(_SOLUTION_REGEX.finditer(text))
    if solution_markers:
        return _split_by_solution_marker(text, solution_markers)

    return _split_by_figure_context(text)

# Starting

In [71]:
# Directory holding the rendered page images (one PNG per PDF page).
pages_dir = os.path.join("output", "pages")

if not os.path.exists(pages_dir):
    print(f"Not found: {pages_dir}")
else:
    # Get all page image files. Page numbers below are assigned from the
    # sorted (lexicographic) file-name order.
    page_images = sorted(
        os.path.join(pages_dir, f)
        for f in os.listdir(pages_dir)
        if f.endswith('.png')
    )

    print("OCR text from pages")
    print(f"Folder: {pages_dir}")
    print(f"Total pages: {len(page_images)}\n")

    # OCR each page; page_texts maps 1-based page number -> recognised text.
    page_texts = {}
    for page_num, page_img in enumerate(page_images, 1):
        print(f"[{page_num}/{len(page_images)}] OCR {os.path.basename(page_img)}...", end=" ")
        text = ocr_page_text(page_img)
        page_texts[page_num] = text
        # len(text) is a character count, so label it as such.
        print(f"({len(text)} chars)")

    # Extract questions with figure references
    print("Split the question with (H.xx)")

    # Flat list of problems from all pages, each tagged with its page number.
    all_problems = []
    for page_num, text in page_texts.items():
        problems = extract_problems_with_figures(text)
        if problems:
            print(f"Page {page_num}: {len(problems)} question")
        for prob in problems:
            prob['page_number'] = page_num
            all_problems.append(prob)

    print(f"\nTotal: {len(all_problems)} questions have diagrams")
    print(f"Pages: {len(page_texts)}")

OCR text from pages
Folder: output\pages
Total pages: 113

[1/113] OCR page_001.png... (30 words)
[2/113] OCR page_002.png... (0 words)
[3/113] OCR page_003.png... (1672 words)
[4/113] OCR page_004.png... (279 words)
[5/113] OCR page_005.png... (1127 words)
[6/113] OCR page_006.png... (1185 words)
[7/113] OCR page_007.png... (1214 words)
[8/113] OCR page_008.png... (1011 words)
[9/113] OCR page_009.png... (1107 words)
[10/113] OCR page_010.png... (893 words)
[11/113] OCR page_011.png... (424 words)
[12/113] OCR page_012.png... (884 words)
[13/113] OCR page_013.png... (1068 words)
[14/113] OCR page_014.png... (484 words)
[15/113] OCR page_015.png... (1150 words)
[16/113] OCR page_016.png... (1191 words)
[17/113] OCR page_017.png... (1254 words)
[18/113] OCR page_018.png... (1375 words)
[19/113] OCR page_019.png... (811 words)
[20/113] OCR page_020.png... (875 words)
[21/113] OCR page_021.png... (668 words)
[22/113] OCR page_022.png... (780 words)
[23/113] OCR page_023.png... (572 words)

# Save

In [None]:
import json
from datetime import datetime

# Destination directory for the JSON export; created on demand.
output_folder = os.path.join("..", "dataset", "output", "mapped_result")
os.makedirs(output_folder, exist_ok=True)

# Tally solved and unsolved problems once, up front.
solved_flags = [bool(p.get('has_solution', False)) for p in all_problems]
solved_count = solved_flags.count(True)
unsolved_count = solved_flags.count(False)

# One export record per problem, with ids starting at 1.
problem_records = [
    {
        "id": problem_id,
        "problem_number": prob['problem_number'],
        "page_number": prob.get('page_number'),
        "question": prob['question'],
        "solution": prob.get('solution', ''),
        "figure_references": prob['figures'],
        "has_solution": prob.get('has_solution', False)
    }
    for problem_id, prob in enumerate(all_problems, 1)
]

# Run-level metadata followed by the records themselves.
output_data = {
    "metadata": {
        "pdf_path": pdf_file_path,
        "total_pages": len(page_texts),
        "total_problems": len(all_problems),
        "problems_with_solution": solved_count,
        "problems_without_solution": unsolved_count,
        "processed_at": datetime.now().isoformat()
    },
    "problems": problem_records
}

# ensure_ascii=False keeps the Vietnamese text readable in the file.
output_file = os.path.join(output_folder, "problems_with_figures.json")
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(output_data, f, ensure_ascii=False, indent=2)

print(f"Folder: {output_folder}")
print(f"File: problems_with_figures.json")
print(f"   Total problems: {output_data['metadata']['total_problems']}")
print(f"   ├─ With solution: {output_data['metadata']['problems_with_solution']}")
print(f"   └─ Without solution: {output_data['metadata']['problems_without_solution']}")
print(f"   Total pages: {output_data['metadata']['total_pages']}")

Folder: ..\dataset\output\mapped_result
File: problems_with_figures.json
   Total problems: 112
   ├─ With solution: 14
   └─ Without solution: 98
   Total pages: 113
