1a

In [None]:
!pip install PyMuPDF numpy sentence-transformers torch



In [None]:
import os
import json
import re
from pathlib import Path
import fitz  # PyMuPDF
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch
from collections import defaultdict, Counter

# --- Heading Extraction Heuristics ---
# General noise patterns, excluding page numbers which will be handled by H/F detection or more specific noise rules
# Regexes that mark a line as general noise. is_noise() applies each one with
# re.match() to the stripped line text, so every pattern is anchored at the start.
SKIP_LINE_PATTERNS = [
    r'^\.{3,}$',            # Ellipses
    r'^\s*[\W_]+\s*$',      # Lines with only punctuation/symbols/underscores (e.g., "---")
    r'^[A-Z]{1,3}\s*$',     # Short uppercase lines (e.g., section markers 'I', 'II', 'A') - still useful for one-letter elements
    r'^\s*\[\s*\d+\s*\]\s*$', # Citation numbers like "[9]"
    r'^\s*(\([a-zA-Z0-9]+\)|\d+\))\s*$', # Bare list item markers like (a), (i), 1). score_line() checks is_noise() before HEADING_PATTERNS, so a line made up only of such a marker is rejected
    r'^(Fig\.|Figure)\s*\d+[.:]?\s*', # Figure captions that might be mistaken for headings
    r'^Table\s*\d+[.:]?\s*',     # Table captions
]

# Substrings looked for in a lower-cased, hyphen-stripped font name by is_bold().
BOLD_INDICATORS = ['bold', 'black', 'heavy', 'semibold', 'demi', 'demibold', 'extrabold']
# NOTE(review): not referenced anywhere in the visible part of this file.
ITALIC_INDICATORS = ['italic', 'oblique']

# More specific patterns for actual headings, distinguishing from list items.
# score_line() tries them in this insertion order and stops at the first match, so a
# single letter that is also a Roman numeral (C, D, I, L, M, V, X) is classified as
# 'roman_numerals' before 'alphabetic' is ever tried.
HEADING_PATTERNS = {
    'roman_numerals': r'^[IVXLCDM]+\.?\s+', # Roman numerals followed by space (e.g., I. Introduction)
    'alphabetic': r'^[A-Z]\.?\s+',        # Single uppercase letter followed by space (e.g., A. Section)
    'numeric_simple': r'^\d+\.?\s+',       # Single number followed by space (e.g., 1. Chapter)
    'numeric_multilevel': r'^\d+(\.\d+)+\.?\s+', # Multi-level numbers (e.g., 1.1, 2.1.3)
}

# Sentence-embedding model used by select_best_title(). It is constructed at module
# level, so the model is instantiated as soon as this cell/module is executed.
EMBED_MODEL = SentenceTransformer('intfloat/e5-small')

# --- Header/Footer Detection Global Variables ---
# Confirmed header/footer fingerprints, rebuilt per document by identify_headers_footers().
# Each element is a tuple:
#   (text_segment, rounded_y, rounded_font_size, is_bold, rounded_x)
# score_line() rejects a line when it sits in the same spatial/style slot (within the
# tolerances below) and its normalized text contains text_segment.
HEADER_FOOTER_CANDIDATES = set() # Use a set for unique confirmed H/F segments

# Intermediate storage for H/F identification
# Shape: slot_key (rounded_y, rounded_font_size, is_bold, rounded_x) -> page_num -> list of raw_text
# NOTE(review): identify_headers_footers() builds its own local mapping of this shape;
# this module-level instance is not referenced in the visible part of the file.
HEADER_FOOTER_SPATIAL_SLOTS = defaultdict(lambda: defaultdict(list)) # slot_key -> page_num -> list of raw_text

# Configuration for H/F detection
HEADER_FOOTER_MIN_PAGES_REQUIRED = 2 # Minimum number of pages a pattern must appear on
HEADER_FOOTER_SAMPLE_PAGES = 20 # Max pages to scan for H/F identification (e.g., first 20 for better statistical robustness)

# Tolerances used by score_line() when comparing a line against a stored fingerprint.
# NOTE(review): the names say "PX"; the values are compared against PyMuPDF bbox and
# font-size numbers, whose unit is presumably PDF points - confirm.
HEADER_FOOTER_Y_TOLERANCE_PX = 4 # Tolerance for Y-coordinate comparison (slightly reduced for tighter clusters)
HEADER_FONT_SIZE_TOLERANCE_PX = 0.75 # Font size tolerance for H/F comparison
HEADER_X_COORD_TOLERANCE_PX = 6 # X-coordinate tolerance for H/F comparison
# Minimum fraction of the sampled pages on which a slot must contain text.
HEADER_FOOTER_THRESHOLD_PAGES_RATIO = 0.5

# Vertical bands, as fractions of the page height, in which spans are considered for H/F detection.
HEADER_REGION_START_Y_RATIO = 0.0 # From top of page
HEADER_REGION_END_Y_RATIO = 0.15 # End of header region (e.g., top 15%)
FOOTER_REGION_START_Y_RATIO = 0.85 # Start of footer region (e.g., bottom 15% - 100%)
FOOTER_REGION_END_Y_RATIO = 1.0 # To bottom of page

# Regex patterns for normalizing common dynamic parts within H/F for more robust matching
PAGE_INFO_REGEX_NORM = r'(page\s*\d+\s*(of\s*\d+)?|pg\.\s*\d+|-\s*\d+\s*-|\s*\d+\s*-\s*|^\s*\d+\s*$)' # Covers "page X", "page X of Y", "pg. X", "- X -", "X -" and standalone "X"
DATE_TIME_REGEX_NORM = r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2},?\s+\d{4}|\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}|(?:\d{1,2}:?\d{1,2}\s*(?:AM|PM)?)\b'
VERSION_NUM_REGEX_NORM = r'(v(?:ersion)?\s*\d+(?:\.\d+)*)' # e.g., "Version 1.0", "v2.3"
COPYRIGHT_REGEX_NORM = r'©|copyright|all rights reserved'
GUID_REGEX_NORM = r'\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b' # Matches GUIDs

def normalize_hf_text_for_fingerprint(text):
    """
    Build a stable fingerprint for header/footer text.

    The input is stripped and lower-cased, after which the variable pieces
    (page info, dates/times, version numbers, copyright marks, GUIDs and any
    leftover standalone numbers) are swapped for fixed placeholder tokens and
    runs of whitespace are collapsed to a single space.

    Note that the placeholder tokens themselves are upper-case (e.g. '#PAGENUM#')
    even though the surrounding text is lower-cased.
    """
    # Order matters: each substitution sees the output of the previous one.
    placeholder_rules = (
        (PAGE_INFO_REGEX_NORM, '#PAGENUM#'),
        (DATE_TIME_REGEX_NORM, '#DATETIME#'),
        (VERSION_NUM_REGEX_NORM, '#VERSION#'),
        (COPYRIGHT_REGEX_NORM, '#COPYRIGHT#'),
        (GUID_REGEX_NORM, '#GUID#'),
    )

    fingerprint = text.strip().lower()
    for pattern, placeholder in placeholder_rules:
        fingerprint = re.sub(pattern, placeholder, fingerprint, flags=re.IGNORECASE)

    # Whatever isolated numbers survived the rules above (serial numbers etc.)
    fingerprint = re.sub(r'\b\d+\b', '#NUM#', fingerprint)

    # Collapse whitespace last so placeholder insertion cannot leave double spaces
    return re.sub(r'\s+', ' ', fingerprint).strip()

def extract_fixed_substrings(text_list, min_len=5, min_occurrence_ratio=0.7):
    """
    Finds text fragments that recur across the strings collected from one H/F slot.

    Two kinds of fragment are returned (all lower-cased, in no particular order):

    1. Every substring of the FIRST string whose stripped length is at least
       ``min_len`` and which occurs in at least ``min_occurrence_ratio`` of all
       strings. Note that this yields every qualifying substring, not only the
       longest ones.
    2. Space-joined runs of consecutive "common" words, where a word is common
       when its total occurrence count divided by the number of strings reaches
       ``min_occurrence_ratio``.

    Args:
        text_list: Raw strings from one slot; may be empty.
        min_len: Minimum length of a returned fragment.
        min_occurrence_ratio: Fraction of strings a fragment must appear in.

    Returns:
        A list of unique fragments; an empty list when ``text_list`` is empty.
    """
    if not text_list:
        return []

    total = len(text_list)
    # Lower-case and tokenise each text once. The substring scan below is
    # quadratic in the length of the first text, so redoing this per candidate
    # substring (as before) multiplied the cost for no benefit.
    lowered_texts = [t.lower() for t in text_list]
    tokenized_texts = [re.findall(r'\b\w+\b', t) for t in lowered_texts]

    word_counts = Counter(word for words in tokenized_texts for word in words)
    common_words = {
        word for word, count in word_counts.items()
        if count / total >= min_occurrence_ratio
    }

    fixed_segments = set()
    reference = text_list[0]
    for start in range(len(reference)):
        for end in range(start + min_len, len(reference) + 1):
            segment = reference[start:end].lower()
            stripped = segment.strip()
            if not stripped or len(stripped) < min_len:
                continue

            # Containment is tested with the unstripped segment; only the
            # stored value is stripped.
            hits = sum(1 for other in lowered_texts if segment in other)
            if hits / total >= min_occurrence_ratio:
                fixed_segments.add(stripped)

    # Also add runs of consecutive common words from every text
    if common_words:
        for words in tokenized_texts:
            run = []
            for word in words:
                if word in common_words:
                    run.append(word)
                    continue
                joined = " ".join(run)
                if len(joined) >= min_len:
                    fixed_segments.add(joined)
                run = []
            joined = " ".join(run)
            if len(joined) >= min_len:
                fixed_segments.add(joined)

    return list(fixed_segments)

# --- Helper Functions (continued) ---

def compute_font_percentiles(spans):
    """
    Map every span to the percentile rank of its font size.

    The rank of a size is the integer part of the percentage of spans whose
    size is less than or equal to it. The result is keyed by ``id(span)``, so
    it is only meaningful while the very same span objects are alive.
    An empty input yields an empty dict.
    """
    if not spans:
        return {}

    all_sizes = np.array([span['size'] for span in spans])
    span_count = len(all_sizes)

    percentile_by_size = {}
    for distinct_size in np.sort(np.unique(all_sizes)):
        at_or_below = np.sum(all_sizes <= distinct_size)
        percentile_by_size[distinct_size] = int(100 * at_or_below / span_count)

    ranks = {}
    for span in spans:
        ranks[id(span)] = percentile_by_size.get(span['size'], 0)
    return ranks

def is_noise(text):
    """
    Tell whether a line is generic document noise.

    A line is noise when, after stripping, it is shorter than two characters
    and not a digit, or when it matches any entry of SKIP_LINE_PATTERNS from
    its start.
    """
    stripped = text.strip()

    # Single characters are dropped unless they are a digit (e.g. part of "1.1")
    if len(stripped) < 2 and not stripped.isdigit():
        return True

    return any(re.match(pattern, stripped) for pattern in SKIP_LINE_PATTERNS)

def is_bold(font_name):
    """
    Report whether a font name suggests a bold weight.

    The name is lower-cased and hyphens are removed before it is searched for
    any of the substrings in BOLD_INDICATORS.
    """
    normalized_name = font_name.lower().replace('-', '')
    for indicator in BOLD_INDICATORS:
        if indicator in normalized_name:
            return True
    return False

def get_adaptive_threshold(scores):
    """
    Derive a per-page cut-off for heading scores.

    Returns the larger of the 70th percentile of ``scores`` and
    ``mean + 0.75 * standard deviation``. An empty input gives 0.
    """
    if not scores:
        return 0

    values = np.array(scores)
    percentile_cut = np.percentile(values, 70)
    spread_cut = np.mean(values) + 0.75 * np.std(values)

    return max(percentile_cut, spread_cut)

def score_line(text, span, pct_map, avg_size, prev_y, next_y, line_count, page_num, page):
    """
    Scores a line's candidacy as a heading; a higher score is more heading-like.

    Args:
        text: Text of the line.
        span: Representative span dict of the line; 'bbox', 'size' and 'font' are read.
        pct_map: Mapping ``id(span) -> font-size percentile`` (see compute_font_percentiles).
        avg_size: Reference body font size; a falsy value disables the size-ratio term.
        prev_y: Y position of the previous line, or None for the first line.
        next_y: Y position of the next line, or None for the last line.
        line_count: Unused; kept so existing callers keep working.
        page_num: 0-based page index.
        page: Page object; only ``page.rect.height`` is read.

    Returns:
        -10000 when the line matches a confirmed header/footer fingerprint,
        -5000 when it is general noise, otherwise the accumulated score.
    """
    original_text = text  # Keep original for case checks
    text = text.strip()

    bbox = span['bbox']
    line_x, line_y, line_bottom = bbox[0], bbox[1], bbox[3]
    size = span['size']
    line_is_bold = is_bold(span['font'])

    # --- Header/Footer Exclusion First and Foremost ---
    # The normalized form of the line does not depend on the candidate, so it is
    # computed at most once, and only if some candidate shares the line's slot.
    normalized_line = None
    for hf_segment, hf_y, hf_size, hf_bold, hf_x in HEADER_FOOTER_CANDIDATES:
        same_slot = (
            abs(hf_y - line_y) <= HEADER_FOOTER_Y_TOLERANCE_PX
            and abs(hf_size - size) <= HEADER_FONT_SIZE_TOLERANCE_PX
            and abs(hf_x - line_x) <= HEADER_X_COORD_TOLERANCE_PX
            and hf_bold == line_is_bold
        )
        if not same_slot:
            continue
        if normalized_line is None:
            normalized_line = normalize_hf_text_for_fingerprint(original_text)
        if hf_segment in normalized_line:
            return -10000  # Extremely low score to ensure removal from candidates

    # If it's not a confirmed H/F, check if it's general noise
    if is_noise(text):
        return -5000  # Very low score for general noise, but above the H/F penalty

    score = 0

    # --- Font Size and Percentile ---
    pct = pct_map.get(id(span), 0)
    if pct >= 98:
        score += 12
    elif pct >= 90:
        score += 9
    elif pct >= 80:
        score += 6
    elif pct >= 65:
        score += 3
    else:
        score -= 2

    ratio = size / avg_size if avg_size else 1
    score += (ratio - 1) * 8

    # --- Font Style ---
    if line_is_bold:
        score += 7

    # --- Text Characteristics ---
    wc = len(text.split())
    if 1 <= wc <= 10:
        score += (5 - abs(wc - 4))  # peaks at four words
    else:
        score -= abs(wc - 7) * 0.5

    # Case (use original_text for case checks)
    if original_text.istitle() and wc > 1 and not original_text.isupper():
        score += 4
    elif original_text.isupper() and wc <= 8:
        score += 5
    elif original_text.isupper() and wc > 8:
        score -= 3

    # Trailing punctuation; a bare number such as "1.2." is not penalised for its dot
    if text.endswith('.') and not re.match(r'^\d+(\.\d+)*\.$', text):
        score -= 5
    if text.endswith((',', ';', ':')):
        score -= 3

    # Presence of numbering/markers: only the first matching pattern counts
    numbering_bonus = {
        'numeric_multilevel': 8,
        'roman_numerals': 7,
        'numeric_simple': 6,
        'alphabetic': 5,
    }
    is_numbered_heading = False
    for pat_type, pat in HEADING_PATTERNS.items():
        if re.match(pat, text):
            is_numbered_heading = True
            score += numbering_bonus.get(pat_type, 0)
            break

    # Penalize if it looks like a simple list item marker rather than a heading
    if not is_numbered_heading:
        if re.match(r'^\s*(\d+|\([a-z]\)|\([A-Z]\)|\([ivx]+\)|\.)\s+.*', text):
            score -= 5
        if re.match(r'^\s*(\d+\.){1,}\s*$', text):
            score -= 10  # e.g., "1.1." alone on a line

    # --- Vertical Spacing (Contextual) ---
    if prev_y is not None:
        gap_above = line_y - prev_y
        if gap_above > avg_size * 2.5:
            score += 4
        elif gap_above > avg_size * 1.8:
            score += 2

    # Gap below is measured from the bottom edge of this line's span
    if next_y is not None and (next_y - line_bottom) > avg_size * 1.8:
        score += 3

    # Position bonus applies to the upper half of the first page only
    if page_num == 0 and line_y < page.rect.height / 2:
        score += 3

    return score

def process_page(page, num, page_height, page_width):
    """
    Extract heading candidates from a single page.

    Spans are grouped into lines by their (rounded) mean top coordinate, each
    line is scored with score_line(), and only lines whose score reaches the
    page's adaptive threshold are returned.

    Args:
        page: Page object providing ``get_text('dict')``.
        num: Page index stored in each candidate and forwarded to score_line().
        page_height: Not used by this function.
        page_width: Not used by this function.

    Returns:
        A list of candidate dicts with keys 'text', 'page', 'score',
        'font_size', 'y_coord', 'span' and 'x_coord'.
    """
    page_dict = page.get_text('dict')

    # One pass over the layout: collect every span and bucket spans by line position
    spans = []
    spans_by_y = defaultdict(list)
    for block in page_dict['blocks']:
        for line in block.get('lines', []):
            line_spans = line['spans']
            spans.extend(line_spans)
            if line_spans:
                mean_top = round(np.mean([sp['bbox'][1] for sp in line_spans]), 2)
                spans_by_y[mean_top].extend(line_spans)

    pct_map = compute_font_percentiles(spans)

    lines = []
    for y in sorted(spans_by_y):
        visible = [sp for sp in spans_by_y[y] if sp['text'].strip()]
        if not visible:
            continue
        visible.sort(key=lambda sp: sp['bbox'][0])
        text = ''.join(sp['text'] for sp in visible).strip()

        # Skip a line that repeats the text of the line just before it
        if lines and lines[-1][1] == text:
            continue

        # The largest span stands in for the whole line
        largest_span = max(visible, key=lambda sp: sp['size'])
        lines.append((y, text, largest_span))

    lines.sort(key=lambda entry: entry[0])

    # Body size: mean of the sizes lying between the 25th and 75th percentile
    if not spans:
        avg_size = 0
    else:
        all_sizes = np.array([sp['size'] for sp in spans])
        low_cut = np.percentile(all_sizes, 25)
        high_cut = np.percentile(all_sizes, 75)
        body_sizes = [sp['size'] for sp in spans if low_cut <= sp['size'] <= high_cut]
        avg_size = np.mean(body_sizes) if body_sizes else np.mean(all_sizes)

    last_index = len(lines) - 1
    cands = []
    scores = []
    for idx, (y, text, span) in enumerate(lines):
        y_before = None if idx == 0 else lines[idx - 1][0]
        y_after = None if idx >= last_index else lines[idx + 1][0]

        sc = score_line(text, span, pct_map, avg_size, y_before, y_after, len(lines), num, page)

        # Header/footer (-10000) and noise (-5000) penalties both fall below this cut
        if sc > -2000:
            cands.append({
                'text': text,
                'page': num,
                'score': sc,
                'font_size': span['size'],
                'y_coord': y,
                'span': span,
                'x_coord': span['bbox'][0],
            })
            scores.append(sc)

    thr = get_adaptive_threshold(scores) if scores else 0

    return [cand for cand in cands if cand['score'] >= thr]

def merge_multi_line_headings(candidates, threshold_y_diff_ratio=0.5, font_size_tolerance_ratio=0.05, x_coord_tolerance=5):
    """
    Merges consecutive candidate lines that appear to form one multi-line heading.

    A following line is appended to the current heading when it:
    - is on the same page,
    - starts close below the previously merged line (gap smaller than
      ``threshold_y_diff_ratio`` times the mean font size of the two lines),
    - has a similar font size and the same bold status,
    - starts at a similar X coordinate, and
    - begins with a lowercase letter or does not match any HEADING_PATTERNS entry.

    The input list is sorted in place by (page, y, x). The returned dicts are
    copies with the 'span' key removed; a merged heading keeps the page, score,
    font size and coordinates of its first line.

    Args:
        candidates: Candidate dicts as produced by process_page().
        threshold_y_diff_ratio: Maximum vertical gap as a fraction of the mean font size.
        font_size_tolerance_ratio: Maximum relative font-size difference.
        x_coord_tolerance: Maximum X offset between the lines.

    Returns:
        A new list of merged candidate dicts.
    """
    if not candidates:
        return []

    candidates.sort(key=lambda x: (x['page'], x['y_coord'], x['x_coord']))

    merged_candidates = []
    i = 0
    while i < len(candidates):
        current_cand = candidates[i].copy()

        if 'span' not in current_cand:
            merged_candidates.append(current_cand)
            i += 1
            continue

        current_span = current_cand['span']
        is_current_bold = is_bold(current_span['font'])
        base_font_size = current_cand['font_size']
        # Y of the most recently merged line. The vertical gap has to be measured
        # from here; measuring from the heading's first line (as was done before)
        # made a third or later line look too far away to ever be merged.
        last_line_y = current_cand['y_coord']

        j = i + 1
        while j < len(candidates):
            next_cand = candidates[j]
            if 'span' not in next_cand or current_cand['page'] != next_cand['page']:
                break

            next_span = next_cand['span']

            y_diff = next_cand['y_coord'] - (last_line_y + current_span['bbox'][3] - current_span['bbox'][1])
            avg_line_height = (current_span['size'] + next_span['size']) / 2

            # A zero base size cannot be compared relatively; treat it as dissimilar
            font_size_similar = bool(base_font_size) and (
                abs(base_font_size - next_cand['font_size']) / base_font_size < font_size_tolerance_ratio
            )
            bold_status_similar = is_current_bold == is_bold(next_span['font'])
            x_coord_similar = abs(current_cand['x_coord'] - next_cand['x_coord']) < x_coord_tolerance

            # [:1] instead of [0] so that an empty text cannot raise IndexError
            continues_heading = (
                next_cand['text'][:1].islower()
                or not any(re.match(p, next_cand['text']) for p in HEADING_PATTERNS.values())
            )

            if not (
                y_diff < avg_line_height * threshold_y_diff_ratio
                and font_size_similar
                and bold_status_similar
                and x_coord_similar
                and continues_heading
            ):
                break

            current_cand['text'] += " " + next_cand['text']
            current_span = next_span
            current_cand['span'] = current_span
            last_line_y = next_cand['y_coord']
            j += 1

        merged_candidates.append(current_cand)
        i = j

    # Spans are dropped from the copies; 'x_coord' is deliberately kept.
    for cand in merged_candidates:
        cand.pop('span', None)

    return merged_candidates


def select_best_title(candidates, meta_title):
    """
    Selects the document title from heading candidates and the metadata title.

    The candidate whose embedding has the highest summed cosine similarity to
    all candidates is taken as the best heading. The metadata title is
    preferred when it is usable (longer than 3 characters and not noise) and
    either there are no candidates or its similarity to the best heading
    exceeds 0.75.

    Args:
        candidates: Candidate dicts; only 'text' is read.
        meta_title: Title from the document metadata; may be empty or None.

    Returns:
        The chosen title, or "Untitled Document" when nothing usable exists.
    """
    if not candidates and not meta_title:
        return "Untitled Document"

    texts = [c['text'] for c in candidates]
    meta_usable = bool(meta_title) and len(meta_title) > 3 and not is_noise(meta_title)

    if not texts:
        return meta_title if meta_usable else "Untitled Document"

    emb = EMBED_MODEL.encode(texts, convert_to_tensor=True)
    sims = util.cos_sim(emb, emb).sum(dim=1)
    best_idx = int(torch.argmax(sims))
    best_heading_candidate_text = texts[best_idx]

    if meta_usable:
        try:
            mt_emb = EMBED_MODEL.encode(meta_title, convert_to_tensor=True)
            h_emb = EMBED_MODEL.encode(best_heading_candidate_text, convert_to_tensor=True)

            if util.cos_sim(mt_emb, h_emb).item() > 0.75:
                return meta_title
        except Exception as e:
            # Best effort: the comparison is optional, so fall back to the best
            # heading, but report the failure instead of hiding it.
            print(f"    Warning: could not compare metadata title with headings: {e}")

    return best_heading_candidate_text

def infer_heading_levels(outline_candidates):
    """
    Infers hierarchical levels (H1, H2, H3) for headings from their visual style.

    Candidates are grouped into styles keyed by (font size rounded to 0.1,
    x-coordinate rounded to the nearest 10, bold flag). Styles are visited from
    the largest font size down (then by increasing x). A style opens the next
    level when its font size is more than 8% below, or its x group more than 20
    to the right of, the style that opened the current deepest level; otherwise
    it shares the current deepest level. At most three levels are used.

    The input list is sorted in place by (page, y, x).

    Args:
        outline_candidates: Dicts with 'font_size', 'x_coord', 'y_coord',
            'page' and 'text'; an optional 'span' dict supplies the font name.

    Returns:
        A list of ``{'level', 'text', 'page'}`` dicts in document order. 'page'
        is the candidate's page value unchanged (a 0-based index when produced
        by extract_outline_from_pdf).
    """
    if not outline_candidates:
        return []

    level_names = ('H1', 'H2', 'H3')

    def style_key(heading):
        # NOTE(review): merge_multi_line_headings() removes 'span', so for
        # candidates that went through it the bold flag is always False here.
        font_name = heading.get('span', {}).get('font', '')
        return (
            round(heading['font_size'], 1),
            round(heading['x_coord'] / 10) * 10,
            is_bold(font_name),
        )

    outline_candidates.sort(key=lambda x: (x['page'], x['y_coord'], x['x_coord']))

    # dict.fromkeys keeps first-seen order, which decides ties in the stable sort below
    keyed = [(style_key(h), h) for h in outline_candidates]
    distinct_styles = dict.fromkeys(key for key, _ in keyed)
    sorted_styles = sorted(distinct_styles, key=lambda k: (-k[0], k[1]))

    level_map = {}
    levels_in_use = 0
    anchor_style = None  # the style that opened the current deepest level
    for style in sorted_styles:
        font_size, x_group, _ = style

        if anchor_style is None:
            opens_level = True
        else:
            anchor_size, anchor_x, _ = anchor_style
            font_size_drop = (anchor_size - font_size) / anchor_size if anchor_size else 0
            indentation = x_group - anchor_x
            opens_level = levels_in_use < len(level_names) and (
                font_size_drop > 0.08 or indentation > 20
            )

        if opens_level:
            levels_in_use += 1
            anchor_style = style

        level_map[style] = level_names[levels_in_use - 1]

    # Every candidate's own style key is in level_map, so a direct lookup replaces
    # the former nearest-style search (which always resolved to the exact key).
    # The page value is passed through as is: it is already a 0-based index, and
    # subtracting 1 (as was done before) reported the first page as -1.
    return [
        {'level': level_map[key], 'text': h['text'], 'page': h['page']}
        for key, h in keyed
    ]

# --- Header/Footer Identification Function ---
def identify_headers_footers(doc):
    """
    Analyzes the first pages of a document to identify recurring headers and footers.

    Spans whose top edge lies in the header or footer band are grouped into
    "slots" keyed by (rounded y, rounded font size, bold flag, rounded x). A slot
    that is filled on enough of the sampled pages contributes its recurring text
    fragments, together with the slot key, to the global HEADER_FOOTER_CANDIDATES,
    which is cleared first.

    Args:
        doc: Open document providing ``page_count`` and ``load_page(i)``.

    Returns:
        None. The result is stored in HEADER_FOOTER_CANDIDATES.
    """
    HEADER_FOOTER_CANDIDATES.clear()  # Reset for each document

    page_sample_size = min(doc.page_count, HEADER_FOOTER_SAMPLE_PAGES)

    # slot_key -> {page_num: [raw texts found in that slot on that page]}
    spatial_slot_texts = defaultdict(lambda: defaultdict(list))

    for i in range(page_sample_size):
        page = doc.load_page(i)
        page_height = page.rect.height

        header_top = page_height * HEADER_REGION_START_Y_RATIO
        header_bottom = page_height * HEADER_REGION_END_Y_RATIO
        footer_top = page_height * FOOTER_REGION_START_Y_RATIO
        footer_bottom = page_height * FOOTER_REGION_END_Y_RATIO

        for b in page.get_text('dict')['blocks']:
            for l in b.get('lines', []):
                for s in l.get('spans', []):
                    raw_text = s['text'].strip()
                    if not raw_text:
                        continue

                    span_x, span_y = s['bbox'][0], s['bbox'][1]
                    in_header = header_top <= span_y <= header_bottom
                    in_footer = footer_top <= span_y <= footer_bottom
                    if not (in_header or in_footer):
                        continue

                    # Spatial-stylistic fingerprint of the slot
                    slot_fingerprint = (
                        round(span_y, 0),
                        round(s['size'], 0),
                        is_bold(s['font']),
                        round(span_x, 0),
                    )
                    spatial_slot_texts[slot_fingerprint][i].append(raw_text)

    # Placeholder-only results of normalize_hf_text_for_fingerprint(), lower-cased
    purely_dynamic = {'#pagenum#', '#datetime#', '#version#', '#num#', '#copyright#', '#guid#', ''}

    for slot_fingerprint, pages_data in spatial_slot_texts.items():
        pages_with_content = len(pages_data)

        # The slot must be filled on enough pages, absolutely and relatively
        if pages_with_content < HEADER_FOOTER_MIN_PAGES_REQUIRED:
            continue
        if pages_with_content / page_sample_size < HEADER_FOOTER_THRESHOLD_PAGES_RATIO:
            continue

        all_texts_in_slot = []
        for page_texts in pages_data.values():
            all_texts_in_slot.extend(page_texts)
        if not all_texts_in_slot:
            continue

        fixed_segments = extract_fixed_substrings(all_texts_in_slot, min_len=3, min_occurrence_ratio=0.7)

        if not fixed_segments:
            # Fallback: use the normalized first text of the slot, unless it is purely
            # dynamic. The normalizer emits UPPER-case placeholders, so the result is
            # lower-cased for this check; comparing it directly against lower-case
            # names (as was done before) never matched and the guard had no effect.
            normalized_rep_text = normalize_hf_text_for_fingerprint(all_texts_in_slot[0])
            if normalized_rep_text.lower() not in purely_dynamic:
                fixed_segments.append(normalized_rep_text)

        for segment in fixed_segments:
            # score_line() later checks whether the segment is contained in a line
            HEADER_FOOTER_CANDIDATES.add((
                segment,
                slot_fingerprint[0],  # rounded_y
                slot_fingerprint[1],  # rounded_font_size
                slot_fingerprint[2],  # is_bold
                slot_fingerprint[3],  # rounded_x
            ))


# Main outline extraction
def extract_outline_from_pdf(path):
    """
    Extracts a title and a heading outline from the PDF at ``path``.

    Steps: detect headers/footers, collect heading candidates page by page,
    merge multi-line headings, de-duplicate by normalized text (keeping the
    highest-scoring occurrence), choose a title and assign heading levels to
    the remaining candidates.

    Args:
        path: Path of the PDF file to open.

    Returns:
        ``{'title': str, 'outline': list}`` where the outline entries come from
        infer_heading_levels().
    """
    doc = fitz.open(path)
    try:
        # metadata may be None for some documents
        meta = (doc.metadata or {}).get('title', '')

        # Step 0: Identify headers/footers first
        identify_headers_footers(doc)

        all_cands = []
        for i in range(doc.page_count):  # 0-based page index
            pg = doc.load_page(i)
            all_cands.extend(process_page(pg, i, pg.rect.height, pg.rect.width))
    finally:
        # Release the file even when a page fails to process
        doc.close()

    # Step 1: Handle multi-line headings
    all_cands_merged = merge_multi_line_headings(all_cands)

    # Step 2: De-duplicate merged candidates. They are visited from the highest
    # score down, so the first occurrence of a key is already the best one.
    seen_texts = {}
    for c in sorted(all_cands_merged, key=lambda x: -x['score']):
        t_lower = c['text'].lower().strip()
        if not t_lower:
            continue
        # Key for de-duplication: leading numbering and excess whitespace removed.
        # NOTE(review): the text is already lower-cased here, so the upper-case
        # letter and Roman-numeral alternatives of this pattern cannot match.
        normalized_text = re.sub(r'^\s*(\d+(\.\d+)*\s*|[A-Z]\.?\s*|[IVXLCDM]+\.?\s*)\s*', '', t_lower).strip()
        seen_texts.setdefault(normalized_text, c)

    uniq = list(seen_texts.values())
    uniq.sort(key=lambda x: (x['page'], x['y_coord'], x['x_coord']))

    title = select_best_title(uniq, meta) if uniq or meta else Path(path).stem

    outline_candidates = [h for h in uniq if h['text'].strip() != title.strip()]

    # Step 3: Infer multi-level headings
    final_outline = infer_heading_levels(outline_candidates)

    return {'title': title, 'outline': final_outline}

if __name__ == '__main__':
    # Create the same directories that are read from and written to below.
    # Previously '/content/input/' and '/content/output/' were created while the
    # relative 'input' and 'output' were used, which only agreed when the working
    # directory happened to be '/content'.
    input_dir = 'input'
    output_dir = 'output'
    os.makedirs(input_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    print("Processing PDFs in 'input' directory...")
    processed_count = 0

    for f in os.listdir(input_dir):
        if not f.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(input_dir, f)
        print(f"   Processing {f}...")
        try:
            res = extract_outline_from_pdf(pdf_path)
            output_filename = Path(f).stem + '.json'
            with open(os.path.join(output_dir, output_filename), 'w', encoding='utf-8') as fw:
                json.dump(res, fw, indent=4, ensure_ascii=False)
            print(f"    Successfully extracted outline to {output_filename}")
            processed_count += 1
        except Exception as e:
            # One bad file must not stop the batch; report it and carry on
            print(f"    Error processing {f}: {e}")
            import traceback
            traceback.print_exc()

    if processed_count == 0:
        print("\nNo PDFs found in the 'input' directory or an error occurred during processing.")
        print("Please place PDF files in the 'input' folder and run the script again.")
    else:
        print(f"\nFinished processing {processed_count} PDF(s).")

Processing PDFs in 'input' directory...
   Processing Basic-Hindi-I-1619720368.pdf...
    Successfully extracted outline to Basic-Hindi-I-1619720368.json

Finished processing 1 PDF(s).


1b

In [None]:
!pip install PyMuPDF sentence_transformers torch ctranslate2 sentencepiece rank_bm25 langchain-community langchain-core langchain-huggingface faiss-cpu numpy

Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting ctranslate2
  Downloading ctranslate2-4.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [None]:
import json
import re
import os
from pathlib import Path
import datetime
import torch
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
import numpy as np
import shutil
import subprocess
from uuid import uuid4
import pickle
import time

# --- Core Dependencies ---
try:
    import fitz  # PyMuPDF
except ImportError:
    print("❌ FATAL: PyMuPDF is not installed. Please run: pip install PyMuPDF")
    exit(1)

try:
    from transformers import T5Tokenizer, AutoModelForSeq2SeqLM, AutoTokenizer
    import ctranslate2
    import sentencepiece as spm
except ImportError:
    print("❌ FATAL: AI libraries are missing. Please run: pip install transformers torch ctranslate2 sentencepiece")
    exit(1)

try:
    from rank_bm25 import BM25Okapi
except ImportError:
    print("❌ FATAL: BM25 library is not installed. Please run: pip install rank-bm25")
    exit(1)

try:
    from langchain_community.vectorstores import FAISS
    from langchain_huggingface import HuggingFaceEmbeddings
    from langchain_core.documents import Document
except ImportError:
    print("❌ FATAL: LangChain/FAISS libraries are missing. Please run: pip install langchain-community langchain-core langchain-huggingface faiss-cpu")
    exit(1)


# --- Model & Pipeline Configuration ---
# Hugging Face model id handed to HuggingFaceEmbeddings in load_models().
RETRIEVAL_MODEL_NAME = 'intfloat/e5-small'
# Hugging Face model id of the seq2seq model used for constraint extraction,
# verification and analysis generation.
GENERATIVE_MODEL_NAME = 'google/flan-t5-base'
# CTranslate2 quantization type; also used as the sub-directory name under
# CT2_MODEL_DIR. main() resets this to None when the conversion fails.
GENERATIVE_MODEL_QUANTIZATION = 'int8'

# Root directory for converted CTranslate2 models.
CT2_MODEL_DIR = Path("./ct2_models")
# Where the original Hugging Face weights/tokenizer are saved before conversion.
HF_MODEL_SOURCE_PATH = CT2_MODEL_DIR / "hf_model_source"
# Directory holding the persisted FAISS index and its pickled metadata map.
FAISS_DB_PATH = Path("./faiss_index_db")
# index_name passed to FAISS.save_local().
FAISS_INDEX_FILE_NAME = "faiss_index"
# File name (inside FAISS_DB_PATH) of the pickled id -> Document map.
FAISS_METADATA_FILE_NAME = "faiss_metadata.pkl"

# NOTE(review): not referenced in the visible part of the file; presumably the
# embedding batch size - confirm against the rest of the pipeline.
ENCODING_BATCH_SIZE = 128
# lexical_pre_filter() is skipped at or below this many documents and keeps at
# most this many BM25 hits otherwise.
LEXICAL_SEARCH_TOP_K = 100
# NOTE(review): not referenced in the visible part of the file; presumably the
# number of semantic-search hits requested from FAISS - confirm.
SEMANTIC_SEARCH_TOP_N = 30
# NOTE(review): not referenced in the visible part of the file; presumably the
# number of sections kept in the final output - confirm.
FINAL_TOP_K = 5

# --- Global Model Placeholders & Device Configuration ---
# All of the placeholders below are populated by load_models().
retrieval_embedding_model = None
generative_tokenizer = None
# Set when a converted CTranslate2 model is available; otherwise stays None
# and generative_model_pytorch_fallback is used instead.
generative_ct2_translator = None
generative_model_pytorch_fallback = None
device = 'cpu' # torch.device("cuda" if torch.cuda.is_available() else "cpu") # Keep as CPU for broader compatibility as per previous code

# LangChain FAISS store created by _load_faiss_index().
langchain_vector_store = None
# Maps document id -> Document for everything added to the FAISS store.
faiss_indexed_documents_map = {}


# --- Helper Functions for FAISS Persistence (outside a class) ---

def _load_faiss_index(embeddings: HuggingFaceEmbeddings) -> FAISS:
    """Return a fresh, empty FAISS vector store, discarding any on-disk data.

    Despite its name this function never reads an index back from disk: the
    contents of ``FAISS_DB_PATH`` are wiped (up to five attempts, one second
    apart) and a new store is seeded with a throwaway document that is deleted
    again right away. The module-level ``faiss_indexed_documents_map`` is
    reset to an empty dict.

    Args:
        embeddings: Embedding model handed to ``FAISS.from_documents``.

    Returns:
        The newly created vector store.

    The process exits with status 1 when the directory cannot be cleared.
    """
    global faiss_indexed_documents_map

    FAISS_DB_PATH.mkdir(parents=True, exist_ok=True)

    if FAISS_DB_PATH.is_dir():
        print(f"Clearing existing FAISS database directory: {FAISS_DB_PATH} to create a new database.")
        max_retries = 5
        directory_ready = False
        attempt = 0
        while attempt < max_retries and not directory_ready:
            attempt += 1
            try:
                has_contents = FAISS_DB_PATH.exists() and bool(os.listdir(FAISS_DB_PATH))
                if not has_contents:
                    print(f"FAISS_DB_PATH '{FAISS_DB_PATH}' is already empty or does not exist. Proceeding.")
                else:
                    shutil.rmtree(FAISS_DB_PATH)
                    FAISS_DB_PATH.mkdir(parents=True, exist_ok=True)
                    print(f"Successfully cleared {FAISS_DB_PATH}.")
                directory_ready = True
            except OSError as e:
                print(f"❌ Attempt {attempt}/{max_retries}: Error clearing {FAISS_DB_PATH}: {e}. Retrying in 1 second...")
                time.sleep(1)

        if not directory_ready:
            print(f"❌ FATAL: Failed to clear FAISS database directory {FAISS_DB_PATH} after {max_retries} attempts. Cannot create a new database.")
            exit(1)

    print("Creating a new FAISS index.")

    # FAISS.from_documents needs at least one document, so seed the store
    # with a placeholder and remove it immediately afterwards.
    placeholder_id = str(uuid4())
    placeholder = Document(
        page_content="This is a dummy document to initialize the FAISS index.",
        metadata={"source": "dummy", "id": placeholder_id},
        id=placeholder_id
    )
    fresh_store = FAISS.from_documents([placeholder], embeddings)

    try:
        fresh_store.delete([placeholder_id])
        print(f"Successfully removed dummy document with ID: {placeholder_id}")
    except Exception as e:
        print(f"Warning: Could not delete dummy document from FAISS index: {e}. Index may contain dummy data.")

    faiss_indexed_documents_map = {}

    return fresh_store


def _save_faiss_index(faiss_store: FAISS, metadata_map: dict):
    """Persist the FAISS index and its metadata map under ``FAISS_DB_PATH``.

    The index is written with ``save_local`` and ``metadata_map`` is pickled
    next to it. Any failure is reported on stdout and swallowed.

    Args:
        faiss_store: Vector store to write to disk.
        metadata_map: Mapping pickled into ``FAISS_METADATA_FILE_NAME``.
    """
    FAISS_DB_PATH.mkdir(parents=True, exist_ok=True)
    metadata_file = FAISS_DB_PATH / FAISS_METADATA_FILE_NAME
    try:
        faiss_store.save_local(
            folder_path=str(FAISS_DB_PATH),
            index_name=FAISS_INDEX_FILE_NAME,
        )
        with metadata_file.open('wb') as sink:
            pickle.dump(metadata_map, sink)
        print(f"      > FAISS index and metadata saved to {FAISS_DB_PATH}.")
    except Exception as e:
        print(f"❌ Error saving FAISS index or metadata: {e}")


def _add_documents_to_faiss(documents: list[Document], faiss_store: FAISS):
    """Insert not-yet-indexed documents into the FAISS store and persist it.

    Documents without an id receive a fresh UUID (mirrored into their
    metadata). Documents whose id is already a key of
    ``faiss_indexed_documents_map`` are skipped. After a successful insert the
    map is updated and the index is saved via ``_save_faiss_index``. Errors
    are printed and swallowed.

    Args:
        documents: Candidate documents; an empty list is a no-op.
        faiss_store: Vector store receiving the new documents.
    """
    if not documents:
        return

    pending = []
    for candidate in documents:
        if candidate.id is None:
            candidate.id = str(uuid4())
            candidate.metadata["id"] = candidate.id

        if candidate.id in faiss_indexed_documents_map:
            print(f"      > Document with ID {candidate.id} (from {candidate.metadata.get('source', 'unknown')}) already in map. Skipping add to FAISS.")
            continue
        pending.append(candidate)

    if not pending:
        print("      > No new documents to add to FAISS.")
        return

    print(f"      > Adding {len(pending)} new documents to FAISS vector store...")
    try:
        faiss_store.add_documents(documents=pending)
        # Record the documents only once FAISS has accepted them.
        faiss_indexed_documents_map.update((added.id, added) for added in pending)
        _save_faiss_index(faiss_store, faiss_indexed_documents_map)
        print(f"      > Successfully added {len(pending)} documents to FAISS. Total in DB: {len(faiss_indexed_documents_map)}")
    except Exception as e:
        print(f"❌ Error adding documents to FAISS: {e}")

def _query_faiss_documents(query_text: str, n_results: int) -> list[tuple[Document, float]]:
    """Run a semantic search against the module-level FAISS vector store.

    Args:
        query_text: Natural-language query to embed and search for.
        n_results: Maximum number of hits requested from FAISS.

    Returns:
        ``(Document, score)`` tuples sorted by descending score. The score is
        a cosine-style similarity clamped to ``[0, 1]`` (higher is better).
        An empty list is returned when the store is missing or nothing has
        been indexed yet.
    """
    if langchain_vector_store is None or not faiss_indexed_documents_map:
        print("      > FAISS vector store is empty. Cannot perform semantic search.")
        return []

    print("\n[Stage 2b/4] Performing semantic search using LangChain FAISS...")

    hits = langchain_vector_store.similarity_search_with_score(
        query=query_text,
        k=n_results,
    )

    final_ranked_results = []
    for doc, squared_l2 in hits:
        # FAISS flat L2 indexes report the *squared* Euclidean distance, so
        # the value must not be squared a second time. For unit-length
        # vectors ||a - b||^2 = 2 - 2*cos(a, b), hence cos = 1 - d^2 / 2.
        # NOTE(review): assumes the store uses the default Euclidean strategy
        # and that the embedding model emits unit-normalized vectors - confirm.
        relevance_score = 1.0 - float(squared_l2) / 2.0
        relevance_score = max(0.0, min(1.0, relevance_score))
        final_ranked_results.append((doc, relevance_score))

    final_ranked_results.sort(key=lambda pair: pair[1], reverse=True)

    print(f"      > Top {len(final_ranked_results)} documents retrieved from LangChain FAISS.")
    return final_ranked_results


# --- Model Conversion and Loading ---

def convert_and_quantize_flan_t5(model_name: str, target_output_dir: Path, quantization_type: str) -> bool:
    """Convert a Hugging Face seq2seq model to CTranslate2 format.

    The Hugging Face weights and tokenizer are first saved to
    ``HF_MODEL_SOURCE_PATH`` (unless that directory already exists) and then
    passed to the ``ct2-transformers-converter`` command-line tool.

    Args:
        model_name: Hugging Face model id to download and convert.
        target_output_dir: Directory that receives the converted model; any
            existing content is removed first.
        quantization_type: Value for ``--quantization``; the flag is omitted
            when this is empty or ``None``.

    Returns:
        ``True`` when the conversion succeeded, ``False`` otherwise. On
        failure, partially written directories are removed so that later
        ``is_dir()`` checks do not mistake them for usable models.
    """
    print(f"Attempting to convert {model_name} to CTranslate2 {quantization_type} format...")

    if not HF_MODEL_SOURCE_PATH.is_dir():
        print(f"Hugging Face source model not found at {HF_MODEL_SOURCE_PATH}. Saving now...")
        HF_MODEL_SOURCE_PATH.mkdir(parents=True, exist_ok=True)
        try:
            hf_tokenizer_temp = AutoTokenizer.from_pretrained(model_name)
            hf_model_temp = AutoModelForSeq2SeqLM.from_pretrained(model_name)
            hf_model_temp.save_pretrained(HF_MODEL_SOURCE_PATH)
            hf_tokenizer_temp.save_pretrained(HF_MODEL_SOURCE_PATH)
            print(f"Hugging Face model saved to {HF_MODEL_SOURCE_PATH}.")
        except Exception as e:
            print(f"❌ Error saving Hugging Face model to {HF_MODEL_SOURCE_PATH}: {e}")
            # Drop the incomplete directory; otherwise the next run would
            # treat it as an existing, valid model source.
            shutil.rmtree(HF_MODEL_SOURCE_PATH, ignore_errors=True)
            return False
    else:
        print(f"Using existing Hugging Face model source at {HF_MODEL_SOURCE_PATH}.")

    if target_output_dir.is_dir():
        print(f"Clearing existing CTranslate2 output directory: {target_output_dir}")
        try:
            shutil.rmtree(target_output_dir)
        except OSError as e:
            print(f"Warning: Could not remove directory {target_output_dir}: {e}. Retrying conversion anyway.")
    target_output_dir.mkdir(parents=True, exist_ok=True)

    # Build an argument list (no shell) so that paths containing spaces or
    # shell metacharacters are passed through verbatim, and a missing
    # executable surfaces as FileNotFoundError.
    command = [
        "ct2-transformers-converter",
        "--model", str(HF_MODEL_SOURCE_PATH),
        "--output_dir", str(target_output_dir),
    ]
    if quantization_type:
        command += ["--quantization", quantization_type]
    command += [
        "--copy_files", "tokenizer.json", "tokenizer_config.json", "special_tokens_map.json",
        "--force",
    ]
    print(f"Running conversion command: {' '.join(command)}")

    try:
        result = subprocess.run(command, check=False, capture_output=True, text=True)
    except FileNotFoundError:
        print("❌ Error: 'ct2-transformers-converter' command not found. Ensure CTranslate2 is installed and its scripts are in your PATH.")
        shutil.rmtree(target_output_dir, ignore_errors=True)
        return False
    except Exception as e:
        print(f"❌ Error during CTranslate2 conversion of {model_name}: {e}")
        shutil.rmtree(target_output_dir, ignore_errors=True)
        return False

    if result.returncode != 0:
        print(f"❌ CTranslate2 conversion failed with exit code {result.returncode}.")
        print(f"STDOUT:\n{result.stdout}")
        print(f"STDERR:\n{result.stderr}")
        # An empty/partial output directory would be picked up as a valid
        # model on the next run, so remove it.
        shutil.rmtree(target_output_dir, ignore_errors=True)
        return False

    print(f"Conversion to {quantization_type} complete and saved to {target_output_dir}.")
    return True

def load_models():
    """Load the AI models and initialize the LangChain FAISS vector store.

    Populates the module-level ``retrieval_embedding_model``,
    ``generative_tokenizer``, ``langchain_vector_store`` and either
    ``generative_ct2_translator`` (when a converted CTranslate2 model
    directory exists) or ``generative_model_pytorch_fallback``.

    The process exits with status 1 when any model or the vector store cannot
    be loaded.
    """
    global retrieval_embedding_model, generative_tokenizer, generative_ct2_translator, generative_model_pytorch_fallback, langchain_vector_store

    print(f"Using device: {device.upper()}")

    try:
        print(f"Loading retrieval embedding model (via LangChain HuggingFaceEmbeddings): {RETRIEVAL_MODEL_NAME}...")
        retrieval_embedding_model = HuggingFaceEmbeddings(
            model_name=RETRIEVAL_MODEL_NAME,
            model_kwargs={'device': device}
        )
        print("Retrieval embedding model loaded successfully.")
    except Exception as e:
        print(f"Fatal: Could not load retrieval embedding model. Error: {e}")
        exit(1)

    print(f"Loading generative model: {GENERATIVE_MODEL_NAME}...")
    try:
        generative_tokenizer = T5Tokenizer.from_pretrained(GENERATIVE_MODEL_NAME)

        # GENERATIVE_MODEL_QUANTIZATION is reset to None by main() when the
        # CTranslate2 conversion fails; ``Path / None`` would raise TypeError
        # and turn the intended PyTorch fallback into a fatal error.
        ct2_model_path = (
            CT2_MODEL_DIR / GENERATIVE_MODEL_QUANTIZATION
            if GENERATIVE_MODEL_QUANTIZATION
            else None
        )
        if ct2_model_path is not None and ct2_model_path.is_dir():
            print(f"Attempting to load CTranslate2 model from {ct2_model_path}...")
            generative_ct2_translator = ctranslate2.Translator(str(ct2_model_path), device=device)
            print(f"CTranslate2 model ({GENERATIVE_MODEL_QUANTIZATION}) loaded successfully.")
            generative_model_pytorch_fallback = None
        else:
            if GENERATIVE_MODEL_QUANTIZATION:
                print(f"CTranslate2 model for {GENERATIVE_MODEL_QUANTIZATION} not available. Falling back to PyTorch CPU model.")
            else:
                print("Loading PyTorch model on CPU (no quantization requested).")

            generative_model_pytorch_fallback = AutoModelForSeq2SeqLM.from_pretrained(GENERATIVE_MODEL_NAME).to(device)
            generative_model_pytorch_fallback.eval()

        print("Generative model pipeline loaded successfully.")

    except Exception as e:
        print(f"Fatal: Could not load generative AI models. Error: {e}")
        exit(1)

    # Initialize LangChain FAISS vector store
    try:
        langchain_vector_store = _load_faiss_index(retrieval_embedding_model)
        print("LangChain FAISS vector store initialized successfully.")

    except Exception as e:
        print(f"Fatal: Could not initialize LangChain FAISS vector store. Error: {e}")
        exit(1)


# --- ADVANCED CONTENT EXTRACTION ---

def is_bold_font(font_name: str) -> bool:
    """Return True when *font_name* contains a marker typical of bold faces.

    The comparison is a case-insensitive substring test.
    """
    lowered = font_name.lower()
    for marker in ('bold', 'black', 'heavy', 'semibold', 'demi', 'condb', 'extrabold', 'bd'):
        if marker in lowered:
            return True
    return False

def calculate_heading_score(span: dict, page_width: float, avg_font_size: float, line_text: str = "") -> float:
    """Score how heading-like a line of text is (higher means more likely).

    Args:
        span: Span dict of the line; its ``'size'`` and ``'font'`` entries
            are read.
        page_width: Width of the page (accepted but not used in the score).
        avg_font_size: Mean font size on the page, used as the baseline.
        line_text: Full text of the line.

    Returns:
        ``-100`` for lines shorter than three characters after stripping,
        otherwise the accumulated heuristic score.
    """
    text = line_text.strip()
    if len(text) < 3:
        return -100

    score = 0.0

    # Font size relative to the page average: reward larger, penalize smaller.
    if avg_font_size > 0:
        size_ratio = span['size'] / avg_font_size
    else:
        size_ratio = 0
    if size_ratio > 1.05:
        score = score + (size_ratio - 1.05) * 20
    elif size_ratio < 0.95:
        score = score - (0.95 - size_ratio) * 10

    if is_bold_font(span['font']):
        score = score + 5

    # Short lines are favoured; each word beyond 15 costs half a point.
    word_count = len(text.split())
    if word_count >= 15:
        score = score - (word_count - 15) * 0.5
    else:
        score = score + 3

    multi_word_upper = text.isupper() and word_count > 1
    if text.istitle() or multi_word_upper:
        score = score + 3

    # Trailing sentence punctuation suggests body text.
    if text.endswith(('.', ',', ';', ':')):
        score = score - 6

    # So does starting with a common article/pronoun/question word.
    starts_like_sentence = re.match(
        r"^(the|a|an|it|this|that|which|when|where|how)\b", text.lower()
    )
    if starts_like_sentence:
        score = score - 2

    return score

def extract_content_chunks(pdf_path: Path) -> list[Document]:
    """Split a PDF into heading-delimited chunks as LangChain Documents.

    The function works in three passes:

    1. Every text line is scored with ``calculate_heading_score``; lines
       scoring above 3 become heading candidates.
    2. Candidates scoring above 6 are de-duplicated by lower-cased text
       (best score wins, ties go to the earliest occurrence) and ordered by
       position in the document.
    3. For each heading, the text blocks between it and the next heading are
       joined into one Document.

    When no heading-based chunk is produced, paragraphs of more than 20 words
    are emitted instead.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted documents; an empty list when the PDF has no pages or
        any error occurs (the error is printed).
    """
    doc = None
    lc_documents = []
    try:
        doc = fitz.open(pdf_path)
        if doc.page_count == 0:
            print(f"      > Warning: {pdf_path.name} is empty.")
            return []

        # --- Pass 1: collect heading candidates page by page ---
        all_candidates = []
        for page_num, page in enumerate(doc, 1):
            page_font_sizes = []
            lines_on_page = []

            for block in page.get_text("dict")["blocks"]:
                for line in block.get("lines", ()):
                    spans = line['spans']
                    line_text = "".join(s['text'] for s in spans).strip()
                    if line_text and spans:
                        # The first span stands in for the whole line's style;
                        # its bbox top edge is used as the line's y position.
                        first_span = spans[0]
                        lines_on_page.append({'text': line_text, 'span': first_span, 'y_pos': first_span['bbox'][1]})
                        page_font_sizes.extend(s['size'] for s in spans)

            if not page_font_sizes:
                continue

            avg_font_size_page = np.mean(page_font_sizes)

            for line in lines_on_page:
                score = calculate_heading_score(line['span'], page.rect.width, avg_font_size_page, line['text'])
                if score > 3:  # Threshold for considering a line a heading candidate
                    all_candidates.append({'text': line['text'], 'page': page_num, 'y_pos': line['y_pos'], 'score': score})

        # --- Pass 2: keep strong, unique headings ---
        # Score descending, then page and y position ascending, so that among
        # equally scored duplicates the first occurrence in the document wins.
        # (Reversing the whole tuple would keep the last occurrence instead.)
        all_candidates.sort(key=lambda c: (-c['score'], c['page'], c['y_pos']))

        headings_found = []
        seen_texts = set()
        for h in all_candidates:
            text_key = h['text'].lower()
            if h['score'] > 6 and text_key not in seen_texts:  # Stricter threshold for real headings
                headings_found.append({'text': h['text'], 'page': h['page'], 'y_pos': h['y_pos']})
                seen_texts.add(text_key)

        # Final sort by page and y_pos for correct document flow
        headings_found.sort(key=lambda x: (x['page'], x['y_pos']))

        # --- Pass 3: gather the text that follows each heading ---
        # Blocks are extracted once per page and reused across headings.
        blocks_by_page = {}
        for i, heading in enumerate(headings_found):
            content = []  # Collected block texts, joined at the end
            start_page, start_y = heading['page'], heading['y_pos']
            end_page = doc.page_count
            end_y = float('inf')

            if i + 1 < len(headings_found):
                next_heading = headings_found[i + 1]
                end_page = next_heading['page']
                end_y = next_heading['y_pos']

            heading_key = heading['text'].lower().strip()

            for p_num_idx in range(start_page - 1, end_page):
                if p_num_idx not in blocks_by_page:
                    blocks_by_page[p_num_idx] = doc[p_num_idx].get_text("blocks", sort=True)

                for block in blocks_by_page[p_num_idx]:
                    block_text = block[4].strip()
                    block_y0 = block[1]

                    is_after_start = (p_num_idx > start_page - 1) or \
                                     (p_num_idx == start_page - 1 and block_y0 > start_y)

                    is_before_end = (p_num_idx < end_page - 1) or \
                                    (p_num_idx == end_page - 1 and block_y0 < end_y)

                    if is_after_start and is_before_end and block_text and \
                       block_text.lower().strip() != heading_key:  # Skip the heading's own text
                        # Drop blocks that consist of a bare number (page numbers)
                        if len(block_text.split()) > 3 or not re.match(r'^\d+$', block_text.strip()):
                            content.append(block_text.replace('\n', ' '))  # Flatten line breaks inside a block

            joined_content = " ".join(content).strip()
            if joined_content:
                doc_id = str(uuid4())
                lc_documents.append(
                    Document(
                        page_content=joined_content,
                        metadata={
                            "source": str(pdf_path.name),
                            "page_number": heading['page'],
                            "section_title": heading['text'],
                            "id": doc_id  # Ensure ID is always in metadata
                        },
                        id=doc_id
                    )
                )

        # --- Fallback: plain paragraphs when no heading-based chunk exists ---
        if not lc_documents and doc.page_count > 0:
            print(f"      > Warning: No structural headings found in {pdf_path.name}. Falling back to paragraph extraction.")
            for page_num, page in enumerate(doc, 1):
                # Split by double newline to get paragraphs, filter for reasonable length
                paragraphs = [p.strip() for p in page.get_text("text").split('\n\n') if len(p.split()) > 20]
                for p_idx, p in enumerate(paragraphs):
                    if p:
                        doc_id = str(uuid4())
                        lc_documents.append(
                            Document(
                                page_content=p,
                                metadata={
                                    "source": str(pdf_path.name),
                                    "page_number": page_num,
                                    "section_title": f"Paragraph {p_idx + 1} (Page {page_num})",
                                    "id": doc_id  # Ensure ID is always in metadata
                                },
                                id=doc_id
                            )
                        )
        return lc_documents

    except Exception as e:
        print(f"Error processing {pdf_path.name}: {e}")
        return []
    finally:
        if doc: doc.close()

def lexical_pre_filter(all_documents: list[Document], query: str) -> list[Document]:
    """Stage 1/4: shrink the candidate pool with a BM25 keyword search.

    Args:
        all_documents: LangChain documents to rank.
        query: Free-text query; tokenized by lower-casing and whitespace split.

    Returns:
        The input list unchanged when it holds at most
        ``LEXICAL_SEARCH_TOP_K`` documents; otherwise the up-to
        ``LEXICAL_SEARCH_TOP_K`` documents with the highest positive BM25
        score. An empty list when there is no input or no positive score.
    """
    total = len(all_documents)
    print(f"\n[Stage 1/4] Performing fast lexical pre-filtering on {total} documents...")
    if not all_documents:
        print("      > No documents to filter.")
        return []

    if total <= LEXICAL_SEARCH_TOP_K:
        print("      > Skipping pre-filtering, document count is already low.")
        return all_documents

    corpus_tokens = [entry.page_content.lower().split() for entry in all_documents]
    ranker = BM25Okapi(corpus_tokens)
    query_tokens = query.lower().split()
    scores = ranker.get_scores(query_tokens)

    # Keep only documents sharing at least one scoring term with the query.
    matches = [(entry, value) for entry, value in zip(all_documents, scores) if value > 0]
    if not matches:
        print("      > No lexical matches found (all scores were zero).")
        return []

    matches.sort(key=lambda pair: pair[1], reverse=True)
    survivors = [entry for entry, _ in matches[:LEXICAL_SEARCH_TOP_K]]

    print(f"      > Reduced documents from {total} to {len(survivors)}.")
    return survivors


def identify_and_extract_constraints(persona: str, job_to_be_done: str) -> tuple[list[str], list[str]]:
    """Extract inclusionary and exclusionary constraints from the request.

    The generative model (CTranslate2 translator when loaded, otherwise the
    PyTorch fallback) is prompted for a comma-separated constraint list. Each
    lower-cased item is then classified as exclusionary when it contains an
    explicit exclusion trigger, and inclusionary otherwise.

    Args:
        persona: Role of the user, inserted into the prompt.
        job_to_be_done: Task description, inserted into the prompt.

    Returns:
        ``(inclusionary_constraints, exclusionary_constraints)``. Both lists
        are empty when generation fails (the error is printed).
    """
    print("\n[Stage 2a/4] Identifying constraints...")

    constraint_extraction_prompt = f'''Analyze the following request and extract ALL distinct key requirements and exclusions as a comma-separated list. Be precise and specific. Prioritize explicit mentions.

Example 1:
Request: "As a Food Contractor, I need to prepare a vegetarian buffet with gluten-free ingredients, but no nuts."
Constraints: vegetarian, gluten-free, no nuts

Example 2:
Request: "As a Legal Analyst, I need to find precedents on copyright law, excluding cases from the 2nd circuit."
Constraints: copyright law, excluding 2nd circuit

Request: "As a {persona}, I need to {job_to_be_done}"
Constraints:'''

    # "no " and "not " are anchored on a word boundary so that they no longer
    # fire inside other words ("piano lessons", "casino games", "knot tying").
    # The remaining triggers keep their original substring semantics.
    exclusion_trigger_pattern = re.compile(
        r"\bno |\bnot |excluding|without|free from|avoid|don't use|exclude|except|prohibit"
    )

    try:
        if generative_ct2_translator:
            input_tokens_str = generative_tokenizer.convert_ids_to_tokens(generative_tokenizer.encode(constraint_extraction_prompt, max_length=512, truncation=True))
            results = generative_ct2_translator.translate_batch(
                [input_tokens_str],
                max_decoding_length=100,
                beam_size=2,
                return_scores=False,
                repetition_penalty=1.0
            )
            raw_constraints_text = generative_tokenizer.decode(
                generative_tokenizer.convert_tokens_to_ids(results[0].hypotheses[0]),
                skip_special_tokens=True
            ).strip()
        else:
            inputs = generative_tokenizer(constraint_extraction_prompt, return_tensors="pt", max_length=512, truncation=True).to(device)
            with torch.no_grad():
                constraint_ids = generative_model_pytorch_fallback.generate(
                    inputs.input_ids,
                    max_length=100,
                    num_beams=2,
                    early_stopping=True,
                    do_sample=False
                )
            raw_constraints_text = generative_tokenizer.decode(constraint_ids[0], skip_special_tokens=True).strip()

        all_constraints_list = [c.strip().lower() for c in raw_constraints_text.split(',') if c.strip()]

        inclusionary_constraints = []
        exclusionary_constraints = []
        for constraint in all_constraints_list:
            if exclusion_trigger_pattern.search(constraint):
                exclusionary_constraints.append(constraint)
            else:
                inclusionary_constraints.append(constraint)

        print(f"      > All Raw Constraints: {all_constraints_list}")
        print(f"      > Inclusionary Constraints: {inclusionary_constraints}")
        print(f"      > Exclusionary Constraints (for LLM verification): {exclusionary_constraints}")

        return inclusionary_constraints, exclusionary_constraints
    except Exception as e:
        print(f"      > Could not identify constraints. Error: {e}")
        return [], []

def llm_based_verification_worker(lc_document: Document, persona: str, job_to_be_done: str,
                                  inclusionary_constraints: list[str], exclusionary_constraints: list[str]) -> tuple[bool, str, Document]:
    """Stage 3/4: ask the generative model whether a section meets the constraints.

    Args:
        lc_document: Section to verify; ``page_content`` and the
            ``section_title``/``source``/``page_number`` metadata are read.
        persona: Role of the user, inserted into the prompt.
        job_to_be_done: Task description, inserted into the prompt.
        inclusionary_constraints: Requirements the content must support.
        exclusionary_constraints: Requirements the content must not violate.

    Returns:
        ``(is_compliant, reason, lc_document)``. ``reason`` is ``None`` when
        the model's answer starts with "yes"; otherwise it is the text after
        "no:", a generic message for any other answer, or the error text when
        generation raised.
    """
    metadata = lc_document.metadata
    section_title = metadata.get("section_title", "N/A")
    content = lc_document.page_content
    document_name = metadata.get("source", "N/A")
    page_number = metadata.get("page_number", "N/A")

    # Empty constraint lists are presented to the model as the word "None".
    inc_constraints_str = ", ".join(inclusionary_constraints) if inclusionary_constraints else "None"
    exc_constraints_str = ", ".join(exclusionary_constraints) if exclusionary_constraints else "None"

    prompt = f'''You are a highly analytical and strict verifier. A "{persona}" needs to "{job_to_be_done}".
Here are the specific requirements and exclusions:
Inclusionary constraints (Content MUST clearly support or be highly relevant to these): {inc_constraints_str}
Exclusionary constraints (Content MUST NOT violate or contradict these in any way): {exc_constraints_str}

You are given a section from a document:
Section Title: "{section_title}"
Content: "{content}"

Evaluate this section *strictly* based on its content:
1. Does this section directly and clearly address *all* aspects of the *inclusionary* constraints? (If 'None', this condition is met).
2. Is this section entirely free from *any* violation or contradiction of the *exclusionary* constraints? (If 'None', this condition is met).
3.Is this heading relevant to "{persona}" to perform "{job_to_be_done}"


Answer only "YES" if it satisfies all three conditions.
Answer "NO: " followed by a concise, specific reason if it fails either condition.
Example reasons: "NO: Does not contain vegetarian options.", "NO: Mentions nuts, which are excluded.", "NO: Irrelevant to copyright law."

Verification Output:'''

    try:
        if generative_ct2_translator:
            prompt_ids = generative_tokenizer.encode(prompt, max_length=1024, truncation=True)
            prompt_tokens = generative_tokenizer.convert_ids_to_tokens(prompt_ids)
            translations = generative_ct2_translator.translate_batch(
                [prompt_tokens],
                max_decoding_length=70,
                beam_size=1,  # Single beam for strict, direct answers
                return_scores=False,
                repetition_penalty=1.0
            )
            answer_ids = generative_tokenizer.convert_tokens_to_ids(translations[0].hypotheses[0])
            verdict = generative_tokenizer.decode(answer_ids, skip_special_tokens=True).strip().lower()
        else:
            encoded = generative_tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(device)
            with torch.no_grad():
                generated = generative_model_pytorch_fallback.generate(
                    encoded.input_ids, max_length=70, num_beams=1, do_sample=False, temperature=0.0
                )
            verdict = generative_tokenizer.decode(generated[0], skip_special_tokens=True).strip().lower()

        if verdict.startswith("yes"):
            return True, None, lc_document

        if verdict.startswith("no:"):
            reason = verdict.replace("no:", "").strip()
        else:
            reason = "LLM provided ambiguous or no specific reason."
        return False, reason, lc_document
    except Exception as e:
        print(f"          > Error during LLM nuance verification for '{section_title}' (Doc: {document_name}, Page {page_number}): {e}")
        return False, f"LLM verification failed: {e}", lc_document

def generate_helpful_analysis_worker(lc_document: Document, persona: str, job_to_be_done: str) -> dict:
    """Generate an explanation of how a section helps the persona.

    The target length of the explanation scales with the word count of the
    section content (three tiers). The generated text is stripped of
    surrounding quotes/spaces and, when it does not already start with the
    section title, prefixed with ``"<section title>: "``.

    Args:
        lc_document: Section to analyse; ``page_content``, ``id`` and the
            ``section_title``/``source``/``page_number`` metadata are read.
        persona: Role of the user, inserted into the prompt.
        job_to_be_done: Task description, inserted into the prompt.

    Returns:
        A dict with the keys ``'document'``, ``'refined_text'``,
        ``'page_number'`` and ``'id'``. On failure ``'refined_text'`` holds
        an error placeholder instead of the analysis.
    """
    section_title = lc_document.metadata.get("section_title", "N/A")
    content = lc_document.page_content
    document_name = lc_document.metadata.get("source", "N/A")
    page_number = lc_document.metadata.get("page_number", "N/A")
    doc_id = lc_document.id  # Get the document ID

    content_length = len(content.split())

    # Pick the summary length tier from the size of the source content
    if content_length < 50:
        min_summary_words = 10
        max_summary_words = 30
        length_guidance = "Keep it very concise, reflecting only what's explicitly present. Aim for 10-30 words."
    elif content_length < 200:
        min_summary_words = 30
        max_summary_words = 70
        length_guidance = "Answer in some detail with good context, covering all key points present. Aim for 30-70 words."
    else:
        min_summary_words = 60
        max_summary_words = 120
        length_guidance = "Provide a comprehensive overview, covering everything present without inventing details. Aim for 60-120 words."

    # Convert word counts to token counts, assuming 1 word is approx 1.5 tokens for T5
    min_tokens = int(min_summary_words * 1.5)
    max_tokens = int(max_summary_words * 1.5)

    # Keep a sensible floor, leave headroom above it, and cap the upper bound
    min_tokens = max(15, min_tokens)
    max_tokens = min(300, max(min_tokens + 10, max_tokens))

    prompt = f'''Your task is to act as an intelligent assistant for a "{persona}".
Their objective is: "{job_to_be_done}".

You are provided with a section from a document. Explain how this *specific* section is useful for the persona's objective.
Your explanation MUST be *strictly* based on the provided "Content" and should NOT introduce any outside information or make assumptions. Do NOT hallucinate.

If the "Content" is very short or provides limited information, keep your explanation equally brief and directly reflect the existing content without inventing details. Make sure to cover all points present.
{length_guidance}

Your response MUST start with the exact section title, followed by a colon, and then a brief, helpful explanation in a paragraph format covering everything present under that heading content section.

Example:
Persona: "Financial Analyst"
Objective: "Analyze the Q3 performance of tech companies."
Section Title: "Server Infrastructure Costs"
Content: "Our server costs increased by 20% to $5M due to expansion into the EU market. We anticipate these costs to level out in Q4."
Helpful Explanation: "Server Infrastructure Costs: This section is crucial as it quantifies a major operational expense ($5M), explains the 20% increase due to EU expansion, and provides a forecast for Q4, which is essential for analyzing Q3 profitability and predicting future performance, offering valuable insight for financial analysis."

Now, generate the helpful explanation for the following:
Persona: "{persona}"
Objective: "{job_to_be_done}"
Section Title: "{section_title}"
Content: "{content}"

Helpful Explanation: strictly adhere to what is provided and don't generate new info or hallucinate, make sure prompt itself is not repeated or any sort of repeaated sentences are present, high quality and no repitition of sentences, don't repeat the {job_to_be_done} itself'''
    try:
        if generative_ct2_translator:
            input_tokens_str = generative_tokenizer.convert_ids_to_tokens(generative_tokenizer.encode(prompt, max_length=1024, truncation=True))
            results = generative_ct2_translator.translate_batch(
                [input_tokens_str],
                max_decoding_length=max_tokens,
                min_decoding_length=min_tokens,
                beam_size=4,
                repetition_penalty=1.5,
                return_scores=False
            )
            analysis_text = generative_tokenizer.decode(
                generative_tokenizer.convert_tokens_to_ids(results[0].hypotheses[0]),
                skip_special_tokens=True
            ).strip()
        else:
            inputs = generative_tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(device)
            with torch.no_grad():
                analysis_ids = generative_model_pytorch_fallback.generate(
                    inputs.input_ids,
                    max_length=max_tokens,
                    min_length=min_tokens,
                    length_penalty=2.0,
                    num_beams=4,
                    early_stopping=True,
                    no_repeat_ngram_size=2,  # Prevent direct repetition of n-grams
                    repetition_penalty=1.5  # Consistent with CT2
                )
            analysis_text = generative_tokenizer.decode(analysis_ids[0], skip_special_tokens=True).strip()

        # Strip stray quotes/spaces BEFORE checking the prefix: the prompt's
        # example is quoted, so the model may answer '"Title: ..."'. Checking
        # first would prepend the title a second time and then strip only the
        # trailing quote, leaving a dangling opening quote in the middle.
        analysis_text = analysis_text.strip(' "')

        # Ensure the analysis starts with the section title for consistency
        if not analysis_text.lower().startswith(section_title.lower()):
            analysis_text = f"{section_title}: {analysis_text}"

        if len(analysis_text.split()) < min_summary_words * 0.75 and content_length > 10:
            print(f"Warning: Generated analysis for '{section_title}' was unexpectedly short. Content length: {content_length} words, Summary length: {len(analysis_text.split())} words.")

        return {"document": document_name, "refined_text": analysis_text, "page_number": page_number, "id": doc_id}
    except Exception as e:
        print(f"Could not generate analysis for '{section_title}' (Doc: {document_name}, Page {page_number}): {e}")
        return {"document": document_name, "refined_text": f"{section_title}: Error generating analysis.", "page_number": page_number, "id": doc_id}


def main():
    """Run the persona-driven document analysis pipeline end to end.

    The pipeline, in order:
      1. Converts the generative model to CTranslate2 when a quantization
         mode is configured and the converted model directory is missing;
         on failure, quantization is disabled for the rest of the run.
      2. Loads the models via ``load_models()``.
      3. Prompts on stdin for the persona and the job-to-be-done.
      4. Extracts content chunks from every PDF in the input directory and
         adds them to the FAISS vector store.
      5. Keeps only documents that pass both the lexical pre-filter and the
         semantic search.
      6. Verifies those candidates with the LLM against the constraints
         extracted from the persona and job.
      7. Generates an analysis for the top ``FINAL_TOP_K`` verified documents.
      8. Writes the result to ``challenge1b_output.json`` and prints it.

    Returns:
        None. The function returns early (after printing a message) when a
        required input is missing or a stage leaves no candidates.
    """
    global GENERATIVE_MODEL_QUANTIZATION

    # Build the target path only when a quantization mode is set:
    # `Path / None` raises TypeError, which would defeat the guard below.
    if GENERATIVE_MODEL_QUANTIZATION:
        ct2_target_path = CT2_MODEL_DIR / GENERATIVE_MODEL_QUANTIZATION
        if not ct2_target_path.is_dir():
            print("Pre-checking and attempting CTranslate2 conversion...")
            conversion_successful = convert_and_quantize_flan_t5(
                GENERATIVE_MODEL_NAME, ct2_target_path, GENERATIVE_MODEL_QUANTIZATION
            )
            if not conversion_successful:
                print("❗ CTranslate2 conversion failed. Falling back to PyTorch CPU model.")
                GENERATIVE_MODEL_QUANTIZATION = None
            elif HF_MODEL_SOURCE_PATH.is_dir():
                # The source model is deliberately kept for future use or
                # debugging, so the message must not claim it is removed.
                print(f"Conversion successful. Keeping Hugging Face source model directory for reuse: {HF_MODEL_SOURCE_PATH}")

    load_models() # This now also initializes the LangChainFAISS vector store
    print("\n" + "="*50 + "\nChallenge 1b: Persona-Driven Document Intelligence\n" + "="*50)

    persona = input("Enter the Persona (e.g., 'Food Contractor'): ").strip()
    job_to_be_done = input("Enter the Job-to-be-Done (e.g., 'Prepare a vegetarian buffet'): ").strip()
    if not persona or not job_to_be_done:
        print("❌ Persona and Job-to-be-Done are required.")
        return

    # Prefer the container mount point; fall back to a relative directory.
    input_directory = Path("/app/input") if Path("/app/input").is_dir() else Path("input")
    if not input_directory.is_dir():
        print(f"❌ Input directory '{input_directory}' not found. Creating it now.")
        input_directory.mkdir(parents=True, exist_ok=True)
        print("Please place your PDF documents in the 'input' directory.")
        return

    pdf_files = [f for f in input_directory.iterdir() if f.is_file() and f.suffix.lower() == '.pdf']
    if not pdf_files:
        print(f"❌ No PDF files found in '{input_directory}'. Please add PDF files to process.")
        return

    start_time = datetime.datetime.now()

    # os.cpu_count() may return None; resolve it once so every pool below
    # gets a valid positive worker count.
    worker_count = os.cpu_count() or 1

    print(f"\n[Ongoing] Extracting content from {len(pdf_files)} documents and indexing to FAISS...")
    all_lc_documents = []
    # Extraction runs in worker threads; indexing happens here in the
    # calling thread as each extraction completes.
    with ThreadPoolExecutor(max_workers=worker_count * 2) as executor:
        future_to_pdf = {executor.submit(extract_content_chunks, pdf): pdf for pdf in pdf_files}
        for future in as_completed(future_to_pdf):
            try:
                extracted_lc_documents = future.result()
                if extracted_lc_documents:
                    all_lc_documents.extend(extracted_lc_documents)
                    _add_documents_to_faiss(extracted_lc_documents, langchain_vector_store)
            except Exception as exc:
                # Best effort: one failing PDF must not abort the others.
                print(f'{future_to_pdf[future].name} generated an exception: {exc}')

    if not all_lc_documents:
        print("❌ Could not extract any content from PDFs.")
        return
    print(f"      > Total extracted {len(all_lc_documents)} LangChain Document sections.")

    search_query = f"{persona} {job_to_be_done}"

    candidate_lc_documents_lexical = lexical_pre_filter(all_lc_documents, search_query)
    if not candidate_lc_documents_lexical:
        print("\n❌ No relevant content found after lexical pre-filtering.")
        return

    # The semantic query is issued with the same search query independently
    # of the lexical result; the two result sets are intersected afterwards
    # rather than fused or re-ranked.
    semantically_ranked_lc_documents_with_scores = _query_faiss_documents(search_query, SEMANTIC_SEARCH_TOP_N)
    if not semantically_ranked_lc_documents_with_scores:
        print("\n❌ No relevant content found after semantic ranking.")
        return

    # Keep only semantic hits that also survived the lexical pre-filter.
    lexical_ids = {doc.id for doc in candidate_lc_documents_lexical}
    semantically_ranked_lc_documents = [
        doc for doc, _score in semantically_ranked_lc_documents_with_scores
        if doc.id in lexical_ids
    ]
    if not semantically_ranked_lc_documents:
        print("\n❌ No relevant content found after combining lexical and semantic filtering.")
        return

    print(f"\n[Stage 2/4] Beginning deep analysis on {len(semantically_ranked_lc_documents)} candidates...")

    inclusionary_constraints, exclusionary_constraints = identify_and_extract_constraints(persona, job_to_be_done)

    print(f"\n[Stage 3/4] Performing LLM-based nuanced verification on top {len(semantically_ranked_lc_documents)} documents (in parallel)...")
    verified_final_lc_documents = []
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        future_to_doc_verification = {
            executor.submit(llm_based_verification_worker, doc, persona, job_to_be_done,
                            inclusionary_constraints, exclusionary_constraints): doc
            for doc in semantically_ranked_lc_documents
        }
        for future in as_completed(future_to_doc_verification):
            try:
                is_compliant, reason, original_lc_document = future.result()
                if is_compliant:
                    verified_final_lc_documents.append(original_lc_document)
                else:
                    doc_title = original_lc_document.metadata.get("section_title", "N/A")
                    doc_name = original_lc_document.metadata.get("source", "N/A")
                    page_num = original_lc_document.metadata.get("page_number", "N/A")
                    print(f"      > LLM Rejected: '{doc_title}' (Doc: {doc_name}, Page {page_num}) - Reason: {reason}")
            except Exception as exc:
                # Report which document failed; a failed check drops the document.
                problematic_doc = future_to_doc_verification[future]
                print(f"Error processing verification for doc '{problematic_doc.metadata.get('section_title', 'Unknown')}' from '{problematic_doc.metadata.get('source', 'Unknown')}': {exc}")

    original_scores = {doc.id: score for doc, score in semantically_ranked_lc_documents_with_scores}

    # Verification results arrive in completion order, so re-rank by the
    # semantic score recorded before verification.
    # NOTE(review): this sorts highest score first. If `_query_faiss_documents`
    # returns raw FAISS distances (lower = closer), this ranks the least
    # similar documents first — confirm the score direction.
    verified_final_lc_documents.sort(key=lambda doc: original_scores.get(doc.id, 0), reverse=True)

    top_final_lc_documents = verified_final_lc_documents[:FINAL_TOP_K]
    if not top_final_lc_documents:
        print("\n❌ No compliant content found after final deep verification by LLM. This indicates that even the relevant content did not meet all explicit constraints.")
        return

    print(f"\n[Stage 4/4] Generating helpful analysis for top {len(top_final_lc_documents)} documents (in parallel)...")
    subsection_analysis_unordered = []
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        future_to_doc_analysis = {
            executor.submit(generate_helpful_analysis_worker, doc, persona, job_to_be_done): doc
            for doc in top_final_lc_documents
        }
        for future in as_completed(future_to_doc_analysis):
            try:
                subsection_analysis_unordered.append(future.result())
            except Exception as exc:
                problematic_doc = future_to_doc_analysis[future]
                doc_title = problematic_doc.metadata.get("section_title", "N/A")
                doc_name = problematic_doc.metadata.get("source", "N/A")
                page_num = problematic_doc.metadata.get("page_number", "N/A")
                print(f"Error generating analysis for doc '{doc_title}' from '{doc_name}' (Page {page_num}): {exc}")
                # Keep a placeholder (with the id) so the entry can still be
                # matched back to its document below.
                subsection_analysis_unordered.append({
                    "document": doc_name,
                    "refined_text": f"{doc_title}: Error generating analysis due to an internal issue.",
                    "page_number": page_num,
                    "id": problematic_doc.id
                })

    # Index the analyses by document id once (first occurrence wins) instead
    # of scanning the whole list for every ranked document.
    analysis_by_id = {}
    for analysis in subsection_analysis_unordered:
        analysis_by_id.setdefault(analysis.get('id'), analysis)

    # Emit the analyses in the same order as the ranked documents.
    subsection_analysis = []
    for lc_doc in top_final_lc_documents:
        found_analysis = analysis_by_id.get(lc_doc.id)
        if found_analysis:
            subsection_analysis.append(found_analysis)
        else:
            doc_title_fallback = lc_doc.metadata.get('section_title', f"Doc ID: {lc_doc.id}")
            doc_name_fallback = lc_doc.metadata.get('source', 'Unknown Document')
            page_num_fallback = lc_doc.metadata.get('page_number', 'Unknown Page')
            subsection_analysis.append({
                "document": doc_name_fallback,
                "refined_text": f"{doc_title_fallback}: [Analysis Missing or Error for original content: {lc_doc.page_content[:50]}...]",
                "page_number": page_num_fallback,
                "id": lc_doc.id
            })

    print("\n[Done] Assembling final output...")
    extracted_sections = [
        {
            "document": lc_doc.metadata.get("source"),
            "section_title": lc_doc.metadata.get("section_title"),
            "importance_rank": rank,
            "page_number": lc_doc.metadata.get("page_number")
        }
        for rank, lc_doc in enumerate(top_final_lc_documents, start=1)
    ]

    final_output = {
        "metadata": {
            "input_documents": sorted(str(f.name) for f in pdf_files),
            "persona": persona,
            "job_to_be_done": job_to_be_done,
            "processing_timestamp": datetime.datetime.now().isoformat()
        },
        "extracted_sections": extracted_sections,
        "subsection_analysis": subsection_analysis
    }

    end_time = datetime.datetime.now()
    print(f"\n--- Processing Complete in {(end_time - start_time).total_seconds():.2f} seconds ---")

    # Prefer the container mount point; fall back to a relative directory.
    output_directory = Path("/app/output") if Path("/app/output").is_dir() else Path("output")
    output_directory.mkdir(parents=True, exist_ok=True)
    output_filename = output_directory / "challenge1b_output.json"

    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(final_output, f, indent=4, ensure_ascii=False)

    print(f"✓ Results saved to {output_filename}")
    print("\n" + "="*20 + " FINAL JSON OUTPUT " + "="*20)
    print(json.dumps(final_output, indent=2, ensure_ascii=False))
    print("="*59)

# Entry point: run the pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Pre-checking and attempting CTranslate2 conversion...
Attempting to convert google/flan-t5-base to CTranslate2 int8 format...
Hugging Face source model not found at ct2_models/hf_model_source. Saving now...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Hugging Face model saved to ct2_models/hf_model_source.
Running conversion command: ct2-transformers-converter --model ct2_models/hf_model_source --output_dir ct2_models/int8 --quantization int8 --copy_files tokenizer.json tokenizer_config.json special_tokens_map.json --force
Conversion to int8 complete and saved to ct2_models/int8.
Conversion successful. Cleaning up Hugging Face source model directory: ct2_models/hf_model_source
Using device: CPU
Loading retrieval embedding model (via LangChain HuggingFaceEmbeddings): intfloat/e5-small...


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/362 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Retrieval embedding model loaded successfully.
Loading generative model: google/flan-t5-base...


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Attempting to load CTranslate2 model from ct2_models/int8...
CTranslate2 model (int8) loaded successfully.
Generative model pipeline loaded successfully.
Clearing existing FAISS database directory: faiss_index_db to create a new database.
FAISS_DB_PATH 'faiss_index_db' is already empty or does not exist. Proceeding.
Creating a new FAISS index.


  return forward_call(*args, **kwargs)


Successfully removed dummy document with ID: 4571cba4-5079-4b2b-92a4-1168ad0ee084
LangChain FAISS vector store initialized successfully.

Challenge 1b: Persona-Driven Document Intelligence
Enter the Persona (e.g., 'Food Contractor'): HR porfessional
Enter the Job-to-be-Done (e.g., 'Prepare a vegetarian buffet'): Create and manage fillable forms for onboarding and compliance

[Ongoing] Extracting content from 15 documents and indexing to FAISS...
      > Adding 3 new documents to FAISS vector store...
      > FAISS index and metadata saved to faiss_index_db.
      > Successfully added 3 documents to FAISS. Total in DB: 3
      > Adding 20 new documents to FAISS vector store...
      > FAISS index and metadata saved to faiss_index_db.
      > Successfully added 20 documents to FAISS. Total in DB: 23
      > Adding 20 new documents to FAISS vector store...
      > FAISS index and metadata saved to faiss_index_db.
      > Successfully added 20 documents to FAISS. Total in DB: 43
      > Ad