In [None]:
import sys
from pathlib import Path

# The parent of the current working directory is treated as the project root,
# i.e. the notebook is expected to be launched from a direct sub-directory.
project_root = Path.cwd().parent
project_root_str = str(project_root)

# Place the project root at the front of sys.path exactly once. Rebuilding the
# list in place keeps the root first without accumulating duplicate entries
# when this cell is re-run.
sys.path[:] = [project_root_str] + [p for p in sys.path if p != project_root_str]

print(f"Project root: {project_root}")
print(f"Python path modified: {project_root_str in sys.path}")

try:
    # Import every top-level project package the later cells import from, so a
    # wrong path fails here instead of half-way through the pipeline.
    import client
    import config
    import models
    import parsers
    import nlp
    import llm
    import utils
    print("All project modules are now accessible")
except ModuleNotFoundError as e:
    # Report which package is missing, then re-raise so the run stops here.
    print(f"Module import failed: {e}")
    raise

In [2]:
from client.arxiv import ArXivClient
from client.grobid import GROBIDClient
from models.grobid import Form, File
from parsers.tei import Parser
from config.llm import LangExtractConfig, GeminiConfig
from config.nlp import NLPConfig
from nlp.structural import SectionProcessor
from nlp.semantic import SemanticExtractor
from nlp.syntactic import parse
from nlp.entity_pairs import create_pairs, filter_by_type
from nlp.relation import RelationExtractor
from utils.clean_text import preprocess_section
import json
import os
from pathlib import Path

In [3]:
# Directory that receives every file this notebook writes (PDF, TEI, JSON).
# It is created relative to the current working directory if missing.
output_dir = Path("output")
output_dir.mkdir(exist_ok=True)

# arXiv identifier of the paper to process; also used as the file-name prefix.
arxiv_id = "2103.15348"

In [4]:
# Location of the downloaded PDF inside the output directory.
pdf_path = output_dir / f"{arxiv_id}.pdf"

# Ask the arXiv client for the paper's metadata, then download the PDF to
# pdf_path (passed as a plain string).
arxiv_client = ArXivClient()
metadata = arxiv_client.get_metadata(arxiv_id)
arxiv_client.download_pdf(arxiv_id, str(pdf_path))

In [5]:
grobid_client = GROBIDClient()

# Read the downloaded PDF fully into memory so it can be attached to the form.
pdf_bytes = pdf_path.read_bytes()

# Request payload for GROBID: the PDF itself plus the processing options.
form = Form(
    file=File(payload=pdf_bytes, file_name=f"{arxiv_id}.pdf"),
    consolidate_citations=1,
    consolidate_header=1,
    segment_sentences=True,
)

# Where the raw TEI response is stored next to the PDF.
tei_path = output_dir / f"{arxiv_id}.tei.xml"

# NOTE(review): the response body is written to disk without a status check in
# this cell — confirm that GROBIDClient.process_pdf raises on failure.
response = grobid_client.process_pdf(form)

# Last expression of the cell: the number of bytes written is the cell output.
tei_path.write_bytes(response.content)

90851

In [6]:
# Parse the TEI XML bytes returned by GROBID into the project's article
# object; later cells iterate over article.sections.
parser = Parser(response.content)
article = parser.parse()

In [7]:
# Flatten every parsed section into one record holding its title, the joined
# paragraph text, and the preprocessed version of that text.
sections_data = []
for section in article.sections:
    # Paragraphs are joined with single spaces; surrounding whitespace is trimmed.
    raw_text = " ".join(paragraph.plain_text for paragraph in section.paragraphs).strip()
    clean_text = preprocess_section(raw_text)

    sections_data.append(
        {
            "title": section.title,
            "raw_text": raw_text,
            "clean_text": clean_text,
        }
    )

In [8]:
# Configuration objects for the extraction stages: the LangExtract settings
# are read by the entity-extraction cell, the Gemini settings are handed to the
# relation extractor, and NLPConfig supplies the section patterns and the
# per-section extraction settings.
langextract_config = LangExtractConfig()
gemini_config = GeminiConfig()
nlp_config = NLPConfig()
# NOTE(review): semantic_extractor is not referenced by any other cell visible
# in this notebook — confirm whether it is still needed.
semantic_extractor = SemanticExtractor(langextract_config)

In [9]:
import langextract as lx
from llm.prompts.langextract import PROMPT, EXAMPLES
from config.nlp import normalize_section

# Characters that are unsafe in file names on at least one common platform
# (plus the space); each is mapped to "_" when building per-section file names.
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: "_" for ch in ' /\\:*?"<>|'})


def _entity_to_dict(extraction, section_title):
    """Convert one LangExtract extraction into a JSON-serialisable dict.

    Args:
        extraction: Object exposing ``extraction_text``, ``extraction_class``
            and ``char_interval`` (the latter may be falsy).
        section_title: Title of the section the extraction came from.

    Returns:
        Dict with the keys ``text``, ``type``, ``char_interval`` and
        ``section``; ``char_interval`` is ``None`` when the extraction carries
        no interval.
    """
    interval = extraction.char_interval
    return {
        "text": extraction.extraction_text,
        "type": extraction.extraction_class,
        "char_interval": (
            {"start_pos": interval.start_pos, "end_pos": interval.end_pos}
            if interval
            else None
        ),
        "section": section_title,
    }


all_entities = []

for section_data in sections_data:
    # Pick the extraction settings for this kind of section, falling back to
    # the "default" entry when the normalised title has no dedicated config.
    normalized_title = normalize_section(section_data["title"], nlp_config.patterns)
    section_config = nlp_config.sections.get(normalized_title, nlp_config.sections["default"])

    section_text = section_data["clean_text"]
    if section_text and section_text.strip():
        result = lx.extract(
            text_or_documents=section_text,
            prompt_description=PROMPT,
            examples=EXAMPLES,
            model_id=langextract_config.model_id,
            api_key=langextract_config.api_key,
            extraction_passes=section_config.extraction_passes,
            max_workers=langextract_config.max_workers,
            max_char_buffer=section_config.max_char_buffer,
        )
        # `or []` keeps the loop below safe when the result carries no
        # extractions list (None).
        extractions = result.extractions or []
    else:
        # A section without text cannot contain entities; skip the model call
        # but still write an (empty) per-section file below.
        extractions = []

    section_entities = [
        _entity_to_dict(extraction, section_data["title"]) for extraction in extractions
    ]

    # NOTE(review): the file name depends on the title only, so two sections
    # sharing a title overwrite each other's file — confirm this is acceptable.
    title = section_data["title"].translate(_UNSAFE_FILENAME_CHARS)
    section_json = json.dumps(section_entities, indent=2)
    section_path = output_dir / f"{arxiv_id}_{title}_entities.json"
    section_path.write_text(section_json)

    all_entities.extend(section_entities)

In [10]:
# Persist the entities of every section as a single pretty-printed JSON file.
# The write_text call is the cell's last expression, so the number of
# characters written is shown as the cell output.
all_entities_path = output_dir / f"{arxiv_id}_all_entities.json"
all_entities_json = json.dumps(all_entities, indent=2)
all_entities_path.write_text(all_entities_json)

104995

In [11]:
# Group the flat entity list by the section each entity was extracted from.
# Sections keep first-seen order and entities keep their original order.
entities_by_section = {}
for entity in all_entities:
    entities_by_section.setdefault(entity["section"], []).append(entity)

In [12]:
# Relation extractor built from the Gemini configuration object; the relation
# loop uses it to classify each candidate entity pair.
relation_extractor = RelationExtractor(gemini_config)

In [21]:
# Classify relations between nearby entity pairs, section by section.
#
# NOTE(review): VALID_TYPE_PAIRS, find_entity_in_doc and verbalize_path are not
# defined or imported in any cell visible in this notebook — confirm where they
# come from, otherwise this cell fails with NameError on a fresh kernel.
all_relations = []
MAX_PAIRS_PER_SECTION = 20  # upper bound on classifier calls per section

for section_idx, (section_title, section_entities) in enumerate(entities_by_section.items()):
    # A relation needs two entities; nothing to do for smaller sections.
    if len(section_entities) < 2:
        continue

    print(f"\n[Section {section_idx+1}/{len(entities_by_section)}] {section_title}")
    print(f"  Entities: {len(section_entities)}")

    # First section record with this title supplies the text to parse.
    section_data = next(s for s in sections_data if s["title"] == section_title)
    doc = parse(section_data["clean_text"])

    pairs_processed = 0
    relations_found = 0

    for i, e1 in enumerate(section_entities):
        if pairs_processed >= MAX_PAIRS_PER_SECTION:
            break

        # Pair each entity only with the (up to) five entities listed after it;
        # slicing already clamps at the end of the list.
        for e2 in section_entities[i + 1 : i + 6]:
            if pairs_processed >= MAX_PAIRS_PER_SECTION:
                break

            head_type = e1["type"].upper()
            tail_type = e2["type"].upper()
            if (head_type, tail_type) not in VALID_TYPE_PAIRS:
                continue

            span1 = find_entity_in_doc(e1["text"], doc)
            span2 = find_entity_in_doc(e2["text"], doc)

            if span1 and span2 and span1.sent == span2.sent:
                # Both entities were located in the same sentence: use that
                # sentence and the verbalised path between the two span roots.
                syntax = verbalize_path(span1.root, span2.root)
                sentence = span1.sent.text
            else:
                # Otherwise fall back to the first 500 characters of the
                # section text and a placeholder syntax description.
                syntax = "no pattern"
                sentence = section_data["clean_text"][:500]

            pair = {
                "head": {"text": e1["text"], "type": head_type},
                "tail": {"text": e2["text"], "type": tail_type},
                "sentence": sentence,
                "syntax": syntax,
            }

            result = relation_extractor._classify(pair)
            pairs_processed += 1

            print(f"  [{pairs_processed}/{MAX_PAIRS_PER_SECTION}] {e1['text'][:20]} -> {e2['text'][:20]}", end="")

            if result["relation"] == "NONE":
                print(" ✗")
                continue

            relations_found += 1
            print(f" ✓ {result['relation']}")

            # Stored records keep the entity types as extracted (not upper-cased).
            all_relations.append(
                {
                    "head": e1["text"],
                    "head_type": e1["type"],
                    "tail": e2["text"],
                    "tail_type": e2["type"],
                    "relation": result["relation"],
                    "confidence": result["confidence"],
                    "section": section_title,
                    "evidence": sentence,
                    "syntax": syntax,
                    "reasoning": result.get("reasoning", ""),
                }
            )

    print(f"  Found: {relations_found} relations")


[Section 1/13] Introduction
  Entities: 80
  [1/20] Deep Learning -> document image analy ✓ used_for
  [2/20] Deep Learning -> document image class ✓ used_for
  [3/20] Deep Learning -> layout detection ✓ used_for
  [4/20] Deep Learning -> table detection ✓ used_for
  [5/20] Deep Learning -> scene text detection ✓ used_for
  [6/20] table detection -> document digitizatio ✗
  [7/20] scene text detection -> document digitizatio ✗
  [8/20] A generalized learni -> complicated rules ✓ improves_upon
  [9/20] A generalized learni -> traditional methods ✓ improves_upon
  [10/20] A generalized learni -> document digitizatio ✗
  [11/20] A generalized learni -> Tensor Flow ✗
  [12/20] complicated rules -> document digitizatio ✗
  [13/20] complicated rules -> Tensor Flow ✗
  [14/20] complicated rules -> PyTorch ✗
  [15/20] traditional methods -> document digitizatio ✗
  [16/20] traditional methods -> Tensor Flow ✓ compared_with
  [17/20] traditional methods -> PyTorch ✗
  [18/20] Existing models -

In [None]:
# Persist every relation collected so far as one pretty-printed JSON file.
# The write_text call is the cell's last expression, so its return value (the
# number of characters written) becomes the cell output.
relations_path = output_dir / f"{arxiv_id}_relations.json"
relations_json = json.dumps(all_relations, indent=2)
relations_path.write_text(relations_json)

In [23]:
# Display the complete list of extracted entities as the cell output.
all_entities

[{'text': 'Deep Learning',
  'type': 'method',
  'char_interval': {'start_pos': 0, 'end_pos': 13},
  'section': 'Introduction'},
 {'text': 'document image analysis',
  'type': 'task',
  'char_interval': {'start_pos': 80, 'end_pos': 103},
  'section': 'Introduction'},
 {'text': 'document image classification',
  'type': 'task',
  'char_interval': {'start_pos': 126, 'end_pos': 155},
  'section': 'Introduction'},
 {'text': 'layout detection',
  'type': 'task',
  'char_interval': {'start_pos': 204, 'end_pos': 220},
  'section': 'Introduction'},
 {'text': 'table detection',
  'type': 'task',
  'char_interval': {'start_pos': 229, 'end_pos': 244},
  'section': 'Introduction'},
 {'text': 'scene text detection',
  'type': 'task',
  'char_interval': {'start_pos': 254, 'end_pos': 274},
  'section': 'Introduction'},
 {'text': 'A generalized learning-based framework',
  'type': 'method',
  'char_interval': {'start_pos': 279, 'end_pos': 317},
  'section': 'Introduction'},
 {'text': 'complicated rule

In [24]:
# Display the complete list of extracted relations as the cell output.
all_relations

[{'head': 'Deep Learning',
  'head_type': 'method',
  'tail': 'document image analysis',
  'tail_type': 'task',
  'relation': 'used_for',
  'confidence': 'HIGH',
  'section': 'Introduction',
  'evidence': 'Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classification[11, arXiv:2103.15348v2[cs.',
  'syntax': "using 'for'",
  'reasoning': "The context states 'Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks', directly indicating that Deep Learning is used to perform or solve document image analysis tasks."},
 {'head': 'Deep Learning',
  'head_type': 'method',
  'tail': 'document image classification',
  'tail_type': 'task',
  'relation': 'used_for',
  'confidence': 'HIGH',
  'section': 'Introduction',
  'evidence': 'Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks