In [None]:
import sys
from pathlib import Path

# Make the project's top-level packages importable from this notebook.
# The project root is taken to be the parent of the current working directory.
project_root = Path.cwd().parent
root_entry = str(project_root)

# Remove any earlier copy before inserting, so re-running this cell leaves
# exactly one entry for the project root and keeps it first on the search path.
sys.path[:] = [entry for entry in sys.path if entry != root_entry]
sys.path.insert(0, root_entry)

print(f"Project root: {project_root}")
print(f"Python path modified: {root_entry in sys.path}")

# Import each top-level package once here so that a wrong project root fails
# in this cell instead of in a later one. The error is reported and re-raised.
try:
    import client
    import config
    import models
    import parsers
    import nlp
    print("All project modules are now accessible")
except ModuleNotFoundError as e:
    print(f"Module import failed: {e}")
    raise

In [2]:
from client.arxiv import ArXivClient
from client.grobid import GROBIDClient
from models.grobid import Form, File
from parsers.tei import Parser
from config.llm import LangExtractConfig, GeminiConfig
from config.nlp import NLPConfig
from nlp.structural import SectionProcessor
from nlp.semantic import SemanticExtractor
from nlp.syntactic import parse
from nlp.entity_pairs import create_pairs, filter_by_type
from nlp.relation import RelationExtractor
from utils.clean_text import preprocess_section
import json
import os
from pathlib import Path

In [3]:
# Paper to process, identified by its arXiv ID, and the directory (relative to
# the current working directory) that receives the files this notebook writes.
arxiv_id = "1706.03762"
output_dir = Path("output")
# exist_ok=True keeps this cell re-runnable once the directory exists.
output_dir.mkdir(exist_ok=True)

In [4]:
# Fetch the paper's metadata and download its PDF to output_dir/<arxiv_id>.pdf.
arxiv_client = ArXivClient()
# NOTE(review): `metadata` is not referenced by any later cell visible here.
metadata = arxiv_client.get_metadata(arxiv_id)
pdf_path = output_dir / f"{arxiv_id}.pdf"
# The destination is passed to download_pdf as a plain string.
arxiv_client.download_pdf(arxiv_id, str(pdf_path))

In [5]:
# Submit the downloaded PDF to GROBID and store the response body next to it
# as <arxiv_id>.tei.xml.
grobid_client = GROBIDClient()

pdf_bytes = pdf_path.read_bytes()
pdf_file = File(payload=pdf_bytes, file_name=f"{arxiv_id}.pdf")

form = Form(
    file=pdf_file,
    consolidate_citations=1,
    consolidate_header=1,
    segment_sentences=True,
)

response = grobid_client.process_pdf(form)
tei_path = output_dir / f"{arxiv_id}.tei.xml"
# write_bytes returns the byte count, which becomes the cell's displayed value.
tei_path.write_bytes(response.content)

87297

In [6]:
# Parse the GROBID response body with the TEI parser; later cells read
# `article.sections`.
parser = Parser(response.content)
article = parser.parse()

In [7]:
# Flatten each section into one string (paragraphs separated by a single
# space) and keep both the raw and the preprocessed version of that text.
sections_data = []
for section in article.sections:
    raw_text = " ".join(paragraph.plain_text for paragraph in section.paragraphs).strip()
    sections_data.append(
        {
            "title": section.title,
            "raw_text": raw_text,
            "clean_text": preprocess_section(raw_text),
        }
    )

In [8]:
# Instantiate the configuration objects read by the extraction cells below.
langextract_config = LangExtractConfig()
gemini_config = GeminiConfig()
nlp_config = NLPConfig()
# NOTE(review): `semantic_extractor` is not referenced by any later cell visible here.
semantic_extractor = SemanticExtractor(langextract_config)

In [None]:
import langextract as lx
from llm.prompts.langextract import PROMPT, EXAMPLES
from config.nlp import normalize_section

# Run LangExtract over every section, write the entities found in each section
# to its own JSON file, and accumulate all of them in `all_entities`.
all_entities = []

for section_data in sections_data:
    # Per-section extraction settings; the "default" entry is used when the
    # normalized title has no dedicated configuration.
    normalized_title = normalize_section(section_data["title"], nlp_config.patterns)
    section_config = nlp_config.sections.get(normalized_title, nlp_config.sections["default"])

    result = lx.extract(
        text_or_documents=section_data["clean_text"],
        prompt_description=PROMPT,
        examples=EXAMPLES,
        model_id=langextract_config.model_id,
        api_key=langextract_config.api_key,
        extraction_passes=section_config.extraction_passes,
        max_workers=langextract_config.max_workers,
        max_char_buffer=section_config.max_char_buffer,
    )

    # Convert each extraction into a plain, JSON-serializable record.
    # `char_interval` is None when the extraction carries no interval.
    section_entities = [
        {
            "text": extraction.extraction_text,
            "type": extraction.extraction_class,
            "char_interval": (
                None
                if not extraction.char_interval
                else {
                    "start_pos": extraction.char_interval.start_pos,
                    "end_pos": extraction.char_interval.end_pos,
                }
            ),
            "section": section_data["title"],
        }
        for extraction in result.extractions
    ]

    # Spaces and slashes in the title are replaced to build the file name.
    title = section_data["title"].replace(" ", "_").replace("/", "_")
    section_path = output_dir / f"{arxiv_id}_{title}_entities.json"
    section_path.write_text(json.dumps(section_entities, indent=2))

    all_entities += section_entities

In [10]:
# Persist the entities from every section as a single JSON file.
all_entities_json = json.dumps(all_entities, indent=2)
all_entities_path = output_dir / f"{arxiv_id}_all_entities.json"
# write_text returns the number of characters written; that is the cell's output.
all_entities_path.write_text(all_entities_json)

58029

In [14]:
# Bucket the flat entity list by section title, keeping sections in
# first-seen order and entities in their original order within each section.
entities_by_section = {}
for entity in all_entities:
    entities_by_section.setdefault(entity["section"], []).append(entity)

In [15]:
# Relation classifier, constructed from the Gemini configuration object.
relation_extractor = RelationExtractor(gemini_config)

In [17]:
from spacy.tokens import Doc, Span

def find_entity_in_doc(entity_text: str, doc: "Doc", max_tokens: int = 9) -> "Span | None":
    """Locate an entity mention in ``doc`` by case-insensitive text matching.

    Sentences are scanned in order, and within each sentence every token
    window of up to ``max_tokens`` tokens is compared with the entity text.
    The first sentence that yields any match decides the result:

    1. a window whose text equals the entity text is returned; otherwise
    2. the shortest window whose text contains the entity text is returned
       (this covers mentions glued to other characters inside a token).

    Preferring the tightest window matters because callers read properties of
    the returned span (for example its root token); a window that starts
    before the mention would describe the wrong tokens.

    Args:
        entity_text: Surface form of the entity to look for. Leading and
            trailing whitespace is ignored.
        doc: Parsed document. Only ``doc.sents``, sentence slicing, ``len()``
            and ``.text`` are used.
        max_tokens: Largest window size, in tokens, that is considered.

    Returns:
        The matching span, or ``None`` when the entity text is blank or no
        window of any sentence contains it.
    """
    needle = entity_text.strip().lower()
    if not needle:
        # An empty string is a substring of everything; treat it as "no entity".
        return None

    for sent in doc.sents:
        # A window's text is a substring of its sentence's text, so a sentence
        # that does not contain the needle cannot contain a matching window.
        if needle not in sent.text.lower():
            continue

        n_tokens = len(sent)
        tightest = None
        for start in range(n_tokens):
            for end in range(start + 1, min(start + max_tokens, n_tokens) + 1):
                window = sent[start:end]
                window_text = window.text.lower()
                if window_text == needle:
                    return window
                if needle in window_text:
                    if tightest is None or len(window) < len(tightest):
                        tightest = window
                    # Wider windows from this start can only be looser.
                    break

        if tightest is not None:
            return tightest

    return None

In [20]:
from nlp.entity_pairs import VALID_TYPE_PAIRS
from nlp.syntactic import find_sdp, verbalize_path, parse

# Classify candidate entity pairs section by section and collect every pair
# whose predicted relation is not "NONE".
all_relations = []
MAX_PAIRS_PER_SECTION = 20      # cap on classifier calls per section
PAIR_WINDOW = 5                 # each entity is paired only with the next PAIR_WINDOW entities
FALLBACK_CONTEXT_CHARS = 500    # evidence length when the pair shares no sentence

for section_idx, (section_title, section_entities) in enumerate(entities_by_section.items()):
    if len(section_entities) < 2:
        continue

    print(f"\n[Section {section_idx+1}/{len(entities_by_section)}] {section_title}")
    print(f"  Entities: {len(section_entities)}")

    # The first section with this title supplies the text that is parsed.
    section_data = next(s for s in sections_data if s["title"] == section_title)
    doc = parse(section_data["clean_text"])

    # Each span lookup scans the parsed section, so resolve every entity text
    # at most once per section instead of once per candidate pair.
    span_by_text = {}

    pairs_processed = 0
    relations_found = 0

    for i, e1 in enumerate(section_entities):
        if pairs_processed >= MAX_PAIRS_PER_SECTION:
            break

        # Slicing clamps at the end of the list, so no explicit bound is needed.
        for e2 in section_entities[i + 1 : i + 1 + PAIR_WINDOW]:
            if pairs_processed >= MAX_PAIRS_PER_SECTION:
                break

            head_type, tail_type = e1["type"].upper(), e2["type"].upper()
            if (head_type, tail_type) not in VALID_TYPE_PAIRS:
                continue

            for ent in (e1, e2):
                if ent["text"] not in span_by_text:
                    span_by_text[ent["text"]] = find_entity_in_doc(ent["text"], doc)
            span1 = span_by_text[e1["text"]]
            span2 = span_by_text[e2["text"]]

            # Default evidence: the opening of the section, with no syntactic
            # pattern. This is replaced when both mentions share a sentence.
            syntax = "no pattern"
            sentence = section_data["clean_text"][:FALLBACK_CONTEXT_CHARS]

            if span1 and span2 and span1.sent == span2.sent:
                syntax = verbalize_path(span1.root, span2.root)
                sentence = span1.sent.text

            pair = {
                "head": {"text": e1["text"], "type": head_type},
                "tail": {"text": e2["text"], "type": tail_type},
                "sentence": sentence,
                "syntax": syntax
            }

            # NOTE(review): `_classify` is underscore-prefixed (private by
            # convention); confirm whether a public entry point should be used.
            result = relation_extractor._classify(pair)
            pairs_processed += 1

            print(f"  [{pairs_processed}/{MAX_PAIRS_PER_SECTION}] {e1['text'][:20]} -> {e2['text'][:20]}", end="")

            if result["relation"] != "NONE":
                relations_found += 1
                print(f" ✓ {result['relation']}")

                all_relations.append({
                    "head": e1["text"],
                    "head_type": e1["type"],
                    "tail": e2["text"],
                    "tail_type": e2["type"],
                    "relation": result["relation"],
                    "confidence": result["confidence"],
                    "section": section_title,
                    "evidence": sentence,
                    "syntax": syntax,
                    "reasoning": result.get("reasoning", "")
                })
            else:
                print(" ✗")

    print(f"  Found: {relations_found} relations")


[Section 1/17] Introduction
  Entities: 44
  [1/20] Recurrent neural net -> long short-term memo ✗
  [2/20] Recurrent neural net -> gated recurrent ✗
  [3/20] Recurrent neural net -> sequence modeling ✓ used_for
  [4/20] Recurrent neural net -> transduction problem ✓ used_for
  [5/20] Recurrent neural net -> language modeling ✓ used_for
  [6/20] long short-term memo -> gated recurrent ✓ compared_with
  [7/20] long short-term memo -> sequence modeling ✓ used_for
  [8/20] long short-term memo -> transduction problem ✓ used_for
  [9/20] long short-term memo -> language modeling ✓ used_for
  [10/20] long short-term memo -> machine translation ✓ used_for
  [11/20] gated recurrent -> sequence modeling ✓ used_for
  [12/20] gated recurrent -> transduction problem ✓ used_for
  [13/20] gated recurrent -> language modeling ✓ used_for
  [14/20] gated recurrent -> machine translation ✓ used_for
  [15/20] gated recurrent -> recurrent language m ✗
  [16/20] language modeling -> sequences ✓ applied_t

In [21]:
# Persist every collected relation as a single JSON file.
relations_json = json.dumps(all_relations, indent=2)
relations_path = output_dir / f"{arxiv_id}_relations.json"
# write_text returns the number of characters written; that is the cell's output.
relations_path.write_text(relations_json)

126532

In [22]:
# Display the full list of extracted entity records.
all_entities

[{'text': 'Recurrent neural networks',
  'type': 'method',
  'char_interval': {'start_pos': 0, 'end_pos': 25},
  'section': 'Introduction'},
 {'text': 'long short-term memory',
  'type': 'method',
  'char_interval': {'start_pos': 27, 'end_pos': 49},
  'section': 'Introduction'},
 {'text': 'gated recurrent',
  'type': 'method',
  'char_interval': {'start_pos': 58, 'end_pos': 73},
  'section': 'Introduction'},
 {'text': 'sequence modeling',
  'type': 'task',
  'char_interval': {'start_pos': 171, 'end_pos': 188},
  'section': 'Introduction'},
 {'text': 'transduction problems',
  'type': 'task',
  'char_interval': {'start_pos': 193, 'end_pos': 214},
  'section': 'Introduction'},
 {'text': 'language modeling',
  'type': 'task',
  'char_interval': {'start_pos': 223, 'end_pos': 240},
  'section': 'Introduction'},
 {'text': 'machine translation',
  'type': 'task',
  'char_interval': {'start_pos': 245, 'end_pos': 264},
  'section': 'Introduction'},
 {'text': 'recurrent language models',
  'type

In [23]:
# Display the full list of extracted relation records.
all_relations

[{'head': 'Recurrent neural networks',
  'head_type': 'method',
  'tail': 'sequence modeling',
  'tail_type': 'task',
  'relation': 'used_for',
  'confidence': 'HIGH',
  'section': 'Introduction',
  'evidence': 'Recurrent neural networks, long short-term memory[13] and gated recurrent[7] neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation[35,2,5].',
  'syntax': "via 'established'; using 'as'",
  'reasoning': 'The text states that Recurrent neural networks are established as approaches in sequence modeling, indicating that RNNs are used for the task of sequence modeling.'},
 {'head': 'Recurrent neural networks',
  'head_type': 'method',
  'tail': 'transduction problems',
  'tail_type': 'task',
  'relation': 'used_for',
  'confidence': 'HIGH',
  'section': 'Introduction',
  'evidence': 'Recurrent neural networks, long short-term memory[13] and gated recu