In [None]:
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path.
# NOTE(review): this assumes the kernel is started one level below the project root — confirm.
project_root = Path.cwd().parent
project_root_entry = str(project_root)

# Only prepend when the root is not already the first entry, so re-running this
# cell does not stack duplicate entries while the root still keeps top priority.
if sys.path[:1] != [project_root_entry]:
    sys.path.insert(0, project_root_entry)

print(f"Project root: {project_root}")
print(f"Python path modified: {project_root_entry in sys.path}")

# Import each top-level project package once so a wrong path fails right here;
# the error is printed and then re-raised so the cell still stops.
try:
    import client
    import config
    import models
    import parsers
    import nlp
    print("All project modules are now accessible")
except ModuleNotFoundError as e:
    print(f"Module import failed: {e}")
    raise

In [2]:
import sys
from pathlib import Path
import json
from collections import Counter

from client.arxiv import ArXivClient
from client.grobid import GROBIDClient
from models.grobid import Form, File
from parsers.tei import Parser
from config.llm import LangExtractConfig, GeminiConfig
from config.nlp import NLPConfig
from nlp.structural import SectionProcessor
from nlp.semantic import SemanticExtractor
from nlp.syntactic import parse
from utils.clean_text import preprocess_section

from nlp.entity_filter import (
    normalize_text,
    should_merge,
    filter_pipeline,
    analyze_impact,
    nlp,
    FilterConfig
)

In [None]:
# arXiv identifier of the paper to process, and the directory that receives every artifact.
arxiv_id = "1706.03762"
# Relative path: it resolves against the kernel's current working directory.
output_dir = Path("output")
# exist_ok=True lets the cell be re-run when the directory already exists.
output_dir.mkdir(exist_ok=True)

In [None]:
# Fetch the paper's metadata, then download its PDF into the output directory.
arxiv_client = ArXivClient()
# `metadata` is not read again by the cells visible in this part of the notebook.
metadata = arxiv_client.get_metadata(arxiv_id)
pdf_path = output_dir / f"{arxiv_id}.pdf"
# The destination is handed over as a plain string rather than a Path object.
arxiv_client.download_pdf(arxiv_id, str(pdf_path))

In [None]:
# Send the downloaded PDF to GROBID and keep the returned TEI document on disk.
grobid_client = GROBIDClient()
pdf_bytes = pdf_path.read_bytes()

# Request citation and header consolidation plus sentence segmentation.
form = Form(
    file=File(
        payload=pdf_bytes,
        file_name=f"{arxiv_id}.pdf",
    ),
    consolidate_citations=1,
    consolidate_header=1,
    segment_sentences=True,
)
response = grobid_client.process_pdf(form)

# NOTE(review): the response body is written without checking a status here;
# presumably process_pdf raises on failure — confirm.
tei_path = output_dir / f"{arxiv_id}.tei.xml"
tei_path.write_bytes(response.content)

In [None]:
# Parse the GROBID response body into an `article` object; its `sections`
# are what the following cell iterates over.
parser = Parser(response.content)
article = parser.parse()

In [None]:
# Flatten every parsed section into one raw string plus a cleaned copy.
sections_data = []
for section in article.sections:
    # Join the paragraphs in a single pass instead of growing a string with `+=`;
    # after stripping, the result is the same text the concatenation produced.
    section_text = " ".join(
        paragraph.plain_text for paragraph in section.paragraphs
    ).strip()

    clean_text = preprocess_section(section_text)
    sections_data.append({
        "title": section.title,
        "raw_text": section_text,
        "clean_text": clean_text,
    })

In [None]:
# Build the configuration objects with no arguments, i.e. with whatever defaults they define.
langextract_config = LangExtractConfig()
# `gemini_config` is not referenced by the cells visible in this part of the notebook.
gemini_config = GeminiConfig()
nlp_config = NLPConfig()

In [None]:
import langextract as lx
from llm.prompts.langextract import PROMPT, EXAMPLES
from config.nlp import normalize_section

# Run LangExtract on each section's clean text, write one JSON file per
# section, and accumulate every extracted entity in `all_entities`.
all_entities = []

for section_data in sections_data:
    # Choose the extraction settings for this section, falling back to the
    # "default" entry when the normalized title has no entry of its own.
    normalized_title = normalize_section(section_data["title"], nlp_config.patterns)
    section_config = nlp_config.sections.get(
        normalized_title,
        nlp_config.sections["default"],
    )

    result = lx.extract(
        text_or_documents=section_data["clean_text"],
        prompt_description=PROMPT,
        examples=EXAMPLES,
        model_id=langextract_config.model_id,
        api_key=langextract_config.api_key,
        extraction_passes=section_config.extraction_passes,
        max_workers=langextract_config.max_workers,
        max_char_buffer=section_config.max_char_buffer,
    )

    # Convert each extraction to a plain dict; the character span is kept
    # only when the extraction carries one, otherwise it is stored as None.
    section_entities = [
        {
            "text": extraction.extraction_text,
            "type": extraction.extraction_class,
            "char_interval": (
                None
                if not extraction.char_interval
                else {
                    "start_pos": extraction.char_interval.start_pos,
                    "end_pos": extraction.char_interval.end_pos,
                }
            ),
            "section": section_data["title"],
        }
        for extraction in result.extractions
    ]

    # Spaces and slashes in the title are replaced so it can be used in a file name.
    title = section_data["title"].replace(" ", "_").replace("/", "_")
    section_path = output_dir / f"{arxiv_id}_{title}_entities.json"
    section_path.write_text(json.dumps(section_entities, indent=2))

    all_entities.extend(section_entities)

In [None]:
# Persist the two artifacts the analysis cells load back from disk:
# the concatenated clean text and the merged entity list.
full_text_path = output_dir / f"{arxiv_id}_full_text.txt"
all_entities_path = output_dir / f"{arxiv_id}_all_entities.json"

# Sections are separated by a blank line in the combined text.
# NOTE(review): write_text is called without an explicit encoding, so the
# platform default is used for this file.
full_clean_text = "\n\n".join(
    section["clean_text"] for section in sections_data
)
full_text_path.write_text(full_clean_text)

all_entities_json = json.dumps(all_entities, indent=2)
all_entities_path.write_text(all_entities_json)

print(f"Full text saved to: {full_text_path}")
print(f"All entities saved to: {all_entities_path}")

In [3]:
def print_results(entities: list[dict], top_n: int = 20):
    """Print a per-type count followed by the leading entries of `entities`.

    Args:
        entities: Entity dicts; each needs "text" and "type" keys. A
            "pr_score" key is optional and is shown as 0 when missing.
        top_n: Number of entities to list, taken from the front of the
            list in the order given.
    """
    counts_by_type = dict(Counter(entity["type"] for entity in entities))
    print(f"Type distribution: {counts_by_type}\n")

    shown_count = min(top_n, len(entities))
    print(f"Top {shown_count} entities:")

    rank = 0
    for entity in entities[:top_n]:
        rank += 1
        score = entity.get("pr_score", 0)
        print(f"  {rank:2d}. {entity['text']:<40} {entity['type']:<8} {score:.4f}")

def validate_results(entities: list[dict]) -> bool:
    """Check the filtered entities against a fixed list of expectations.

    Two dataset names must be present and four generic terms must be
    absent, matched exactly against each entity's "text" value. A summary
    line is printed, followed by one "FAIL" line per unmet expectation.

    Args:
        entities: Entity dicts; each needs a "text" key.

    Returns:
        True when every expectation holds, otherwise False.
    """
    seen_texts = {entity["text"] for entity in entities}

    # (entity text, whether it must be present, label printed on failure)
    expectations = [
        ("WMT 2014 English-German dataset", True, "German dataset preserved"),
        ("WMT 2014 English-French dataset", True, "French datasets preserved"),
        ("steps", False, "single-word infra filtered"),
        ("sequences", False, "plural infra (lemma) filtered"),
        ("hidden states", False, "infra phrases filtered"),
        ("the model", False, "generic patterns filtered"),
    ]

    failed_labels = [
        label
        for text, must_be_present, label in expectations
        if (text in seen_texts) != must_be_present
    ]

    total = len(expectations)
    print(f"\nValidation: {total - len(failed_labels)}/{total} passed")

    for label in failed_labels:
        print(f"  FAIL: {label}")

    return not failed_labels

def test_normalize():
    print("Running test_normalize...")
    cases = [
        ("Scaled Dot-Product Attention", "scaled dot product attention"),
        ("machine translations", "machine translation"),
        ("Recurrent neural networks", "recurrent neural network"),
    ]
    
    passed = 0
    for original, expected in cases:
        result = normalize_text(original)
        if result == expected:
            passed += 1
        else:
            print(f"  FAIL: '{original}' -> '{result}' (expected '{expected}')")
    
    print(f"normalize_text: {passed}/{len(cases)} passed")

def test_fuzzy_matching():
    print("Running test_fuzzy_matching...")
    cases = [
        ("Recurrent neural networks", "recurrent network", "method", True),
        ("machine translation", "neural machine translation", "method", True),
        ("WMT 2014 English-German dataset", "WMT 2014 English-French dataset", "dataset", False),
        ("input sequence", "output sequences", "object", False),
    ]
    
    passed = 0
    for text1, text2, etype, expected in cases:
        entity1 = {"text": text1, "type": etype}
        entity2 = {"text": text2, "type": etype}
        result = should_merge(entity1, entity2)
        
        if result == expected:
            passed += 1
        else:
            print(f"  FAIL: '{text1}' vs '{text2}' -> {result} (expected {expected})")
    
    print(f"should_merge: {passed}/{len(cases)} passed")

In [4]:
# Run the two lightweight checks; each prints its own pass count and lists
# any failing case. Neither raises on failure, so read the printed counts.
print("Running tests...")
test_normalize()
test_fuzzy_matching()

Running tests...
Running test_normalize...
normalize_text: 3/3 passed
Running test_fuzzy_matching...
should_merge: 4/4 passed


In [5]:
# Reload the saved entity list and full text from the output directory.
# The identifier and directory are declared again here, presumably so this
# cell can run without the extraction cells — confirm.
arxiv_id = "1706.03762"
output_dir = Path("output")

all_entities_path = output_dir / f"{arxiv_id}_all_entities.json"
full_text_path = output_dir / f"{arxiv_id}_full_text.txt"

print(f"Loading entities from: {all_entities_path}")
all_entities = json.loads(all_entities_path.read_text())

print(f"Loading and processing full text from: {full_text_path}")
full_text = full_text_path.read_text()
# `nlp` is the callable imported from nlp.entity_filter; it is applied to the
# whole text once and the result is reused by the filter pipeline.
doc = nlp(full_text)
print("spaCy Doc object created.")

Loading entities from: output/1706.03762_all_entities.json
Loading and processing full text from: output/1706.03762_full_text.txt
spaCy Doc object created.


In [6]:
# Settings for the entity filter, collected in one place for easy tweaking.
filter_settings = dict(
    min_freq=1,
    exclude_other=True,
    use_fuzzy=True,
    top_k=50,
    window_size=10,
    pagerank_alpha=0.85,
)

# NOTE(review): this rebinds the name `config`, which the setup cell bound to
# the `config` package; from here on `config` is the FilterConfig instance.
config = FilterConfig(**filter_settings)

print(f"\nRunning pipeline with config: window_size={config.window_size}")

filtered_entities = filter_pipeline(all_entities, doc, config=config)


Running pipeline with config: window_size=10


In [7]:
# Show the leading filtered entities, then run the fixed sanity checks.
print_results(filtered_entities)
# The boolean returned by validate_results is not used here; failures are only printed.
validate_results(filtered_entities)

# Report how much the filter shrank the entity list; the result is read
# through the three keys used in the message below.
impact = analyze_impact(all_entities, filtered_entities)
print(f"\nReduction: {impact['reduction_pct']:.1f}% ({impact['original_count']} -> {impact['filtered_count']})")

Type distribution: {'method': 25, 'task': 15, 'metric': 3, 'dataset': 2, 'object': 2}

Top 20 entities:
   1. Transformer                              method   0.0781
   2. self-attention                           method   0.0592
   3. convolution                              method   0.0417
   4. language modeling                        task     0.0413
   5. attention mechanisms                     method   0.0364
   6. Multi-Head Attention                     method   0.0345
   7. transduction models                      task     0.0329
   8. RNs                                      method   0.0324
   9. encoder-decoder architectures            method   0.0308
  10. machine translation                      task     0.0266
  11. model architecture                       method   0.0257
  12. sequence transduction                    task     0.0244
  13. Recurrent neural networks                method   0.0233
  14. long short-term memory                   method   0.0233
  15. gated re