In [2]:
import sys
from pathlib import Path

# Make the project's top-level packages importable. The project root is taken
# to be the parent of the current working directory, so the notebook is
# expected to be started from a folder directly below the root.
project_root = Path.cwd().parent
project_root_str = str(project_root)

# Re-running this cell must not keep growing sys.path: drop any entries left
# over from earlier runs, then put the root at the front so it keeps import
# priority exactly as before.
while project_root_str in sys.path:
    sys.path.remove(project_root_str)
sys.path.insert(0, project_root_str)

print(f"Project root: {project_root}")
print(f"Python path modified: {str(project_root) in sys.path}")

# Smoke-test the imports now so a wrong working directory fails here, with a
# clear message, instead of somewhere in the middle of the pipeline.
try:
    import client
    import config
    import models
    import parsers
    import nlp
    print("All project modules are now accessible")
except ModuleNotFoundError as e:
    print(f"Module import failed: {e}")
    raise

Project root: /Users/kat/Desktop/pinned/ginkgo
Python path modified: True
All project modules are now accessible


In [4]:
import sys
from pathlib import Path
import json
from collections import Counter

from client.arxiv import ArXivClient
from client.grobid import GROBIDClient
from models.grobid import Form, File
from parsers.tei import Parser
from config.llm import LangExtractConfig, GeminiConfig
from config.nlp import NLPConfig
from nlp.structural import SectionProcessor
from nlp.semantic import SemanticExtractor
from nlp.syntactic import parse
from utils.clean_text import preprocess_section

from nlp.entity_filter import (
    normalize_text,
    should_merge,
    filter_pipeline,
    analyze_impact,
    nlp,
    FilterConfig
)

In [5]:
# arXiv identifier of the paper to process; it prefixes every file written below.
arxiv_id = "1810.04805"
# Downloaded and generated files all go under ./output (relative to the working directory).
output_dir = Path("output")
# exist_ok=True keeps this cell re-runnable once the directory exists.
output_dir.mkdir(exist_ok=True)

In [6]:
# Where the paper's PDF will be stored.
pdf_path = output_dir / f"{arxiv_id}.pdf"

# Fetch the paper's metadata first, then download the PDF to pdf_path
# (the client takes the destination as a plain string).
arxiv_client = ArXivClient()
metadata = arxiv_client.get_metadata(arxiv_id)
arxiv_client.download_pdf(arxiv_id, str(pdf_path))

In [7]:
grobid_client = GROBIDClient()

# Load the downloaded PDF into memory so it can be attached to the request.
pdf_bytes = pdf_path.read_bytes()

# Describe the upload; the meaning of the individual options is defined by
# models.grobid.Form.
form = Form(
    file=File(payload=pdf_bytes, file_name=f"{arxiv_id}.pdf"),
    consolidate_citations=1,
    consolidate_header=1,
    segment_sentences=True,
)
response = grobid_client.process_pdf(form)

# Keep the raw TEI XML on disk next to the PDF. write_bytes returns the number
# of bytes written, which is what the cell displays as its result.
tei_path = output_dir / f"{arxiv_id}.tei.xml"
tei_path.write_bytes(response.content)

122311

In [8]:
# Parse the TEI XML returned by GROBID (response.content) into an article object;
# the next cell iterates over article.sections.
parser = Parser(response.content)
article = parser.parse()

In [9]:
# Build one record per parsed section: its title, the paragraphs joined into a
# single space-separated string, and that same string after preprocessing.
sections_data = []
for section in article.sections:
    raw_text = " ".join(p.plain_text for p in section.paragraphs).strip()
    sections_data.append(
        {
            "title": section.title,
            "raw_text": raw_text,
            "clean_text": preprocess_section(raw_text),
        }
    )

In [10]:
# Instantiate the configuration objects with their default constructor arguments.
# The extraction loop reads model_id / api_key / max_workers from langextract_config
# and patterns / sections from nlp_config.
langextract_config = LangExtractConfig()
# gemini_config is not referenced by the cells shown in this part of the notebook.
gemini_config = GeminiConfig()
nlp_config = NLPConfig()

In [11]:
import langextract as lx
from llm.prompts.langextract import PROMPT, EXAMPLES
from config.nlp import normalize_section

# Run LangExtract over every section, save each section's entities to its own
# JSON file, and collect everything into all_entities.
all_entities = []

for section_data in sections_data:
    section_title = section_data["title"]

    # Choose the extraction settings for this section; titles that do not map
    # to a configured entry use the "default" one.
    normalized_title = normalize_section(section_title, nlp_config.patterns)
    section_config = nlp_config.sections.get(normalized_title, nlp_config.sections["default"])

    result = lx.extract(
        text_or_documents=section_data["clean_text"],
        prompt_description=PROMPT,
        examples=EXAMPLES,
        model_id=langextract_config.model_id,
        api_key=langextract_config.api_key,
        extraction_passes=section_config.extraction_passes,
        max_workers=langextract_config.max_workers,
        max_char_buffer=section_config.max_char_buffer,
    )

    # Flatten each extraction into a JSON-serialisable dict. The character span
    # is stored as None when the extraction has no (truthy) char_interval.
    section_entities = [
        {
            "text": extraction.extraction_text,
            "type": extraction.extraction_class,
            "char_interval": (
                None
                if not extraction.char_interval
                else {
                    "start_pos": extraction.char_interval.start_pos,
                    "end_pos": extraction.char_interval.end_pos,
                }
            ),
            "section": section_title,
        }
        for extraction in result.extractions
    ]

    # Spaces and slashes in the title are replaced so it can be used in a filename.
    # NOTE(review): the path depends only on arxiv_id and the title, so two
    # sections with the same title would write to the same file.
    title = section_title.replace(" ", "_").replace("/", "_")
    section_path = output_dir / f"{arxiv_id}_{title}_entities.json"
    section_path.write_text(json.dumps(section_entities, indent=2))

    all_entities.extend(section_entities)

In [12]:
# Save the concatenated clean text and the complete entity list; the next cell
# reloads both from these files.
full_text_path = output_dir / f"{arxiv_id}_full_text.txt"
all_entities_path = output_dir / f"{arxiv_id}_all_entities.json"

# Sections are separated by a blank line in the text file.
# NOTE(review): write_text uses the platform's default encoding here, and the
# later read_text does the same; consider passing encoding="utf-8" to both.
full_clean_text = "\n\n".join([entry["clean_text"] for entry in sections_data])
full_text_path.write_text(full_clean_text)

all_entities_json = json.dumps(all_entities, indent=2)
all_entities_path.write_text(all_entities_json)

print(f"Full text saved to: {full_text_path}")
print(f"All entities saved to: {all_entities_path}")

Full text saved to: output/1810.04805_full_text.txt
All entities saved to: output/1810.04805_all_entities.json


In [13]:
# Reload the saved artifacts from disk. The identifier and output folder are
# set again here, so this cell relies only on the saved files and the imports.
arxiv_id = "1810.04805"
output_dir = Path("output")

all_entities_path = output_dir / f"{arxiv_id}_all_entities.json"
full_text_path = output_dir / f"{arxiv_id}_full_text.txt"

print(f"Loading entities from: {all_entities_path}")
all_entities = json.loads(all_entities_path.read_text())

print(f"Loading and processing full text from: {full_text_path}")
full_text = full_text_path.read_text()
# `nlp` here is the callable imported from nlp.entity_filter (presumably a
# spaCy pipeline, going by the message printed below).
doc = nlp(full_text)
print("spaCy Doc object created.")

Loading entities from: output/1810.04805_all_entities.json
Loading and processing full text from: output/1810.04805_full_text.txt
spaCy Doc object created.


In [14]:
# Settings for the entity filtering pipeline, gathered in one place so they
# are easy to tweak between runs.
# NOTE(review): the name `config` is also bound to the project's `config`
# package by the first cell (`import config`); after this cell it refers to
# the FilterConfig instance instead. Consider a more specific name.
filter_settings = {
    "min_freq": 1,
    "exclude_other": True,
    "use_fuzzy": True,
    "top_k": 50,
    "window_size": 10,
    "pagerank_alpha": 0.85,
}
config = FilterConfig(**filter_settings)

print(f"\nRunning pipeline with config: window_size={config.window_size}")

# Filter the raw entity list against the processed document.
filtered_entities = filter_pipeline(all_entities, doc, config=config)


Running pipeline with config: window_size=10


In [15]:
def print_results(entities: list[dict], top_n: int = 50):
    """Print a per-type count and a numbered listing of the first entities.

    Args:
        entities: Entity dicts, printed in the order given (no sorting is
            done here). Each dict must have "text" and "type" keys;
            "pr_score" is optional and is shown as 0 when missing.
        top_n: Maximum number of entities to list. Negative values are
            treated as 0.

    Raises:
        KeyError: If an entity lacks "text" or "type".
    """
    # Without the clamp, a negative top_n would be read by the slice as
    # "everything except the last |top_n| items" and would also appear
    # verbatim in the header ("Top -1 entities:").
    top_n = max(top_n, 0)
    shown = entities[:top_n]

    # The distribution covers all entities, not just the ones listed below.
    type_dist = Counter(e["type"] for e in entities)
    print(f"Type distribution: {dict(type_dist)}\n")

    print(f"Top {len(shown)} entities:")
    for i, e in enumerate(shown, 1):
        score = e.get("pr_score", 0)
        print(f"  {i:2d}. {e['text']:<40} {e['type']:<8} {score:.4f}")

In [16]:
# Show the entities that survived filtering (print_results lists up to 50 by default).
print_results(filtered_entities)

Type distribution: {'method': 11, 'task': 21, 'dataset': 17, 'object': 1}

Top 50 entities:
   1. BERT                                     method   0.0691
   2. fine-tuning                              task     0.0556
   3. pre-training                             task     0.0398
   4. NER                                      method   0.0352
   5. NLI                                      task     0.0272
   6. question answering                       task     0.0253
   7. ELMo                                     method   0.0232
   8. GLUE                                     dataset  0.0210
   9. training data                            dataset  0.0199
  10. next sentence prediction                 task     0.0197
  11. MLM                                      method   0.0194
  12. MNLI                                     dataset  0.0158
  13. MRPC                                     task     0.0132
  14. RTL                                      method   0.0127
  15. entailment          

In [17]:
# Compare the unfiltered and filtered entity lists. The returned mapping is read
# for its "reduction_pct", "original_count" and "filtered_count" entries.
impact = analyze_impact(all_entities, filtered_entities)
print(f"\nReduction: {impact['reduction_pct']:.1f}% ({impact['original_count']} -> {impact['filtered_count']})")


Reduction: 92.1% (631 -> 50)
