In [1]:
import sys
from pathlib import Path

# Make the repository root importable so the project packages below resolve.
# NOTE(review): this assumes the kernel's working directory sits one level
# below the project root (Path.cwd().parent) — confirm if the notebook moves.
project_root = Path.cwd().parent
project_root_str = str(project_root)

# Re-running this cell must not stack duplicate entries on sys.path, so the
# root is only inserted when it is not already the first search location.
if not sys.path or sys.path[0] != project_root_str:
    sys.path.insert(0, project_root_str)

print(f"Project root: {project_root}")
print(f"Python path modified: {project_root_str in sys.path}")

# Fail fast, with a readable message, when any project package is missing;
# the exception is re-raised so the notebook does not continue half-configured.
try:
    import client
    import config
    import models
    import parsers
    import nlp
    print("All project modules are now accessible")
except ModuleNotFoundError as e:
    print(f"Module import failed: {e}")
    raise

Project root: /Users/kat/Desktop/pinned/ginkgo
Python path modified: True
All project modules are now accessible


In [2]:
import sys
from pathlib import Path
import json
from collections import Counter

from client.arxiv import ArXivClient
from client.grobid import GROBIDClient
from models.grobid import Form, File
from parsers.tei import Parser
from config.llm import LangExtractConfig, GeminiConfig
from config.nlp import NLPConfig
from nlp.structural import SectionProcessor
from nlp.semantic import EntityExtractor
from nlp.syntactic import parse
from utils.clean_text import preprocess_section

from nlp.entity_filter import (
    normalize_text,
    should_merge,
    filter_pipeline,
    analyze_impact,
    nlp,
    FilterConfig
)

In [3]:
# arXiv identifier of the paper this notebook processes.
arxiv_id = "1810.04805"
# Relative directory that receives the files written by the following cells.
output_dir = Path("output")
# exist_ok=True lets the cell be re-run when the directory already exists.
output_dir.mkdir(exist_ok=True)

In [4]:
# Ask the arXiv client for the paper's metadata and for its PDF.
arxiv_client = ArXivClient()
# NOTE(review): `metadata` is not read by any other cell visible here.
metadata = arxiv_client.get_metadata(arxiv_id)
pdf_path = output_dir / f"{arxiv_id}.pdf"
# The destination is handed over as a plain string rather than a Path object.
arxiv_client.download_pdf(arxiv_id, str(pdf_path))

In [5]:
grobid_client = GROBIDClient()

# Load the PDF into memory as raw bytes.
pdf_bytes = Path(pdf_path).read_bytes()

# Wrap the bytes in an upload object, then describe the processing request.
pdf_upload = File(payload=pdf_bytes, file_name=f"{arxiv_id}.pdf")
form = Form(
    file=pdf_upload,
    consolidate_citations=1,
    consolidate_header=1,
    segment_sentences=True,
)

response = grobid_client.process_pdf(form)

# Keep the response body on disk next to the PDF. Because write_bytes is the
# cell's last expression, the number of bytes written is shown as its output.
tei_path = output_dir / f"{arxiv_id}.tei.xml"
tei_path.write_bytes(response.content)

122604

In [6]:
# Feed the response body (the same bytes that were written to the .tei.xml file)
# to the TEI parser.
parser = Parser(response.content)
# `article` is the parsed result; the next cell iterates over `article.sections`.
article = parser.parse()

In [7]:
# One dict per parsed section: its title, its raw text and its cleaned text.
sections_data = []
for section in article.sections:
    # Paragraph texts are separated by single spaces; outer whitespace is trimmed.
    raw_text = " ".join(
        paragraph.plain_text for paragraph in section.paragraphs
    ).strip()
    cleaned_text = preprocess_section(raw_text)
    sections_data.append(
        {
            "title": section.title,
            "raw_text": raw_text,
            "clean_text": cleaned_text,
        }
    )

In [8]:
# Create the configuration objects with their default constructor arguments.
langextract_config = LangExtractConfig()  # passed to EntityExtractor in later cells
gemini_config = GeminiConfig()  # NOTE(review): not referenced by any other cell visible here
nlp_config = NLPConfig()  # its `patterns` and `sections` drive the per-section extraction

In [None]:
from config.nlp import normalize_section

# Characters that are unsafe in a file name on at least one common platform
# (path separators, Windows-reserved punctuation, ASCII control characters),
# plus the space. Each is replaced by "_" when a section title is turned into
# part of a file name.
_UNSAFE_FILENAME_CHARS = str.maketrans(
    {ch: "_" for ch in ' /\\:*?"<>|' + "".join(map(chr, range(32)))}
)

extractor = EntityExtractor(langextract_config)
all_entities = []

for section_data in sections_data:
    # Choose the extraction settings for this section; titles without a
    # dedicated entry fall back to the "default" configuration.
    normalized_title = normalize_section(section_data["title"], nlp_config.patterns)
    section_config = nlp_config.sections.get(normalized_title, nlp_config.sections["default"])

    section_doc = nlp(section_data["clean_text"])

    section_entities = extractor.extract(section_data["clean_text"], section_config)

    entities_with_context = extractor.convert_to_spans(section_entities, section_doc, context_size=2)

    # Copy each entity, remove its "span" value before JSON serialization, and
    # record which section it came from.
    serializable_entities = []
    for entity in entities_with_context:
        entity_copy = dict(entity)
        del entity_copy["span"]
        entity_copy["section"] = section_data["title"]
        serializable_entities.append(entity_copy)

    # Build a file-system-safe name from the title. An empty or missing title
    # falls back to "untitled" instead of failing or yielding a nameless file.
    # NOTE(review): sections that share a title still write to the same path,
    # so the later one overwrites the earlier one — confirm whether that matters.
    title = (section_data["title"] or "untitled").translate(_UNSAFE_FILENAME_CHARS)
    section_path = output_dir / f"{arxiv_id}_{title}_entities.json"
    section_path.write_text(json.dumps(serializable_entities, indent=2))

    all_entities.extend(serializable_entities)

In [10]:
# Destinations of the two combined artifacts.
full_text_path = output_dir / f"{arxiv_id}_full_text.txt"
all_entities_path = output_dir / f"{arxiv_id}_all_entities.json"

# Cleaned section texts, concatenated with one blank line between sections.
# NOTE(review): write_text uses the platform default encoding here, as does
# the read_text call that loads this file again later — consider passing
# encoding="utf-8" in both places together.
cleaned_sections = [entry["clean_text"] for entry in sections_data]
full_clean_text = "\n\n".join(cleaned_sections)
full_text_path.write_text(full_clean_text)

# Every collected entity, serialized as indented JSON.
all_entities_json = json.dumps(all_entities, indent=2)
all_entities_path.write_text(all_entities_json)

print(f"Full text saved to: {full_text_path}")
print(f"All entities saved to: {all_entities_path}")

Full text saved to: output/1810.04805_full_text.txt
All entities saved to: output/1810.04805_all_entities.json


In [11]:
# Checkpoint: rebuild the run parameters and artifact paths, then reload the
# saved entities and full text from disk.
arxiv_id = "1810.04805"
output_dir = Path("output")

full_text_path = output_dir / f"{arxiv_id}_full_text.txt"
all_entities_path = output_dir / f"{arxiv_id}_all_entities.json"

print(f"Loading entities from: {all_entities_path}")
all_entities = json.loads(all_entities_path.read_text())

print(f"Loading and processing full text from: {full_text_path}")
full_text = full_text_path.read_text()
# Run the shared `nlp` pipeline over the whole text in one pass.
doc = nlp(full_text)
print("spaCy Doc object created.")

Loading entities from: output/1810.04805_all_entities.json
Loading and processing full text from: output/1810.04805_full_text.txt
spaCy Doc object created.


In [12]:
# Re-attach spans to the reloaded entities, this time against the full-text doc.
extractor = EntityExtractor(langextract_config)
entities_with_spans = extractor.convert_to_spans(all_entities, doc, context_size=2)

print(f"Total entities: {len(all_entities)}")
print(f"Entities with valid spans: {len(entities_with_spans)}")

# JSON-friendly copies: every key except "span" is kept, and the span's text
# is stored under "span_text" in its place.
serializable_entities = [
    {
        **{key: value for key, value in entity.items() if key != "span"},
        "span_text": entity["span"].text,
    }
    for entity in entities_with_spans
]

entities_with_spans_path = output_dir / f"{arxiv_id}_entities_with_context.json"
with entities_with_spans_path.open("w") as out_file:
    json.dump(serializable_entities, out_file, indent=2)

print(f"Entities with context saved to: {entities_with_spans_path}")

Total entities: 459
Entities with valid spans: 459
Entities with context saved to: output/1810.04805_entities_with_context.json


In [13]:
# Settings for the entity filter, collected in one place so they are easy to tune.
filter_settings = {
    "min_freq": 1,
    "exclude_other": True,
    "use_fuzzy": True,
    "top_k": 50,
    "window_size": 10,
    "pagerank_alpha": 0.85,
}
# NOTE(review): binding the name `config` here hides the `config` package that
# the first cell imports under the same name — consider a more specific name.
config = FilterConfig(**filter_settings)

print(f"\nRunning pipeline with config: window_size={config.window_size}")

# Filter the reloaded entities against the full-text doc.
filtered_entities = filter_pipeline(all_entities, doc, config=config)


Running pipeline with config: window_size=10


In [14]:
def print_results(entities: list[dict], top_n: int = 50):
    """Print a per-type tally of ``entities`` followed by its leading rows.

    Args:
        entities: Entity dicts. Each must provide "text" and "type"; a
            "pr_score" entry is optional and is shown as 0.0000 when absent.
        top_n: Maximum number of leading entries to list.

    Returns:
        None. Everything is written to stdout.
    """
    # Tally entity types in first-seen order.
    tally = Counter()
    for entity in entities:
        tally[entity["type"]] += 1
    print(f"Type distribution: {dict(tally)}\n")

    shown_count = min(top_n, len(entities))
    print(f"Top {shown_count} entities:")

    # One fixed-width row per entity: rank, text, type, score.
    rank = 1
    for entity in entities[:top_n]:
        score = entity.get("pr_score", 0)
        print(f"  {rank:2d}. {entity['text']:<40} {entity['type']:<8} {score:.4f}")
        rank += 1

In [15]:
# Print the type distribution and the leading entries of the filtered list
# (print_results' default top_n=50 applies).
print_results(filtered_entities)

Type distribution: {'method': 10, 'task': 15, 'dataset': 25}

Top 50 entities:
   1. BERT                                     method   0.0677
   2. fine-tuning                              task     0.0598
   3. Open AI GPT                              method   0.0465
   4. NER                                      method   0.0358
   5. question answering                       task     0.0291
   6. NLI                                      task     0.0270
   7. classification                           task     0.0248
   8. training data                            dataset  0.0230
   9. ELMo                                     method   0.0226
  10. MNLI                                     dataset  0.0200
  11. SQuAD                                    dataset  0.0177
  12. next sentence prediction                 task     0.0155
  13. MRPC                                     dataset  0.0149
  14. Masked LM                                method   0.0141
  15. named entity recognition         

In [16]:
# Compare the unfiltered entity list with the filtered one.
impact = analyze_impact(all_entities, filtered_entities)
# The summary reads three keys from the result: reduction_pct, original_count
# and filtered_count.
print(f"\nReduction: {impact['reduction_pct']:.1f}% ({impact['original_count']} -> {impact['filtered_count']})")


Reduction: 89.1% (459 -> 50)
