In [1]:
import json
import sys
from collections import Counter
from pathlib import Path

if str(Path.cwd().parent) not in sys.path:
    sys.path.insert(0, str(Path.cwd().parent))

from client.arxiv import ArXivClient
from client.grobid import GROBIDClient
from models.grobid import Form, File
from parsers.tei import Parser
from utils.clean_text import preprocess_section
from nlp.semantic import EntityExtractor
from config.llm import LangExtractConfig
from config.nlp import NLPConfig, normalize_section
from nlp.syntactic import nlp

In [2]:
# arXiv identifier of the paper to process; later cells build their file names from it.
arxiv_id = "1810.04805"
# Directory (relative to the working directory) that receives every file this notebook writes.
output_dir = Path("output")
# exist_ok=True keeps a re-run from failing when the directory is already there.
output_dir.mkdir(exist_ok=True)

# Configuration objects constructed with no arguments; the entity-extraction
# cell passes them to EntityExtractor and normalize_section.
langextract_config = LangExtractConfig()
nlp_config = NLPConfig()

In [3]:
# Fetch the paper's metadata and store its PDF under output_dir.
arxiv_client = ArXivClient()
metadata = arxiv_client.get_metadata(arxiv_id)  # not read again in this cell

pdf_path = output_dir / f"{arxiv_id}.pdf"
# The client is handed the destination as a plain string rather than a Path.
arxiv_client.download_pdf(arxiv_id, str(pdf_path))

print(f"PDF downloaded to: {pdf_path}")
# Report the size on disk in mebibytes (bytes / 1024 / 1024).
pdf_size_mb = pdf_path.stat().st_size / 1024 / 1024
print(f"File size: {pdf_size_mb:.2f} MB")


PDF downloaded to: output/1810.04805.pdf
File size: 0.74 MB


In [5]:
# Send the downloaded PDF to GROBID and keep the returned TEI XML on disk.
grobid_client = GROBIDClient()
pdf_bytes = pdf_path.read_bytes()

# Wrap the raw bytes first, then describe the processing request around them.
pdf_file = File(payload=pdf_bytes, file_name=f"{arxiv_id}.pdf")
form = Form(
    file=pdf_file,
    consolidate_citations=1,
    consolidate_header=1,
    segment_sentences=True,
)

response = grobid_client.process_pdf(form)

# The response body is written unchanged next to the PDF.
# NOTE(review): nothing here checks whether the request succeeded before the
# body is saved — confirm process_pdf raises on failure.
tei_path = output_dir / f"{arxiv_id}.tei.xml"
tei_path.write_bytes(response.content)

print("GROBID processing complete")
print(f"TEI XML saved to: {tei_path}")


GROBID processing complete
TEI XML saved to: output/1810.04805.tei.xml


In [6]:
# Parse the TEI bytes returned by GROBID into an article object.
parser = Parser(response.content)
article = parser.parse()

# Show how many sections were found and preview the first five titles.
section_count = len(article.sections)
print(f"Sections parsed: {section_count}")
for preview in article.sections[:5]:
    print(f"  - {preview.title}")


Sections parsed: 27
  - Introduction
  - Related Work
  - Unsupervised Feature-based Approaches
  - Unsupervised Fine-tuning Approaches
  - Transfer Learning from Supervised Data


In [7]:
# Build one record per parsed section: its title, the space-joined paragraph
# text, and the same text after preprocess_section.
sections_data = []

for section in article.sections:
    # Paragraphs are joined with single spaces and the result is trimmed once.
    raw_text = " ".join(paragraph.plain_text for paragraph in section.paragraphs).strip()
    cleaned_text = preprocess_section(raw_text)

    sections_data.append(
        dict(title=section.title, raw_text=raw_text, clean_text=cleaned_text)
    )

print(f"Sections extracted: {len(sections_data)}")


Sections extracted: 27


In [8]:
# Run entity extraction section by section and collect JSON-friendly records.
extractor = EntityExtractor(langextract_config)
all_entities = []

for section_data in sections_data:
    title = section_data["title"]

    # Look up the settings stored under the normalized title, falling back to
    # the "default" entry when that key is not present.
    section_key = normalize_section(title, nlp_config.patterns)
    section_config = nlp_config.sections.get(section_key, nlp_config.sections["default"])

    cleaned_text = section_data["clean_text"]
    section_doc = nlp(cleaned_text)
    section_entities = extractor.extract(cleaned_text, section_config)
    entities_with_context = extractor.convert_to_spans(
        section_entities, section_doc, context_size=1
    )

    section_records = []
    for entity in entities_with_context:
        record = dict(entity)
        # Drop the "span" entry before the record is kept; like `del`, pop()
        # without a default raises KeyError if the key is missing.
        record.pop("span")
        record["section"] = title
        section_records.append(record)

    # Added only after the whole section has been processed.
    all_entities += section_records

print(f"Total entities extracted: {len(all_entities)}")


Total entities extracted: 459


In [9]:
# Write one JSON file per section containing the entities tagged with that
# section's title.

# Bucket the entities by their "section" value in a single pass, so the full
# entity list is not rescanned once for every section.
entities_by_section = {}
for entity in all_entities:
    entities_by_section.setdefault(entity.get("section"), []).append(entity)

for section_data in sections_data:
    # Only spaces and "/" are replaced when building the file name; any other
    # character in the title is used as-is.
    # NOTE(review): sections that share a title resolve to the same path, so
    # the later write replaces the earlier one (with identical content).
    section_title = section_data["title"].replace(" ", "_").replace("/", "_")
    section_path = output_dir / f"{arxiv_id}_{section_title}_entities.json"

    # A section without any matching entity still gets a file holding "[]".
    section_entities = entities_by_section.get(section_data["title"], [])
    section_path.write_text(json.dumps(section_entities, indent=2))

print("Section entity files saved")

Section entity files saved


In [10]:
# Concatenate the cleaned text of every section, separated by blank lines,
# and save it as a single text file.
section_texts = [record["clean_text"] for record in sections_data]
full_clean_text = "\n\n".join(section_texts)

full_text_path = output_dir / f"{arxiv_id}_full_text.txt"
# NOTE(review): no encoding is passed, so the platform default applies —
# confirm that is intended for non-ASCII text.
full_text_path.write_text(full_clean_text)

print(f"Full text saved to: {full_text_path}")
char_count = len(full_clean_text)
print(f"Character count: {char_count:,}")

Full text saved to: output/1810.04805_full_text.txt
Character count: 37,651


In [11]:
# Save every extracted entity to one JSON file, then summarize entity types.
all_entities_path = output_dir / f"{arxiv_id}_all_entities.json"
all_entities_path.write_text(json.dumps(all_entities, indent=2))

print(f"All entities saved to: {all_entities_path}")
print(f"Total entities: {len(all_entities)}")

# Tally entities per "type"; entities lacking that key are counted as "unknown".
type_counts = Counter(entity.get("type", "unknown") for entity in all_entities)

print("Entity type distribution:")
# most_common() yields the largest count first; equal counts keep the order in
# which the types were first seen, matching a stable sort on the negated count.
for entity_type, count in type_counts.most_common():
    print(f"  {entity_type}: {count}")


All entities saved to: output/1810.04805_all_entities.json
Total entities: 459
Entity type distribution:
  other: 200
  dataset: 88
  task: 80
  method: 76
  metric: 15


In [12]:
# Reload the saved entities and full text from disk and run the spaCy
# pipeline over the full text.
all_entities_path = output_dir / f"{arxiv_id}_all_entities.json"
full_text_path = output_dir / f"{arxiv_id}_full_text.txt"

# Both files are read as text with the platform's default encoding.
all_entities = json.loads(all_entities_path.read_text())
full_text = full_text_path.read_text()

doc = nlp(full_text)

print(f"Entities loaded: {len(all_entities)}")
print(f"Full text characters: {len(full_text):,}")
print(f"spaCy Doc tokens: {len(doc)}")


Entities loaded: 459
Full text characters: 37,651
spaCy Doc tokens: 7300
