In [1]:
from __future__ import annotations
from pydantic import BaseModel, Field

class AnnotationModel(BaseModel):
    """A single annotated span of a paper's text.

    ``start`` and ``end`` are used elsewhere in this notebook as slice bounds
    into the paper text, i.e. ``text[start:end]``.
    """

    # Slice bounds of the span; either may be None.
    start: int | None
    end: int | None

    # Undeclared keys are accepted as well: the section-building code attaches
    # a "text" entry to spans and reads an optional "attributes" entry.
    model_config = {"extra": "allow"}

class SectionModel(BaseModel):
    """One section of a paper, optionally containing nested subsections.

    Mirrors the section dicts built by ``assign_paragraphs_to_sections`` and
    nested by ``nest_sections``.
    """

    # Section number taken from the header annotation's attributes (e.g. "1",
    # "2.3"); None when the header carries no number.
    n: str | None

    # Span of the section header. None for the synthetic leading section that
    # collects paragraphs appearing before the first header, which the
    # pipeline emits with ``"header": None``.
    header: AnnotationModel | None

    # Child sections and the paragraphs that belong directly to this section.
    sections: list[SectionModel] = Field(default_factory=list)
    paragraphs: list[AnnotationModel] = Field(default_factory=list)

class PaperModel(BaseModel):
    """Schema for one paper record.

    NOTE(review): none of the cells visible here validate data against this
    model; ``clean_paper`` returns the plain dict it was given — confirm how
    the model is meant to be used.
    """

    # Identifiers of the paper.
    corpusid: int
    externalids: dict[str, int | str | None]
    
    # Raw content. Annotation offsets are applied to `text` as slice bounds,
    # and `annotations` maps an annotation type (e.g. "paragraph") to its spans.
    source: dict
    text: str
    annotations: dict[str, list[AnnotationModel]]

    # Derived fields. Only `sections` is filled in by the cleaning code visible
    # in this notebook; the others are presumably produced elsewhere — TODO confirm.
    title: str
    abstract: str
    sections: list[SectionModel] = Field(default_factory=list)
    references: list[str]
    figures: list[str]
    tables: list[str]

def get_text_span(span: dict, text: str | None = None, paper: dict | None = None) -> str:
    """Return the substring of a paper's text covered by ``span``.

    Args:
        span: Mapping with ``"start"`` and ``"end"`` slice bounds.
        text: Text to slice. Only used when ``paper`` is missing or empty.
        paper: Paper mapping whose ``"text"`` entry is sliced. Takes
            precedence over ``text`` whenever it is truthy.

    Returns:
        ``source[span["start"]:span["end"]]`` where ``source`` is the selected text.

    Raises:
        TypeError: If neither ``paper`` nor ``text`` supplies a text to slice.
        KeyError: If ``span`` lacks a bound or a truthy ``paper`` has no ``"text"``.
    """
    source = paper["text"] if paper else text
    if source is None:
        # Same exception type as the implicit failure on None, but with a
        # message that tells the caller what is actually missing.
        raise TypeError("get_text_span() needs either `text` or a `paper` with a 'text' entry")
    return source[span["start"]:span["end"]]

def get_text_spans(annotations: list[dict], text: str = None, paper: dict = None) -> list[str]:
    """Extract the text of every span in ``annotations``, preserving order.

    Each span is resolved with ``get_text_span``; ``text`` and ``paper`` are
    forwarded unchanged to every call.
    """
    span_texts = []
    for annotation in annotations:
        span_texts.append(get_text_span(annotation, text=text, paper=paper))
    return span_texts

In [2]:
import ast

# Annotation types whose spans are treated as non-section content (front
# matter, bibliography, figures, tables). remove_nonsection_content() reads
# paper["annotations"][key] for each of them and drops every section whose
# span touches one of those annotations.
NONSECTION_KEYS = [
    "title", "abstract",
    "publisher", "venue",
    "author", "authoraffiliation", "authorfirstname", "authorlastname",
    "bibauthor", "bibauthorfirstname", "bibauthorlastname", "bibentry", "bibtitle", "bibvenue",
    "figure", "figurecaption",
    "table",
]

def _parse_annotation_list(raw):
    """Return ``raw`` as a list of annotations, or None if it is not one."""
    if isinstance(raw, list):
        # already parsed, e.g. when the paper is sanitized a second time
        return raw
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # NOTE(review): JSON-only literals (null/true/false) are not valid
        # Python literals and end up here — consider json.loads if the data
        # contains them.
        return None
    return parsed if isinstance(parsed, list) else None


def _normalise_span(annotation):
    """Coerce ``start``/``end`` to int in place; return the pair, or None if unusable."""
    try:
        start, end = int(annotation["start"]), int(annotation["end"])
        annotation["start"], annotation["end"] = start, end
    except (KeyError, TypeError, ValueError):
        return None
    return start, end


def sanitize_annotations(annotations: dict[str, list[AnnotationModel]]) -> dict[str, list[AnnotationModel]]:
    """Parse, validate, de-duplicate and sort every annotation list in place.

    For each annotation type:

    * ``None`` becomes an empty list;
    * a string is parsed with ``ast.literal_eval``; values that do not parse
      to a list are left untouched (best effort);
    * ``start``/``end`` are converted to ``int``; entries without usable
      bounds are dropped;
    * only the first entry per ``(start, end)`` pair with ``start < end`` is
      kept, and the result is sorted by ``start``.

    Args:
        annotations: Mapping from annotation type to its raw value. It is
            modified in place.

    Returns:
        The same ``annotations`` mapping, for convenience.
    """
    # Only existing keys are reassigned, so mutating during iteration is safe.
    for key, raw in annotations.items():
        # ensure annotations is always a list
        if raw is None:
            annotations[key] = []
            continue

        parsed = _parse_annotation_list(raw)
        if parsed is None:
            continue

        # deduplicate and keep valid annotations only, i.e. where start < end
        cleaned = []
        seen_spans = set()
        for annotation in parsed:
            span = _normalise_span(annotation)
            if span is None or span in seen_spans or span[0] >= span[1]:
                continue
            seen_spans.add(span)
            cleaned.append(annotation)

        cleaned.sort(key=lambda annotation: annotation["start"])
        annotations[key] = cleaned

    return annotations

def assign_paragraphs_to_sections(paper: dict) -> list[SectionModel]:
    """Build a flat list of sections and attach each paragraph to its section.

    Sections come from ``paper["annotations"]["sectionheader"]`` and paragraphs
    from ``paper["annotations"]["paragraph"]``. Consecutive headers (or
    paragraphs) with identical text are collapsed into one. Spans are
    half-open slices of ``paper["text"]``, so a paragraph that starts exactly
    where a header ends belongs to that header, and a paragraph that ends
    exactly where the first header starts precedes it.

    Args:
        paper: Paper dict with ``"text"`` and parsed ``"annotations"``. The
            scan below assumes the headers are ordered by position.

    Returns:
        Section dicts with the keys ``n``, ``header``, ``sections`` and
        ``paragraphs`` (the shape of ``SectionModel``). If any paragraph
        precedes the first header, a leading section with ``header`` set to
        ``None`` holds those paragraphs.
    """
    text = paper["text"]

    # get important annotations for determining sectioning
    sectionheader_annotations = paper["annotations"]["sectionheader"]
    paragraph_annotations = paper["annotations"]["paragraph"]

    # prepare list of sections based on sectionheaders
    sections = []
    for ann in sectionheader_annotations:
        header_text = text[ann["start"]:ann["end"]]

        # if previous section has exact same header text, ignore
        if sections and sections[-1]["header"]["text"] == header_text:
            continue

        sections.append({
            # "attributes" may be absent or explicitly None
            "n": (ann.get("attributes") or {}).get("n"),
            "header": {
                "start": ann["start"],
                "end": ann["end"],
                "text": header_text,
            },
            "sections": [],
            "paragraphs": [],
        })

    # prepare list of paragraphs
    paragraphs = []
    for ann in paragraph_annotations:
        paragraph = ann.copy()
        paragraph["text"] = text[ann["start"]:ann["end"]]

        # if previous paragraph has exact same text, ignore
        if not paragraphs or paragraph["text"] != paragraphs[-1]["text"]:
            paragraphs.append(paragraph)

    # assign paragraphs to sections
    dummy_section = {
        "n": None,
        "header": None,
        "sections": [],
        "paragraphs": [],
    }
    for paragraph in paragraphs:
        # paragraphs that end at or before the first section header go to a
        # new dummy first section
        if not sections or paragraph["end"] <= sections[0]["header"]["start"]:
            dummy_section["paragraphs"].append(paragraph)
            continue

        # otherwise add the paragraph to the last section whose header ends
        # at or before the paragraph starts
        parent_section = None
        for section in sections:
            if section["header"]["end"] <= paragraph["start"]:
                parent_section = section
            else:
                break

        # a paragraph overlapping the first header has no parent and is dropped
        if parent_section is not None:
            parent_section["paragraphs"].append(paragraph)

    # establish dummy section as new first section, if it has paragraphs
    if dummy_section["paragraphs"]:
        sections.insert(0, dummy_section)

    return sections

def remove_nonsection_content(paper: dict, sections: list[SectionModel]) -> list[SectionModel]:
    """Drop sections whose span touches any non-section annotation.

    Non-section annotations are those listed under the ``NONSECTION_KEYS``
    types in ``paper["annotations"]``. A section spans from its header start
    (or its first paragraph if it has no header) to its last paragraph end
    (or its header end if it has no paragraphs).

    Args:
        paper: Paper dict with parsed ``"annotations"``. It is not modified.
        sections: Flat list of section dicts.

    Returns:
        A new list with the sections that do not touch non-section content,
        in their original order. Sections with neither header nor paragraphs
        have no span and are kept.
    """
    annotations = paper["annotations"]

    # Collect plain (start, end) pairs instead of the annotation dicts, so the
    # merge below cannot alter the spans stored in paper["annotations"].
    # Annotation types that are absent or None simply contribute nothing.
    spans = sorted(
        (ann["start"], ann["end"])
        for key in NONSECTION_KEYS
        for ann in (annotations.get(key) or [])
    )

    # merge overlapping/touching spans for efficiency in checking later on
    merged = []
    for start, end in spans:
        if merged and start <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    # remove sections that have nonsection content
    new_sections = []
    for section in sections:
        header, paragraphs = section["header"], section["paragraphs"]

        if header:
            start = header["start"]
        elif paragraphs:
            start = paragraphs[0]["start"]
        else:
            start = None

        if paragraphs:
            end = paragraphs[-1]["end"]
        elif header:
            end = header["end"]
        else:
            end = None

        if start is None or end is None:
            # nothing to compare against: keep the section rather than crash
            new_sections.append(section)
            continue

        # NOTE(review): the comparison is inclusive, so spans that merely touch
        # count as overlapping although spans are sliced half-open — confirm
        # this is intended.
        if not any(
            ann_start <= end and start <= ann_end
            for ann_start, ann_end in merged
        ):
            new_sections.append(section)

    return new_sections

def _is_section_number_prefix(parent_n: str, child_n: str) -> bool:
    """Return True if ``child_n`` extends ``parent_n`` at a numbering boundary.

    ``parent_n`` must be a proper prefix of ``child_n`` and must not end in the
    middle of a run of digits or letters: "1" is a parent of "1.2" but not of
    "10", and "I" is not a parent of "II".
    """
    if len(child_n) <= len(parent_n) or not child_n.startswith(parent_n):
        return False
    last_char, next_char = parent_n[-1], child_n[len(parent_n)]
    splits_token = (
        (last_char.isdigit() and next_char.isdigit())
        or (last_char.isalpha() and next_char.isalpha())
    )
    return not splits_token


def nest_sections(sections: list[SectionModel]) -> list[SectionModel]:
    """Turn a flat list of sections into a tree using their section numbers.

    A numbered section becomes a child of the closest preceding numbered
    section whose number it extends (e.g. "2.1" under "2"). Unnumbered
    sections are attached to the most recent numbered section, or stay at the
    top level if there is none yet. If no section is numbered at all, the
    input list is returned unchanged.

    Args:
        sections: Flat list of section dicts in document order. Children are
            appended to the ``"sections"`` list of their parent in place.

    Returns:
        The list of top-level sections.
    """
    # FUTUREWORK: nest based on IMRAD heuristics instead
    # FUTUREWORK: use LLM to perform nesting instead
    if not any(section["n"] for section in sections):
        return sections

    nested_sections = []

    # use section numbering; the stack holds the chain of currently open
    # numbered sections, outermost first
    stack = []
    for section in sections:
        current_n = section["n"]

        if not current_n:
            # unnumbered section: attach to most recent section in stack if
            # any, otherwise treat as top-level
            if stack:
                stack[-1]["sections"].append(section)
            else:
                nested_sections.append(section)
            continue

        # pop from stack until we find a parent whose n is a true prefix
        while stack:
            parent_n = stack[-1]["n"]
            if parent_n and _is_section_number_prefix(parent_n, current_n):
                break
            stack.pop()

        if stack:
            # current section is child of the section on top of the stack
            stack[-1]["sections"].append(section)
        else:
            # current section is new top-level section
            nested_sections.append(section)

        # add current section to stack
        stack.append(section)

    return nested_sections

def clean_paper(paper: dict) -> PaperModel:
    """Flatten a raw paper record and derive its nested section structure.

    The entries of ``paper["content"]`` are hoisted to the top level of
    ``paper`` (and ``"content"`` is removed), the annotations are sanitized,
    and the resulting sections are stored under ``paper["sections"]``.

    The input dict is modified in place and the same dict object is returned.
    """
    # reorganise some of the keys, for easier access
    paper.update(paper["content"].items())
    del paper["content"]

    paper["annotations"] = sanitize_annotations(paper["annotations"])

    # flat sections -> drop non-section content -> nest by numbering
    flat_sections = assign_paragraphs_to_sections(paper)
    body_sections = remove_nonsection_content(paper, flat_sections)
    paper["sections"] = nest_sections(body_sections)

    return paper

In [None]:
# assign citations to paragraphs
# NOTE(review): `paper` is whatever the loading cell left behind (the last
# cleaned paper); that cell appears further down in this notebook, so this
# cell does not run on a fresh kernel until the cells are reordered.
citation_annotations = paper["annotations"]["bibref"]
paragraph_annotations = paper["annotations"]["paragraph"]

# prepare list of paragraphs
paragraphs = []
for ann in paragraph_annotations:
    temp = ann.copy()
    temp["text"] = paper["text"][ann["start"]:ann["end"]]

    # if previous paragraph has exact same text, ignore
    if not paragraphs or temp["text"] != paragraphs[-1]["text"]:
        paragraphs.append(temp)

# Attach to every paragraph copy the citation markers lying fully inside its
# span. (The previous loop here was a copy of the section-assignment code and
# referenced `sections` / `dummy_section`, which only exist as locals of
# assign_paragraphs_to_sections.)
for paragraph in paragraphs:
    paragraph["citations"] = [
        {**citation, "text": paper["text"][citation["start"]:citation["end"]]}
        for citation in citation_annotations
        if paragraph["start"] <= citation["start"] and citation["end"] <= paragraph["end"]
    ]

# show the raw citation markers found in the paper
for ann in citation_annotations:
    print(paper["text"][ann["start"]:ann["end"]])

[1]
[1]
[2]
[3]
[4]
[5]


In [6]:
# Inspect the first entry of the "sections" list of `paper`, i.e. the paper
# most recently produced by the loading cell.
paper["sections"][0]

{'n': '1',
 'header': {'start': 1640, 'end': 1652, 'text': 'Introduction'},
 'sections': [],
 'paragraphs': [{'end': 2340,
   'start': 1654,
   'text': "With the continuous decline of environmental quality and the continuous improvement of people's living standards, people's requirements for outdoor thermal environment are also getting higher and higher. The main meteorological parameters that affect the thermal comfort of the human body in the outdoor environment are temperature, wind speed, humidity, and solar radiation. At the same time, compared with the more stable indoor thermal environment, these outdoor meteorological environments have a wider range of changes and more severe fluctuations [1]. Therefore, the outdoor thermal environment will have a more severe impact on the people's thermal sensation and thermal comfort."},
  {'end': 3368,
   'start': 2342,
   'text': "In fact, domestic scholars and abroad have done a lot of research on thermal comfort. Research by Nikolopoulou 

In [3]:
import glob
import json

# Limits for this exploratory run; raise them to process more data.
MAX_FILES = 1
MAX_PAPERS = 100

# glob returns paths in arbitrary order; sort so the same file(s) are picked
# on every run and on every machine.
filepaths = sorted(glob.glob("data/2*"))

# Each input file is read as JSON Lines: one raw paper record per line.
papers = []
for filepath in filepaths[:MAX_FILES]:
    # JSON is UTF-8; do not depend on the platform's default encoding
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue

            raw_paper = json.loads(line)
            paper = clean_paper(raw_paper)
            papers.append(paper)

            if len(papers) >= MAX_PAPERS:
                break

    if len(papers) >= MAX_PAPERS:
        break