In [76]:
import ujson as json
from multiprocessing import Pool
from tqdm import tqdm

import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

from s2ag_parser.schemas import *
from s2ag_parser.utils import *

In [None]:
# --- Notebook-wide configuration -------------------------------------------
# Seaborn style used by plots drawn after this cell.
sns.set_style("whitegrid")

# Input corpus: a JSONL file that is read line by line, one JSON-encoded paper per line.
papers_path = "data/extracted/papers.jsonl"

# NOTE(review): presumably a sample size; `n_papers` is reassigned further down
# to len(all_paper_statistics), so this value is not used by the cells shown here.
n_papers = 5000

In [None]:
def get_statistics(line: str) -> dict:
    """Summarise one paper record as a flat dict of counts and text lengths.

    Args:
        line: A single JSON-encoded paper (one line of the papers JSONL file).

    Returns:
        A dict holding the paper's ``corpusid`` together with:
        section counts (top-level and flattened), the paragraph count, the
        number of in-text references per type (bibliography, figure, table,
        formula), the number of bibliography entries, figures, tables and
        formulas, and the character lengths of body text, title and abstract.
    """
    paper = json.loads(line)

    sections_flat = get_sections_flat(paper)
    paragraphs = get_paragraphs_flat(paper)

    # Collect the type of every in-text reference that is one of the counted
    # types AND carries a truthy ``ref_id``. ``ref_id`` is only looked at once
    # the type matched, mirroring a short-circuiting ``type == X and ref_id``.
    counted_ref_types = ("bibref", "figureref", "tableref", "formularef")
    resolved_ref_types = [
        ref["ref_type"]
        for paragraph in paragraphs
        for ref in paragraph["refs"]
        if ref["ref_type"] in counted_ref_types and ref["ref_id"]
    ]

    # Bibliography entries count as "matched" when their ``corpusid`` is an int.
    bibliography = paper["bibliography"]
    n_matched_bib_items = sum(
        isinstance(entry.get("corpusid"), int) for entry in bibliography.values()
    )

    # Title and abstract contribute length 0 unless they are stored as dicts.
    title = paper["title"]
    abstract = paper["abstract"]

    return {
        "corpusid": paper["corpusid"],
        "n_top_level_sections": len(paper["sections"]),
        "n_sections_flat": len(sections_flat),
        "n_paragraphs": len(paragraphs),
        "n_bibrefs": resolved_ref_types.count("bibref"),
        "n_figurerefs": resolved_ref_types.count("figureref"),
        "n_tablerefs": resolved_ref_types.count("tableref"),
        "n_formula_refs": resolved_ref_types.count("formularef"),
        "n_bib_items": len(bibliography),
        "n_matched_bib_items": n_matched_bib_items,
        "n_figures": len(paper["figures"]),
        "n_tables": len(paper["tables"]),
        "n_formulas": len(paper["formulas"]),
        "text_len": sum(len(paragraph["text"]) for paragraph in paragraphs),
        "title_len": len(title["text"]) if isinstance(title, dict) else 0,
        "abstract_len": len(abstract["text"]) if isinstance(abstract, dict) else 0,
    }

# Destination of the per-paper statistics (one JSON object per line); the
# loading cell below reads this file back.
out_path = "data/paper_statistics.jsonl"

# The export walks the whole papers file with a process pool, so it is opt-in:
# set `recompute_statistics = True` to rebuild `out_path` from `papers_path`.
recompute_statistics = False
statistics_pool_size = 20            # worker processes used by the export
statistics_expected_lines = 15496406  # only feeds tqdm's progress-bar total

if recompute_statistics:
    with open(papers_path, "r") as f, open(out_path, "w") as f_out, Pool(statistics_pool_size) as p:
        # imap_unordered yields results as workers finish, so the output order
        # is not guaranteed to match the input order.
        for stats in p.imap_unordered(
            get_statistics, tqdm(f, total=statistics_expected_lines), chunksize=1000
        ):
            print(json.dumps(stats), file=f_out, flush=True)

In [72]:
# Read the exported statistics back into memory: one dict per JSONL line,
# with tqdm reporting progress over the file's lines.
all_paper_statistics = []
with open(out_path, "r") as f:
    all_paper_statistics.extend(json.loads(record) for record in tqdm(f))

15496406it [00:27, 571212.33it/s]


In [92]:
# Number of papers with statistics. This overwrites the `n_papers` value set in
# the configuration cell and serves as the denominator for the percentages
# printed by the following cells.
n_papers = len(all_paper_statistics)
print(f"# Papers: {n_papers}")

# Papers: 15496406


In [98]:
# How many papers have more than N top-level sections, for a few thresholds N.
temp = np.array([stats["n_top_level_sections"] for stats in all_paper_statistics])
for n_sections in [0, 5, 10, 100]:
    # Compare the whole array at once instead of looping over its elements
    # in Python; int() keeps `n` a plain Python integer for formatting.
    n = int(np.count_nonzero(temp > n_sections))
    print(f"# Papers with >{n_sections:3} top level sections: {n:8} ({n/n_papers:6.2%})")

# Papers with >  0 top level sections: 15077212 (97.29%)
# Papers with >  5 top level sections: 10384488 (67.01%)
# Papers with > 10 top level sections:  5522843 (35.64%)
# Papers with >100 top level sections:     7546 ( 0.05%)


In [99]:
# How many papers have more than N sections once the section tree is flattened.
temp = np.array([stats["n_sections_flat"] for stats in all_paper_statistics])
for n_sections in [0, 5, 10, 100]:
    # Compare the whole array at once instead of looping over its elements
    # in Python; int() keeps `n` a plain Python integer for formatting.
    n = int(np.count_nonzero(temp > n_sections))
    print(f"# Papers with >{n_sections:3} sections (flat): {n:8} ({n/n_papers:6.2%})")

# Papers with >  0 sections (flat): 15077212 (97.29%)
# Papers with >  5 sections (flat): 11544882 (74.50%)
# Papers with > 10 sections (flat):  7669514 (49.49%)
# Papers with >100 sections (flat):     9666 ( 0.06%)


In [100]:
# How many papers have more than N paragraphs, for a few thresholds N.
temp = np.array([stats["n_paragraphs"] for stats in all_paper_statistics])
for n_paragraphs in [0, 5, 10, 100]:
    # Compare the whole array at once instead of looping over its elements
    # in Python; int() keeps `n` a plain Python integer for formatting.
    n = int(np.count_nonzero(temp > n_paragraphs))
    print(f"# Papers with >{n_paragraphs:3} paragraphs: {n:8} ({n/n_papers:6.2%})")

# Papers with >  0 paragraphs: 15055696 (97.16%)
# Papers with >  5 paragraphs: 14303013 (92.30%)
# Papers with > 10 paragraphs: 13321738 (85.97%)
# Papers with >100 paragraphs:   878277 ( 5.67%)
