In [1]:
from bertopic import BERTopic
from helpers import read_jsonl, meta_path, texts_path, Publication, ROOT

In [2]:
def retrieve_texts(data: Publication, field: str = "body_text") -> dict:
    """Collapse one publication's sectioned text into a single string.

    Args:
        data: Publication record. It must provide a "paper_id" entry and an
            entry under the key named by ``field``.
        field: Key holding the sections to join, "body_text" (default) or
            "abstract". The value is iterated, and each section must expose
            a "text" entry.

    Returns:
        A dict with two keys: "id" (the record's "paper_id") and "text"
        (all section texts joined by single spaces; an empty string when
        the field holds no sections).
    """
    return {
        "id": data["paper_id"],
        "text": " ".join(section["text"] for section in data[field]),
    }

In [3]:
# Turn every publication read from the texts file into an {"id", "text"} record.
data = list(map(retrieve_texts, read_jsonl(texts_path)))

In [4]:
# Ids of all papers whose text was loaded, kept as a set for membership tests.
ids = set(record["id"] for record in data)

In [15]:
# Collect the publication year of every paper that has loaded text and a
# known year of 1991 or later.
metadata = []
for paper in read_jsonl(meta_path):
    # Papers without loaded text are irrelevant for the topic model.
    if paper["paper_id"] not in ids:
        continue
    # A missing year cannot be placed on the timeline.
    if paper["year"] is None:
        continue
    if paper["year"] >= 1991:
        metadata.append(dict(id=paper["paper_id"], year=paper["year"]))

In [16]:
# Number of papers that passed the id and year filters.
len(metadata)

9145

In [18]:
# Build both parallel lists from a single walk over `metadata`, so that
# texts[i] and timestamps[i] always describe the same paper. Filtering `data`
# on its own would yield the texts in `data` order, which need not match the
# order of `metadata`, and would silently pair documents with the wrong year.
# The lookup cannot miss: `metadata` only holds ids that were found in `ids`,
# which is built from `data`.
text_by_id = {entry["id"]: entry["text"] for entry in data}
timestamps = [paper["year"] for paper in metadata]
texts = [text_by_id[paper["id"]] for paper in metadata]

In [20]:
# Sanity check: there must be exactly one timestamp per document.
assert len(timestamps) == len(texts)

In [21]:
# Number of documents that will be handed to the topic model.
len(texts)

9145

In [23]:
# Create a BERTopic model with verbose logging and fit it on the document
# texts; `topics` and `probs` are the two values returned by fit_transform.
# NOTE(review): no random seed is configured here, so the topic assignments
# may differ between runs — confirm whether reproducibility is required.
topic_model = BERTopic(verbose=True)
topics, probs = topic_model.fit_transform(texts)

Batches:   0%|          | 0/286 [00:00<?, ?it/s]

2021-08-16 22:31:36,658 - BERTopic - Transformed documents to Embeddings
2021-08-16 22:31:50,574 - BERTopic - Reduced dimensionality with UMAP
2021-08-16 22:31:51,674 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [24]:
# Compute the per-topic representation over the publication years, with the
# timestamps grouped into 20 bins.
topics_over_time = topic_model.topics_over_time(
    texts,
    topics,
    timestamps,
    nr_bins=20,
)

10it [08:35, 51.50s/it]


In [39]:
# Persist the topics-over-time table under the project's processed-data
# folder, without writing the row index.
topics_over_time.to_csv(
    path_or_buf=ROOT / "data" / "processed" / "bertopic-dist.csv",
    index=False,
)