In [1]:
from bertopic import BERTopic
from helpers import read_jsonl, meta_path, texts_path, Publication, ROOT

In [2]:
def retrieve_texts(data: Publication, field: str = "body_text") -> dict:
    """Flatten one sectioned field of a publication into a single string.

    Args:
        data: Publication record. It must provide "paper_id" and the requested
            ``field``; the field is iterated and each item is indexed by "text".
        field: Name of the field to flatten, e.g. "body_text" or "abstract".

    Returns:
        A dict with two keys: "id" (the value of ``data["paper_id"]``) and
        "text" (the section texts joined with single spaces; an empty string
        when the field has no sections).

    Raises:
        KeyError: If "paper_id", ``field`` or a section's "text" is missing.
    """
    return {
        "id": data["paper_id"],
        "text": " ".join(section["text"] for section in data[field])
    }

In [3]:
# One {"id", "text"} record per paper, using the default "body_text" field.
data = list(map(retrieve_texts, read_jsonl(texts_path)))

In [4]:
# Ids of all papers for which a text record was loaded.
ids = set(record["id"] for record in data)

In [5]:
# Keep only metadata rows that have a loaded text, a known year, and a year
# of 1991 or later.
metadata = []
for paper in read_jsonl(meta_path):
    if paper["paper_id"] not in ids:
        continue  # no text was loaded for this paper
    if paper["year"] is None or not paper["year"] >= 1991:
        continue  # unknown year, or published before the cut-off
    metadata.append(dict(id=paper["paper_id"], year=paper["year"]))

In [6]:
# Number of metadata rows that passed the id / year filter above.
len(metadata)

9145

In [7]:
# Derive both lists from a single pass over `data` so that texts[i] and
# timestamps[i] always belong to the same paper. Building them from two
# independently ordered sources (`metadata` for years, `data` for texts) only
# lines up if both files happen to list papers in the same order.
# The id -> year lookup is built once instead of once per element of `data`.
year_by_id = {paper["id"]: paper["year"] for paper in metadata}
selected = [paper for paper in data if paper["id"] in year_by_id]
texts = [paper["text"] for paper in selected]
timestamps = [year_by_id[paper["id"]] for paper in selected]

In [8]:
# Sanity check: one timestamp per text. This compares lengths only; it does
# not verify that texts[i] and timestamps[i] refer to the same paper.
assert len(texts) == len(timestamps)

# Topic modelling (TM)

In [11]:
# Fit a BERTopic model on the filtered texts. Per-document topic
# probabilities are not calculated (calculate_probabilities=False).
# NOTE(review): no random seed is passed anywhere here; presumably the topic
# assignments can differ between runs — confirm before comparing outputs.
topic_model = BERTopic(
    calculate_probabilities=False,
    low_memory=True,
    verbose=True,
)
topics, probs = topic_model.fit_transform(texts)

Batches:   0%|          | 0/286 [00:00<?, ?it/s]

2021-08-18 14:32:57,181 - BERTopic - Transformed documents to Embeddings
2021-08-18 14:33:42,231 - BERTopic - Reduced dimensionality with UMAP
2021-08-18 14:33:43,440 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [12]:
# Per-topic summary table (columns Topic, Count, Name); preview the first rows.
freq = topic_model.get_topic_info()
freq.head(5)

Unnamed: 0,Topic,Count,Name
0,-1,3729,-1_we_were_can_social
1,0,321,0_law_court_legal_courts
2,1,292,1_eu_european_union_eus
3,2,266,2_migration_migrants_immigration_immigrants
4,3,189,3_african_africa_government_political


In [14]:
# Show the whole topic table (the notebook elides the middle rows on display).
freq

Unnamed: 0,Topic,Count,Name
0,-1,3729,-1_we_were_can_social
1,0,321,0_law_court_legal_courts
2,1,292,1_eu_european_union_eus
3,2,266,2_migration_migrants_immigration_immigrants
4,3,189,3_african_africa_government_political
...,...,...,...
125,124,12,124_institutional_institutions_organizations_o...
126,125,12,125_un_civilizations_nations_international
127,126,11,126_piracy_maritime_pirates_ship
128,127,10,127_regulatory_regulators_regulation_rules


In [23]:
# Anchor the model path on the project ROOT so the save location does not
# depend on the directory the kernel was started from.
# NOTE(review): assumes "../models" and ROOT / "models" are the same
# directory — confirm against the definition of ROOT in helpers.
topic_model.save(str(ROOT / "models" / "210818-BERT.model"))

In [24]:
# Write the static topic table next to the dynamic one, resolving the path
# from the project ROOT instead of the current working directory.
freq.to_csv(ROOT / "data" / "processed" / "210818-bert-freq-static.csv", index=False)

# Dynamic topic modelling (DTM)

In [27]:
# Topic representations per timestamp (publication year). Arguments are passed
# by keyword so the call does not depend on the positional order of `topics`
# and `timestamps` in the installed BERTopic version.
topics_over_time = topic_model.topics_over_time(
    docs=texts,
    topics=topics,
    timestamps=timestamps,
)

30it [14:16, 28.54s/it]


In [28]:
# The dynamic table must contain as many distinct topics as the static one.
n_topics_dynamic = len(topics_over_time["Topic"].unique())
n_topics_static = len(freq["Topic"].unique())
assert n_topics_dynamic == n_topics_static

In [29]:
# Display the per-timestamp topic table (columns Topic, Words, Frequency,
# Timestamp).
topics_over_time

Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"gongos, euthanasia, autonomy, ngos, agents",7,1991
1,0,"law, jurisdiction, laws, court, rule",3,1991
2,4,"china, chinas, chinese, economic, li",1,1991
3,11,"women, gender, womens, islamic, female",1,1991
4,13,"civic, technologies, users, citizen, technology",1,1991
...,...,...,...,...
1798,105,"patent, patents, estonia, estonian, smes",1,2020
1799,110,"financial, banking, banks, economic, european",1,2020
1800,111,"director, company, directors, dsc, companys",1,2020
1801,126,"piracy, danish, denmark, maritime, pirates",1,2020


In [30]:
# Export the per-timestamp topic table to the processed-data folder.
dtm_csv_path = ROOT / "data" / "processed" / "210818-bert-freq-dtm.csv"
topics_over_time.to_csv(dtm_csv_path, index=False)