In [1]:
from bertopic import BERTopic
from helpers import read_jsonl, meta_path, texts_path, Publication, ROOT

In [2]:
def retrieve_texts(data: Publication, field: str = "body_text") -> dict:
    """Flatten one section-based field of a publication into a single string.

    Args:
        data: Publication record. It must provide "paper_id" and the requested
            ``field``; the field is iterated as a sequence of sections that
            each carry a "text" entry.
        field: Name of the section list to flatten, "body_text" (default)
            or "abstract".

    Returns:
        A dict ``{"id": <paper_id>, "text": <joined text>}`` where the text is
        the sections' "text" values joined by single spaces. An empty section
        list yields an empty string.

    Raises:
        KeyError: For mapping-like records, if "paper_id", ``field`` or a
            section's "text" entry is missing.
    """
    return {
        "id": data["paper_id"],
        "text": " ".join(section["text"] for section in data[field]),
    }

In [3]:
# Turn every record of the full-text dump into an {"id", "text"} entry
# (retrieve_texts flattens "body_text" by default).
data = list(map(retrieve_texts, read_jsonl(texts_path)))

In [4]:
# IDs of all papers for which a text entry was built above.
ids = set(entry["id"] for entry in data)

In [5]:
# Collect {"id", "year"} for every metadata record that has a matching text
# entry and a known publication year of 1991 or later.
metadata = []
for paper in read_jsonl(meta_path):
    # Skip papers without a parsed text.
    if paper["paper_id"] not in ids:
        continue
    # Skip papers with a missing year or a year before the 1991 cut-off.
    if paper["year"] is None or not paper["year"] >= 1991:
        continue
    metadata.append({"id": paper["paper_id"], "year": paper["year"]})

In [6]:
# Build `texts` and `timestamps` from the SAME sequence of papers so that
# position i in both lists refers to one paper. Previously `timestamps`
# followed the order of the metadata file while `texts` followed the order of
# the text file, so the lists had equal length but were not guaranteed to be
# aligned; topics_over_time is called with both lists in parallel.
# The lookup dict is built once instead of rebuilding an ID set for every
# entry in `data`.
year_by_id = {paper["id"]: paper["year"] for paper in metadata}
selected = [paper for paper in data if paper["id"] in year_by_id]

texts = [paper["text"] for paper in selected]
timestamps = [year_by_id[paper["id"]] for paper in selected]

In [7]:
# Sanity check: there must be exactly one timestamp per text.
assert len(timestamps) == len(texts)

# Topic modeling (TM)

In [8]:
# Configure BERTopic (verbose logging, low-memory mode, no probability
# calculation) and fit it on the selected texts.
topic_model = BERTopic(
    calculate_probabilities=False,
    low_memory=True,
    verbose=True,
)
topics, probs = topic_model.fit_transform(texts)

Batches:   0%|          | 0/286 [00:00<?, ?it/s]

2021-08-18 17:29:05,818 - BERTopic - Transformed documents to Embeddings
2021-08-18 17:29:52,577 - BERTopic - Reduced dimensionality with UMAP
2021-08-18 17:29:53,765 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [9]:
# Topic overview table (Topic, Count, Name); preview the first five rows.
freq = topic_model.get_topic_info()
freq.head(5)

Unnamed: 0,Topic,Count,Name
0,-1,3460,-1_we_were_can_social
1,0,317,0_media_news_internet_information
2,1,289,1_eu_european_union_eus
3,2,268,2_migration_migrants_immigration_immigrants
4,3,266,3_law_court_legal_courts


In [10]:
# Display the topic overview table.
freq

Unnamed: 0,Topic,Count,Name
0,-1,3460,-1_we_were_can_social
1,0,317,0_media_news_internet_information
2,1,289,1_eu_european_union_eus
3,2,268,2_migration_migrants_immigration_immigrants
4,3,266,3_law_court_legal_courts
...,...,...,...
124,123,11,123_attack_cyberspace_cyberwar_attacks
125,124,11,124_regulatory_regulators_legitimacy_regulation
126,125,10,125_europeanization_europe_european_hungarian
127,126,10,126_chinese_students_socialist_xi


In [11]:
# Persist the fitted topic model to disk.
model_path = "../models/210818-BERT.model"
topic_model.save(model_path)

In [12]:
# Export the static topic frequencies. The path is anchored at ROOT, like the
# topics-over-time export in this notebook, instead of a "../" string that only
# resolves correctly when the kernel is started from one particular directory.
freq.to_csv(ROOT / "data" / "processed" / "210818-bert-freq-static.csv", index=False)

# Dynamic topic modeling (DTM)

In [13]:
# Compute the per-timestamp topic representations; texts, topics and
# timestamps are passed as parallel lists.
topics_over_time = topic_model.topics_over_time(
    docs=texts,
    topics=topics,
    timestamps=timestamps,
)

30it [14:02, 28.09s/it]


In [14]:
# The over-time table must contain as many distinct topics as the static overview.
n_topics_dtm = topics_over_time["Topic"].nunique(dropna=False)
n_topics_static = freq["Topic"].nunique(dropna=False)
assert n_topics_dtm == n_topics_static

In [15]:
# Display the topics-over-time table (Topic, Words, Frequency, Timestamp).
topics_over_time

Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"gongos, medical, euthanasia, autonomy, ngos, g...",7,1991
1,0,"civic, technologies, citizen, digital, kenya, ...",1,1991
2,3,"law, jurisdiction, laws, court, rule, legal, j...",3,1991
3,4,"welfare, governance, labours, citizenship, gov...",1,1991
4,6,"china, chinas, chinese, economic, li, governme...",1,1991
...,...,...,...,...
1795,103,"school, schools, teachers, society, teachingle...",1,2020
1796,110,"patent, patents, estonia, estonian, smes, inno...",1,2020
1797,115,"chavez, ecuador, venezuela, bolivia, morales, ...",1,2020
1798,117,"humanities, university, students, interdiscipl...",1,2020


# Assigning weights

In [18]:
# weight_per_topic: for each row, the frequency as a percentage of the total
# frequency of the same topic across all timestamps.
# transform("sum") returns the group total aligned to the original row index,
# so the assignment does not depend on how groupby.apply shapes its result
# (which differs between pandas versions).
topic_totals = topics_over_time.groupby("Topic")["Frequency"].transform("sum")
topics_over_time["weight_per_topic"] = 100 * topics_over_time["Frequency"] / topic_totals

# weight_per_timestamp: for each row, the frequency as a percentage of the
# total frequency of all topics within the same timestamp.
timestamp_totals = topics_over_time.groupby("Timestamp")["Frequency"].transform("sum")
topics_over_time["weight_per_timestamp"] = 100 * topics_over_time["Frequency"] / timestamp_totals

In [21]:
# Export the weighted topics-over-time table without the row index.
dtm_path = ROOT / "data" / "processed" / "210818-bert-freq-dtm.csv"
topics_over_time.to_csv(dtm_path, index=False)