In [1]:
from bertopic import BERTopic
from helpers import read_jsonl, meta_path, texts_path, Publication, ROOT

In [2]:
def retrieve_texts(data: Publication, field: str = "body_text") -> dict:
    """Flatten one sectioned field of a publication into a single string.

    Args:
        data: Publication record; must provide ``"paper_id"`` and ``field``.
        field: Key of the sectioned field to flatten, e.g. ``"body_text"``
            (the default) or ``"abstract"``. Its value is iterated and every
            item must provide a ``"text"`` entry.

    Returns:
        A dict with two keys: ``"id"`` (the paper id) and ``"text"`` (all
        section texts joined by single spaces; ``""`` when the field holds
        no sections).

    Raises:
        KeyError: If ``"paper_id"``, ``field`` or a section's ``"text"`` is
            missing (for dict-like records).
    """
    return {
        "id": data["paper_id"],
        # Sections are concatenated in their stored order.
        "text": " ".join(section["text"] for section in data[field]),
    }

In [3]:
# One {"id", "text"} record per publication in the texts file
# (retrieve_texts uses its default field, "body_text").
data = list(map(retrieve_texts, read_jsonl(texts_path)))

In [4]:
# Ids of every paper we have text for; a set gives cheap membership tests below.
ids = set(record["id"] for record in data)

In [5]:
# Keep only metadata rows that (a) belong to a paper we have text for,
# (b) carry a publication year, and (c) were published in 1991 or later.
metadata = [
    {"id": paper["paper_id"], "year": paper["year"]}
    for paper in read_jsonl(meta_path)
    if paper["paper_id"] in ids
    and paper["year"] is not None
    and paper["year"] >= 1991
]

In [6]:
# Build texts and timestamps in ONE pass over `data` so they stay index-aligned
# (texts[i] belongs to timestamps[i]). Taking the years in metadata order and
# the texts in `data` order only lines up if both files list the papers in the
# same order, which a length check alone cannot detect.
# The id -> year mapping is built once instead of once per paper.
year_by_id = {paper["id"]: paper["year"] for paper in metadata}
dated_papers = [paper for paper in data if paper["id"] in year_by_id]

texts = [paper["text"] for paper in dated_papers]
timestamps = [year_by_id[paper["id"]] for paper in dated_papers]

In [7]:
# Sanity check: there must be exactly one timestamp per text.
assert len(timestamps) == len(texts)

# Static topic model (TM)

In [8]:
# Model settings gathered in one place so they are easy to review and tweak.
bertopic_settings = dict(
    top_n_words=15,
    low_memory=True,
    calculate_probabilities=False,
    verbose=True,  # emits the progress/log lines shown below the cell
)
topic_model = BERTopic(**bertopic_settings)

# Fit on the full corpus. `topics` is reused for the dynamic topic model later;
# `probs` is not used again in this notebook.
topics, probs = topic_model.fit_transform(texts)

Batches:   0%|          | 0/286 [00:00<?, ?it/s]

2021-08-19 11:44:21,409 - BERTopic - Transformed documents to Embeddings
2021-08-19 11:45:08,845 - BERTopic - Reduced dimensionality with UMAP
2021-08-19 11:45:10,088 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [9]:
# Topic overview table (columns: Topic, Count, Name); preview the first rows.
freq = topic_model.get_topic_info()
freq.head(5)

Unnamed: 0,Topic,Count,Name
0,-1,3514,-1_we_they_these_were
1,0,308,0_media_news_internet_information
2,1,287,1_eu_european_union_eus
3,2,268,2_migration_migrants_immigration_immigrants
4,3,255,3_law_court_legal_courts


In [10]:
# Show the complete topic table (the rendered output elides the middle rows).
freq

Unnamed: 0,Topic,Count,Name
0,-1,3514,-1_we_they_these_were
1,0,308,0_media_news_internet_information
2,1,287,1_eu_european_union_eus
3,2,268,2_migration_migrants_immigration_immigrants
4,3,255,3_law_court_legal_courts
...,...,...,...
125,124,11,124_fdi_investment_israeli_israel
126,125,11,125_latin_banks_regionalism_bank
127,126,11,126_china_asian_chinese_asia
128,127,11,127_indigenous_cree_peoples_slaves


In [11]:
# Anchor the output path at ROOT (as the DTM export at the end of the notebook
# does) so the model lands in the same place regardless of the kernel's
# working directory. str() keeps the argument a plain string, as before.
# NOTE(review): assumes ROOT is the parent of the notebook directory, i.e.
# ROOT / "models" is the same folder as "../models" — confirm.
topic_model.save(str(ROOT / "models" / "210818-BERT.model"))

In [12]:
# Write the static topic table into ROOT/data/processed, built from ROOT the
# same way as the dynamic-topic CSV export, instead of a path relative to the
# kernel's working directory.
# NOTE(review): assumes ROOT / "data" / "processed" is the same folder as
# "../data/processed" — confirm.
freq.to_csv(ROOT / "data" / "processed" / "210818-bert-freq-static.csv", index=False)

# Dynamic topic model (DTM): topics over time

In [13]:
# Topic representations per publication year.
# Arguments are passed by keyword on purpose: BERTopic releases differ in the
# positional order of `topics` and `timestamps`, and a positional call would
# silently swap them on a release that uses the other order.
topics_over_time = topic_model.topics_over_time(
    docs=texts,
    topics=topics,
    timestamps=timestamps,
)

30it [14:05, 28.18s/it]


In [14]:
# Every topic of the static model must show up in at least one year.
# dropna=False makes nunique() count exactly what len(Series.unique()) counts.
assert (
    topics_over_time["Topic"].nunique(dropna=False)
    == freq["Topic"].nunique(dropna=False)
)

In [15]:
# Show the per-year topic table (columns: Topic, Words, Frequency, Timestamp).
topics_over_time

Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"gongos, medical, euthanasia, autonomy, ngos, a...",7,1991
1,0,"civic, technologies, users, citizen, digital, ...",1,1991
2,3,"law, hague, jurisdiction, laws, court, rule, l...",3,1991
3,6,"environmental, china, chinas, chinese, economi...",1,1991
4,11,"serb, military, croatia, war, yugoslavia, bosn...",2,1991
...,...,...,...,...
1765,102,"financial, banking, banks, growth, sector, eco...",1,2020
1766,104,"patent, patents, estonia, estonian, smes, inno...",1,2020
1767,117,"humanities, uu, students, university, interdis...",1,2020
1768,120,"piracy, danish, denmark, maritime, pirates, so...",1,2020


# Assigning weights

In [16]:
# Both weights are percentages (0-100) of a group total. transform("sum")
# returns the group total aligned to the original row index, so the result can
# be assigned straight back as a column. groupby(...).apply(lambda ...) returns
# a group-keyed index on newer pandas and then fails on assignment.

# weight_per_topic: share of a topic's overall frequency that falls in this year
topic_totals = topics_over_time.groupby("Topic")["Frequency"].transform("sum")
topics_over_time["weight_per_topic"] = 100 * topics_over_time["Frequency"] / topic_totals

# weight_per_timestamp: share of a year's overall frequency that belongs to this topic
timestamp_totals = topics_over_time.groupby("Timestamp")["Frequency"].transform("sum")
topics_over_time["weight_per_timestamp"] = 100 * topics_over_time["Frequency"] / timestamp_totals

In [17]:
# Persist the per-year topic table, including the two weight columns.
dtm_csv_path = ROOT / "data" / "processed" / "210818-bert-freq-dtm.csv"
topics_over_time.to_csv(dtm_csv_path, index=False)