## Installing the Necessary Packages and Loading Data

In [3]:
%pip install bertopic
%pip install pandas

Collecting bertopicNote: you may need to restart the kernel to use updated packages.

  Using cached bertopic-0.16.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting numpy>=1.20.0 (from bertopic)
  Using cached numpy-1.26.4-cp312-cp312-win_amd64.whl.metadata (61 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Using cached hdbscan-0.8.36.tar.gz (6.1 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting umap-learn>=0.5.0 (from bertopic)
  Using cached umap_learn-0.5.6-py3-none-any.whl.metadata (21 kB)
Collecting pandas>=1.1.5 (from bertopic)
  Using cached pandas-2.2.2-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting scikit-learn>=0.22.2.post1 (from bertopic)
  Using cached scikit_learn-1.5.0-cp3

  error: subprocess-exited-with-error
  
  × Building wheel for hdbscan (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [28 lines of output]
      running bdist_wheel
      running build
      running build_py
      creating build
      creating build\lib.win-amd64-cpython-312
      creating build\lib.win-amd64-cpython-312\hdbscan
      copying hdbscan\flat.py -> build\lib.win-amd64-cpython-312\hdbscan
      copying hdbscan\hdbscan_.py -> build\lib.win-amd64-cpython-312\hdbscan
      copying hdbscan\plots.py -> build\lib.win-amd64-cpython-312\hdbscan
      copying hdbscan\prediction.py -> build\lib.win-amd64-cpython-312\hdbscan
      copying hdbscan\robust_single_linkage_.py -> build\lib.win-amd64-cpython-312\hdbscan
      copying hdbscan\validity.py -> build\lib.win-amd64-cpython-312\hdbscan
      copying hdbscan\__init__.py -> build\lib.win-amd64-cpython-312\hdbscan
      creating build\lib.win-amd64-cpython-312\hdbscan\tests
      copying hdbscan\tests\test_flat.py

Collecting pandas
  Using cached pandas-2.2.2-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.26.0 (from pandas)
  Using cached numpy-1.26.4-cp312-cp312-win_amd64.whl.metadata (61 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.2-cp312-cp312-win_amd64.whl (11.5 MB)
Using cached numpy-1.26.4-cp312-cp312-win_amd64.whl (15.5 MB)
Using cached pytz-2024.1-py2.py3-none-any.whl (505 kB)
Using cached tzdata-2024.1-py2.py3-none-any.whl (345 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed numpy-1.26.4 pandas-2.2.2 pytz-2024.1 tzdata-2024.1
Note: you may need to restart the kernel to use updated packages.




In [None]:
import pandas as pd
from bertopic import BERTopic

In [None]:
# Read the two tweet tables from CSV files located in the working directory:
# df_r is used below for the Russian tweets, df_w for the Western tweets.
df_r = pd.read_csv(filepath_or_buffer="df_r.csv")
df_w = pd.read_csv(filepath_or_buffer="df_w.csv")

## BERTopic Models

To analyze the month-over-month (MoM) evolution of the topics covered in both Russian and Western tweets, we will use both LDA and BERTopic and then compare the results. We will group tweets by month to obtain a representative overview that is not too computationally intensive (in terms of processing capacity and running time).

### BERTopic for Dynamic Topic Modelling - Russian Tweets

In [20]:
# BERTopic model for dynamic topic modelling - Russian tweets
# Extract the documents and their dates from df_r as plain Python lists (same row order).
tweets_text_r = df_r["cleaned_tokens"].tolist()
timestamps = df_r["date"].tolist()

# verbose=True makes BERTopic log its progress, giving an overview of how the model is running.
dtm_bert_r = BERTopic(verbose=True)
topics_r, probs_r = dtm_bert_r.fit_transform(tweets_text_r)

# Pair every document with its date to obtain the per-timestamp topic table used for plotting.
topics_over_time_r = dtm_bert_r.topics_over_time(docs=tweets_text_r, timestamps=timestamps)

2024-05-24 00:04:39,668 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/707 [00:00<?, ?it/s]

2024-05-24 00:29:46,832 - BERTopic - Embedding - Completed ✓
2024-05-24 00:29:46,834 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-24 00:30:08,543 - BERTopic - Dimensionality - Completed ✓
2024-05-24 00:30:08,558 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-24 00:30:14,950 - BERTopic - Cluster - Completed ✓
2024-05-24 00:30:15,044 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-24 00:30:16,779 - BERTopic - Representation - Completed ✓
76it [00:23,  3.26it/s]


In [21]:
# Plot how the topics of the Russian tweets evolve over time (limited to 10 topics).
dtm_bert_r.visualize_topics_over_time(topics_over_time=topics_over_time_r, top_n_topics=10)

In [None]:
# Helper that prints the leading topics of a fitted model with their words and weights.
def print_top_topics(model, num_topics = 12):
    """Print the words and weights of the first ``num_topics`` topics of ``model``.

    Parameters
    ----------
    model
        Object exposing ``get_topic_info()`` (a table with a ``'Topic'`` column)
        and ``get_topic(topic_id)`` (an iterable of ``(word, weight)`` pairs).
    num_topics
        Number of rows taken from the top of ``get_topic_info()``, in the order
        that table is returned. Defaults to 12.

    Returns
    -------
    None
        Everything is written to standard output: one ``Topic <id>:`` header per
        topic, one indented ``word: weight`` line per pair (weight with four
        decimals), then a blank line.

    Notes
    -----
    Every row of the table counts toward ``num_topics``.
    NOTE(review): BERTopic appears to list its outlier topic (id -1) in this
    table as well, so it would be printed too - confirm against the BERTopic docs.
    """
    leading_ids = model.get_topic_info()['Topic'].head(num_topics).tolist()

    for current_id in leading_ids:
        word_weights = model.get_topic(current_id)
        print(f"Topic {current_id}:")
        for term, score in word_weights:
            print(f"  {term}: {score:.4f}")
        print()

# Show the first 12 topic rows of the Russian-tweets model.
print_top_topics(model=dtm_bert_r, num_topics=12)

### BERTopic for Dynamic Topic Modelling - Western Tweets

In [None]:
# BERTopic model for dynamic topic modelling - Western tweets
# Extract the documents and their dates from df_w as plain Python lists (same row order).
tweets_text_w = df_w["cleaned_tokens"].tolist()
timestamps = df_w["date"].tolist()

# verbose=True makes BERTopic log its progress, giving an overview of how the model is running.
dtm_bert_w = BERTopic(verbose=True)
topics_w, probs_w = dtm_bert_w.fit_transform(tweets_text_w)

# Pair every document with its date to obtain the per-timestamp topic table used for plotting.
topics_over_time_w = dtm_bert_w.topics_over_time(docs=tweets_text_w, timestamps=timestamps)

In [None]:
# Plot how the topics of the Western tweets evolve over time (limited to 10 topics).
dtm_bert_w.visualize_topics_over_time(topics_over_time=topics_over_time_w, top_n_topics=10)

In [None]:
# Printing the leading topics of the Western-tweets model with their words and weights.
# print_top_topics is already defined in the Russian-tweets section of this notebook; it is
# reused here instead of being redefined, so a later edit to the helper cannot leave two
# diverging copies where the second silently shadows the first.
# num_topics = 12 takes the first 12 rows of the model's topic-info table.
print_top_topics(dtm_bert_w, num_topics = 12)