In [3]:
# !pip install topmost pyyaml gensim ipywidgets widgetsnbextension pandas-profiling bokeh
!pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu121
# CUDA ver 12.1
# cuDNN ver 9.0.0

Looking in indexes: https://download.pytorch.org/whl/cu121


In [4]:
import random

import gensim
import numpy as np
import pandas as pd
import topmost
import torch
from bokeh.plotting import figure, show, output_notebook
from sklearn.manifold import TSNE

from ECRTMhandler import DataHandler, Preprocessings

output_notebook()


In [5]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs on machines without CUDA.
# With multiple GPUs, a specific one can be selected by its numeric index.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which
# looks like an index that was written into the CSV; consider passing
# index_col=0 once this is confirmed for both files.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

df_train

Unnamed: 0,Title,Excerpt,Category
0,"Uefa Opens Proceedings against Barcelona, Juve...",Uefa has opened disciplinary proceedings again...,sports
1,Amazon Blames Inflation as It Increases Cost o...,The increases are steeper than the 17 percent ...,business
2,Nigeria’s Parliament Passes Amended Electoral ...,Nigeria's Senate on Tuesday passed the harmoni...,politics
3,Nigeria: Lagos Governor Tests Positive for Cov...,"The Lagos State Governor, Mr. Babajide Sanwo-O...",health
4,South Africa Calls For Calm as Electoral Refor...,South Africa has raised concerns about the det...,politics
...,...,...,...
4681,Uganda Drops Charges Against Nigerian Singers ...,Nigerian singers Omah Lay and Tems are set to ...,entertainment
4682,NNPC Cuts Directors’ Fees by 60.2%,The Nigerian National Petroleum Corporation (N...,business
4683,Nigeria’s External Reserves Hit $37bn with $3....,Nigeria’s foreign reserves will get a timely b...,business
4684,Transcorp Hilton Wins Africa’s Leading Busines...,Transcorp Hilton Abuja has been honoured with ...,business


In [6]:
# Distinct category labels that occur in the training set.
df_train.loc[:, "Category"].unique()

array(['sports', 'business', 'politics', 'health', 'tech',
       'entertainment'], dtype=object)

In [7]:
# preprocess raw data
stop_words_4_topmost = list(gensim.parsing.preprocessing.STOPWORDS)
preprocessing = Preprocessings(stopwords=stop_words_4_topmost)
parsed_texts, bow_matrix, vocab, word_embeddings = preprocessing.parse(texts=df_train["Excerpt"].to_list())
print(parsed_texts)

Found training documents 4686


===>parse texts: 100%|██████████| 4686/4686 [00:00<00:00, 9783.08it/s]


Real vocab size: 10026
===>convert to matrix...


===>making word embeddings: 100%|██████████| 10026/10026 [00:07<00:00, 1339.18it/s]

===> number of found embeddings: 9237/10026





In [17]:
########################### Neural Topic Models ####################################
# Seed every random number generator in play so the training run can be
# repeated (best effort: some GPU kernels may still be non-deterministic).
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# dataset for neural topic models
handler = DataHandler(bow_matrix, vocab=vocab, device=device)

# create a model: ECRTM with 40 topics, given the pretrained word embeddings,
# then move it to the selected device
model = topmost.models.ECRTM(vocab_size=handler.vocab_size, num_topics=40, pretrained_WE=word_embeddings)
model = model.to(device)

# create a trainer: 50 epochs, reporting the losses every 10 epochs
trainer = topmost.trainers.BasicTrainer(model, epochs=50, dataset_handler=handler, log_interval=10, learning_rate=0.0003)

# train the model
trainer.train()

cache clear
===>train_size:  3748
===>test_size:  938
===>vocab_size:  10026
===>average length: 11.678


 20%|██        | 10/50 [00:08<00:28,  1.38it/s]

Epoch: 010 loss: 158.413 loss_TM: 110.268 loss_ECR: 48.145


 40%|████      | 20/50 [00:15<00:21,  1.42it/s]

Epoch: 020 loss: 122.326 loss_TM: 105.302 loss_ECR: 17.024


 60%|██████    | 30/50 [00:22<00:14,  1.41it/s]

Epoch: 030 loss: 111.900 loss_TM: 104.299 loss_ECR: 7.601


 80%|████████  | 40/50 [00:30<00:07,  1.42it/s]

Epoch: 040 loss: 107.604 loss_TM: 103.193 loss_ECR: 4.412


100%|██████████| 50/50 [00:36<00:00,  1.35it/s]

Epoch: 050 loss: 105.593 loss_TM: 102.439 loss_ECR: 3.154





In [18]:
########################### Display top 15 words ####################################
# Ask the trainer for the 15 top words of each topic and keep the result;
# the call also prints one "Topic N: ..." line per topic.
top15_words = trainer.export_top_words(
    num_top_words=15,
)

Topic 0: case ambassador confirmed old woman ebola bishop person long diocese acting designate muhammed israel bird
Topic 1: bank central cbn nigeria nnpc nigerian emefiele corporation limited godwin petroleum plc kyari mele exchange
Topic 2: super coach germain eagles bayern munich cup game football europa atletico play uefa players dortmund
Topic 3: power china tennis huge economic strong australian support following efforts imposed financial property dutch far
Topic 4: congress muhammadu apc buhari progressives president ruling electoral biden senate joe inec independent senator vice
Topic 5: premier manchester league liverpool beat chelsea scored win city goals champions tottenham goal second arsenal
Topic 6: milan roma guardiola havertz lampard lacazette ancelotti topman scoring knockdown qualifiers alexandre silva rookie rangnick
Topic 7: award sounds collaborative notable breezy festival sussex society fetched levels patriotic racism ghostly multinational elders
Topic 8: elon mu

In [19]:
########################### Visualize by tSNE ####################################
# Export beta from the trainer and rebuild it as one NumPy array, converting
# each row through a plain Python list on the way.
beta = trainer.export_beta()
topic_list_vec = np.array([topic_dist.tolist() for topic_dist in beta])
# Transpose so that every column of beta becomes one sample for t-SNE.
topic_list_vec_T = topic_list_vec.T
# Run t-SNE (dimensionality reduction) down to two components.
TSNEmodel = TSNE(
    n_components=2,
    perplexity=20,
    n_iter=2000,
    verbose=1,
    random_state=0,
)
TSNE_ECRTM_vectors = TSNEmodel.fit_transform(topic_list_vec_T)

[t-SNE] Computing 61 nearest neighbors...
[t-SNE] Indexed 10026 samples in 0.001s...
[t-SNE] Computed neighbors for 10026 samples in 0.090s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10026
[t-SNE] Computed conditional probabilities for sample 2000 / 10026
[t-SNE] Computed conditional probabilities for sample 3000 / 10026
[t-SNE] Computed conditional probabilities for sample 4000 / 10026
[t-SNE] Computed conditional probabilities for sample 5000 / 10026
[t-SNE] Computed conditional probabilities for sample 6000 / 10026
[t-SNE] Computed conditional probabilities for sample 7000 / 10026
[t-SNE] Computed conditional probabilities for sample 8000 / 10026
[t-SNE] Computed conditional probabilities for sample 9000 / 10026
[t-SNE] Computed conditional probabilities for sample 10000 / 10026
[t-SNE] Computed conditional probabilities for sample 10026 / 10026
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 64.090134
[t-SNE] KL 

In [20]:
def get_color(seed):
    """Return a deterministic pseudo-random hex colour for ``seed``.

    Args:
        seed: Value used to seed the generator; equal seeds always produce
            the same colour.

    Returns:
        str: The colour formatted as ``"#rrggbb"``.
    """
    # A private Random instance yields the same value as seeding the
    # module-level generator, but leaves the global `random` state untouched
    # so other code relying on it is not silently reseeded.
    rng = random.Random(seed)
    return "#%06x" % rng.randint(0, 0xFFFFFF)

# For every t-SNE sample, the index of the column holding its largest value.
lda_keys = np.argmax(topic_list_vec_T, axis=1).tolist()

# One colour per point, derived from that index so equal indices share a colour.
point_colors = list(map(get_color, lda_keys))

plot = figure(title="ECRTMsample", width=700, height=700)
plot.scatter(
    x=TSNE_ECRTM_vectors[:, 0],
    y=TSNE_ECRTM_vectors[:, 1],
    color=point_colors,
)

show(plot)