In [1]:
# Mount Google Drive at /content/drive; the input and output paths used by this
# notebook live underneath that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install bertopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import glob, pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm

from bertopic import BERTopic

In [5]:
import os

# Project layout under the mounted Google Drive.
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR, "input")            # competition CSVs are read from here
OUTPUT_DIR = os.path.join(DIR, "output")          # root for everything this notebook writes
BERTOPIC_DIR = os.path.join(OUTPUT_DIR, "BERTopic")  # topic-model artefacts

# exist_ok=True replaces the check-then-create pattern: it is a no-op when the
# directory already exists and leaves no gap between the check and the create.
for out_dir in (OUTPUT_DIR, BERTOPIC_DIR):
    os.makedirs(out_dir, exist_ok=True)

In [6]:
# Fetch the NLTK "popular" collection; the stop-word list read below comes
# from NLTK's corpus data.
nltk.download("popular")

# Stop-word list: NLTK's English list followed by three clitic fragments
# (presumably the pieces word_tokenize splits contractions into - confirm).
sws = [*stopwords.words("english"), "n't", "'s", "'ve"]

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

In [7]:
# Build one document per row of the training data: title + abstract, tokenised,
# with stop words removed and the surviving tokens re-joined by single spaces.
df = pd.read_csv(os.path.join(INPUT_DIR, "train_data.csv"))

# fillna("") keeps a missing title or abstract from turning the concatenated
# text into NaN, which word_tokenize cannot process.
df["text"] = df["title"].fillna("") + " " + df["abstract"].fillna("")

# A set gives constant-time membership tests; scanning the `sws` list once per
# token is needlessly slow.
stopword_set = set(sws)

# Tokens are compared lower-cased but kept in their original casing.
docs = [
    " ".join(tok for tok in word_tokenize(text) if tok.lower() not in stopword_set)
    for text in tqdm(df["text"])
]

100%|██████████| 4974/4974 [00:19<00:00, 251.74it/s]


In [8]:
# Fit a BERTopic model on the cleaned documents.
# NOTE(review): no random seed is configured anywhere in this notebook, so the
# topic ids in the saved outputs may differ between runs - confirm whether
# reproducibility matters here.
topic_model = BERTopic(
    n_gram_range=(1, 3),  # n-gram range passed to the model: unigrams up to trigrams
    top_n_words=5,        # passed through as the number of words per topic
    verbose=True,         # emit progress logs while fitting
)

# `topics` and `probs` each hold one entry per document, in the order of `docs`.
topics, probs = topic_model.fit_transform(docs)

Batches:   0%|          | 0/156 [00:00<?, ?it/s]

2023-04-21 13:27:15,586 - BERTopic - Transformed documents to Embeddings
2023-04-21 13:27:54,668 - BERTopic - Reduced dimensionality
2023-04-21 13:27:54,866 - BERTopic - Clustered reduced embeddings


In [9]:
# Per-topic summary table: save it next to the model artefacts and show it.
tm_meta = topic_model.get_topic_info()
tm_meta.to_csv(os.path.join(BERTOPIC_DIR, "topic_model_metadata.csv"), index=False)
display(tm_meta)

# Per-document assignment: paper id, assigned topic and the accompanying
# probability, in the same row order as `df`.
pred_topics = pd.DataFrame(
    {
        "id": df["id"],
        "topic": topics,
        "prob": probs,
    }
)
pred_topics.to_csv(os.path.join(BERTOPIC_DIR, "topic_model_article.csv"), index=False)
pred_topics

Unnamed: 0,Topic,Count,Name
0,-1,1164,-1_neural_training_networks_model
1,0,765,0_learning_reinforcement_reinforcement learnin...
2,1,318,1_adversarial_attacks_robustness_adversarial e...
3,2,197,2_gans_generative_gan_generative adversarial
4,3,155,3_graph_node_graphs_nodes
...,...,...,...
65,64,12,64_quantum_entanglement_entanglement entropy_m...
66,65,11,65_language_biases_models_bias
67,66,10,66_convolutional_networks_cnns_network
68,67,10,67_sgd_hessian_stochastic_noise


Unnamed: 0,id,topic,prob
0,1,53,0.927143
1,2,30,0.788502
2,3,3,0.826212
3,4,22,0.343145
4,5,-1,0.000000
...,...,...,...
4969,4970,-1,0.000000
4970,4971,5,0.941278
4971,4972,1,0.789627
4972,4973,8,0.519015


In [10]:
# Persist the fitted topic model alongside the CSV outputs.
model_path = os.path.join(BERTOPIC_DIR, "article_topic_model")
topic_model.save(model_path)

In [None]:
# Attach each paper's topic assignment to the training frame.
# Dropping any columns left over from a previous run of this cell keeps it
# re-runnable: merging twice would otherwise produce topic_x/topic_y columns
# and groupby("topic") would raise KeyError.
merged_cols = [col for col in pred_topics.columns if col != "id"]
df = df.drop(columns=merged_cols, errors="ignore").merge(pred_topics, how="left", on="id")

# Mean of the target column `y` within each topic (topic -1 included).
df.groupby("topic")["y"].mean()