In [57]:
pip install bertopic



In [60]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [61]:
import pandas as pd
data = pd.read_csv('drive/MyDrive/InfoQualityLab/WJH_citation_context_20240505.csv')
data = data[~data['ID'].isna()]
data.head()

Unnamed: 0,ID,Old_ID,DOI,Q1 - Review article? (Scopus),Q1 - Review article? (WoS),Q2 - Addendum or Neupane? (Scopus),Q2 - Addendum or Neupane? (WoS),Q3: Introduction section?,label in the reference list,Number of citation contexts,Citation marker,Citation context,Introduction section?,Author,Title,Year,Journal,Source,Other notes
0,1.0,0.0,10.1002/anie.201706532,n,n,n,n,n,10,1.0,10,We speculated that goupiolone B is biosyntheti...,n,"Matsuo Y., Yoshida A., Saito Y., Tanaka T.",Structural Revision and Biomimetic Synthesis o...,2017.0,ANGEWANDTE CHEMIE-INTERNATIONAL EDITION,"Scopus, Web of Science",
1,2.0,1.0,10.1002/anie.201708266,n,n,n,n,n,10,2.0,"Refs. 5, 9, and 10",Nuclear magnetic resonance (NMR) is the most i...,n,"Grimme S., Bannwarth C., Dohm S., Hansen A., P...",Fully Automated Quantum-Chemistry-Based Comput...,2017.0,ANGEWANDTE CHEMIE-INTERNATIONAL EDITION,"Scopus, Web of Science",
2,2.0,1.0,10.1002/anie.201708266,n,n,n,n,n,10,2.0,10,The crucial and thus far unsolved problem for ...,n,"Grimme S., Bannwarth C., Dohm S., Hansen A., P...",Fully Automated Quantum-Chemistry-Based Comput...,2017.0,ANGEWANDTE CHEMIE-INTERNATIONAL EDITION,"Scopus, Web of Science",
3,3.0,2.0,10.1002/anie.201810566,n,n,n,n,n,14,1.0,"13, 14",An additional complication in the justicane ca...,n,"Elkin M., Scruse A.C., Turlik A., Newhouse T.R.",Computational and Synthetic Investigation of C...,2019.0,ANGEWANDTE CHEMIE-INTERNATIONAL EDITION,"Scopus, Web of Science","Recheck, the other citation is the Gaussian"
4,4.0,3.0,10.1002/anie.201902777,n,n,n,n,n,4,1.0,4,Determination of structure is a fundamental pi...,n,"Kutateladze A.G., Krenske E.H., Williams C.M.",Reassignments and Corroborations of Oxo-Bridge...,2019.0,ANGEWANDTE CHEMIE-INTERNATIONAL EDITION,"Scopus, Web of Science",


In [62]:
docs = data[~data['Citation context'].isna()]['Citation context']

In [72]:
def get_coherence(docs, topics, topic_model):
  # Preprocess Documents
  documents = pd.DataFrame({"Document": docs,
                            "ID": range(len(docs)),
                            "Topic": topics})
  documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
  cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

  # Extract vectorizer and analyzer from BERTopic
  vectorizer = topic_model.vectorizer_model
  analyzer = vectorizer.build_analyzer()

  # Extract features for Topic Coherence evaluation
  words = vectorizer.get_feature_names_out()
  tokens = [analyzer(doc) for doc in cleaned_docs]
  dictionary = corpora.Dictionary(tokens)
  corpus = [dictionary.doc2bow(token) for token in tokens]
  topic_words = [[words for words, _ in topic_model.get_topic(topic)]
                for topic in range(len(set(topics))-1)]

  # Evaluate
  coherence_model = CoherenceModel(topics=topic_words,
                                  texts=tokens,
                                  corpus=corpus,
                                  dictionary=dictionary,
                                  coherence='c_v')
  coherence = coherence_model.get_coherence()
  return coherence

In [116]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

min_topics = 2
max_topics = 11

kb = {}
dflt = {}

best_model = None
best_model_params = None
best_coherence = 0

results = []
for i in range(min_topics, max_topics + 1):
  cluster_model = KMeans(n_clusters=i)
  representation_model = KeyBERTInspired()

  topic_model = BERTopic(representation_model=representation_model, hdbscan_model=cluster_model)
  topics, _ = topic_model.fit_transform(docs)

  coherence = get_coherence(docs, topics, topic_model)
  results.append({'Keywords' : 'KeyBERTInspired',
                  'Num clusters' : i,
                  'Coherence' : coherence})
  kb[i] = topic_model

  if coherence > best_coherence:
    best_model = topic_model
    best_model_params = f'KeyBERTInspired keywords with {i} clusters'
    best_coherence = coherence

  cluster_model = KMeans(n_clusters=i)
  topic_model = BERTopic(hdbscan_model=cluster_model)
  topics, _ = topic_model.fit_transform(docs)
  vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3))
  topic_model.update_topics(docs, vectorizer_model=vectorizer_model)

  coherence = get_coherence(docs, topics, topic_model)
  results.append({'Keywords' : 'CountVectorizer (1-3 ngram)',
                  'Num clusters' : i,
                  'Coherence' : coherence})
  dflt[i] = topic_model

  if coherence > best_coherence:
    best_model = topic_model
    best_model_params = f'CountVectorizer (1-3 ngram) keywords with {i} clusters'
    best_coherence = coherence

print(f'Best model: {best_model_params}')

Best model: CountVectorizer (1-3 ngram) keywords with 4 clusters


In [117]:
pd.DataFrame(results)

Unnamed: 0,Keywords,Num clusters,Coherence
0,KeyBERTInspired,2,0.418934
1,CountVectorizer (1-3 ngram),2,0.750863
2,KeyBERTInspired,3,0.411426
3,CountVectorizer (1-3 ngram),3,0.74666
4,KeyBERTInspired,4,0.383606
5,CountVectorizer (1-3 ngram),4,0.798641
6,KeyBERTInspired,5,0.426365
7,CountVectorizer (1-3 ngram),5,0.741236
8,KeyBERTInspired,6,0.44967
9,CountVectorizer (1-3 ngram),6,0.739206


In [118]:
best_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,157,0_using_conformers_calculated_level,"[using, conformers, calculated, level, nmr, ch...",[All molecular mechanics calculations were per...
1,1,136,1_nmr_chemical_shifts_chemical shifts,"[nmr, chemical, shifts, chemical shifts, metho...","[For more than a hundred years, single-crystal..."
2,2,95,2_nmr_chemical_data_configuration,"[nmr, chemical, data, configuration, shifts, 1...",[Compound 1 was obtained as a white amorphous ...
3,3,13,3_cal_tms_shielding_calculated,"[cal, tms, shielding, calculated, constants, s...",[NMR shielding constants were calculated with ...


Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Representative_document
0,We speculated that goupiolone B is biosyntheti...,2,2_nmr_chemical_data_configuration,"[nmr, chemical, data, configuration, shifts, 1...",[Compound 1 was obtained as a white amorphous ...,nmr - chemical - data - configuration - shifts...,False
1,Nuclear magnetic resonance (NMR) is the most i...,1,1_nmr_chemical_shifts_chemical shifts,"[nmr, chemical, shifts, chemical shifts, metho...","[For more than a hundred years, single-crystal...",nmr - chemical - shifts - chemical shifts - me...,False
2,The crucial and thus far unsolved problem for ...,0,0_using_conformers_calculated_level,"[using, conformers, calculated, level, nmr, ch...",[All molecular mechanics calculations were per...,using - conformers - calculated - level - nmr ...,False
3,An additional complication in the justicane ca...,2,2_nmr_chemical_data_configuration,"[nmr, chemical, data, configuration, shifts, 1...",[Compound 1 was obtained as a white amorphous ...,nmr - chemical - data - configuration - shifts...,False
4,Determination of structure is a fundamental pi...,2,2_nmr_chemical_data_configuration,"[nmr, chemical, data, configuration, shifts, 1...",[Compound 1 was obtained as a white amorphous ...,nmr - chemical - data - configuration - shifts...,False
...,...,...,...,...,...,...,...
396,The minimum energy conformations for each comp...,0,0_using_conformers_calculated_level,"[using, conformers, calculated, level, nmr, ch...",[All molecular mechanics calculations were per...,using - conformers - calculated - level - nmr ...,True
397,2D NMR spectroscopy has been used for the char...,1,1_nmr_chemical_shifts_chemical shifts,"[nmr, chemical, shifts, chemical shifts, metho...","[For more than a hundred years, single-crystal...",nmr - chemical - shifts - chemical shifts - me...,False
398,Relative Boltzmann populations are calculated ...,0,0_using_conformers_calculated_level,"[using, conformers, calculated, level, nmr, ch...",[All molecular mechanics calculations were per...,using - conformers - calculated - level - nmr ...,False
399,"As shown in Table 3, our calculations predict ...",0,0_using_conformers_calculated_level,"[using, conformers, calculated, level, nmr, ch...",[All molecular mechanics calculations were per...,using - conformers - calculated - level - nmr ...,False


In [136]:
# final sampling

total_required = 80
doc_topics = best_model.get_document_info(docs)
doc_topics['DOI'] = data['DOI']

ideal_count = int(total_required/len(best_model.get_topics()))
count_per_cluster = doc_topics.groupby('Topic').count()['Document'].reset_index()

sample = []
clusters_in_sample = set()
for index, row in count_per_cluster.iterrows():
  if row['Document'] <= ideal_count:
    sample.append(doc_topics[doc_topics['Topic'] == row['Topic']])
    clusters_in_sample.add(row['Topic'])

current_total = len(pd.concat(sample))
remaining_required = total_required - current_total
remaining_data = doc_topics[~doc_topics['Topic'].isin(clusters_in_sample)]
remaining_count = remaining_data.groupby('Topic').count()['Document'].reset_index()
remaining_count['percent'] = remaining_count['Document'] / sum(remaining_count['Document'])
remaining_count['count'] = remaining_count['percent'] * remaining_required
remaining_count['count'] = remaining_count['count'].round()

for index, row in remaining_count.iterrows():
  subset = remaining_data[remaining_data['Topic'] == row['Topic']]
  unique_subset = subset.groupby('DOI').apply(lambda df: df.sample(1))
  sample.append(unique_subset.sample(int(row['count'])))

sample = pd.concat(sample)
sample.groupby('Topic').count()['Document'].reset_index()

Unnamed: 0,Topic,Document
0,0,27
1,1,23
2,2,16
3,3,13


In [137]:
count_per_cluster

Unnamed: 0,Topic,Document
0,0,157
1,1,136
2,2,95
3,3,13


In [138]:
best_model.get_topic_info().to_csv('drive/MyDrive/InfoQualityLab/WJH_citation_context_20240505_bertopic_topics.csv')

In [140]:
sample.to_csv('drive/MyDrive/InfoQualityLab/WJH_citation_context_20240505_bertopic_sample.csv')