In [1]:
###' ################################################################################
###'
###' IMPORT LIBRARIES
###'
###'

import numpy as np
import pandas as pd
from pathlib import Path
import random
import torch
from keybert import KeyBERT

import ast
import umap.umap_ as umap
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
from sklearn.manifold import SpectralEmbedding

### LDA
import gensim
import gensim.corpora as corpora
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Phrases

### BERT
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer




In [2]:
# Load the four per-group article tables from CSV.
# NOTE(review): absolute, machine-specific path — consider a configurable,
# project-relative data directory so the notebook runs on other machines.
data_dir = Path(r"C:\Users\Hyemi\Python\TopicModeling\Data")
data_dir.mkdir(parents=True, exist_ok=True)

embedding_files = [
    "articles_4_clustering1.csv",
    "articles_4_clustering2.csv",
    "articles_4_clustering3.csv",
    "articles_4_clustering4.csv",
]

embedding_file_paths = [data_dir / file for file in embedding_files]

# Fail loudly when an input is missing. Silently skipping a file would shift
# the remaining frames into the wrong df1a..df4a slot (or end in an opaque
# IndexError when the list is indexed below).
missing_files = [str(path) for path in embedding_file_paths if not path.exists()]
if missing_files:
    raise FileNotFoundError(f"Missing input file(s): {', '.join(missing_files)}")

embedding_dfs = [pd.read_csv(file_path) for file_path in embedding_file_paths]

# One frame per group, in the same order as `embedding_files`.
df1a, df2a, df3a, df4a = embedding_dfs

In [3]:
# Display the frame loaded from the third CSV to inspect its columns.
df3a

Unnamed: 0,Year,Abstract,Abstract_tokens,Abstract_join,Year_Group,Embeddings,Embeddings_S,Cluster3,Cluster4,Cluster5,Cluster8,Cluster9
0,2020,We describe a corpus of speech taking place be...,"['describe', 'corpus', 'speech', 'place', 'kor...",describe corpus speech place korean mother chi...,3,"[1.3425672054290771, 1.4400174617767334, 1.108...","[0.34810734772791113, 0.4677801044194296, 0.57...",0,0,3,7,3
1,2020,Although compassion in healthcare differs in i...,"['compassion', 'healthcare', 'differ', 'import...",compassion healthcare differ important way com...,3,"[1.5338462591171265, 1.530231237411499, 0.9366...","[0.7735476754291247, 0.8830654594875341, -0.47...",0,0,3,7,3
2,2020,This study explored the effect of learning str...,"['explore', 'learn', 'strategy', 'student', 'o...",explore learn strategy student organization ca...,3,"[1.4609277248382568, 1.5704612731933594, 0.914...","[0.6113632411120875, 1.0682582925833488, -0.60...",0,0,3,7,3
3,2020,"Using the Grounded theory, we took Chinese ent...","['ground', 'theory', 'chinese', 'entrepreneur'...",ground theory chinese entrepreneur object cons...,3,"[1.3701508045196533, 1.3691604137420654, 0.899...","[0.4094584220353579, 0.1416004922137856, -0.69...",0,0,4,4,4
4,2020,"With the outbreak of the COVID- crisis, the pu...","['outbreak', 'covid', 'crisis', 'public', 'get...",outbreak covid crisis public getting epidemicr...,3,"[1.3461201190948486, 1.4608646631240845, 0.855...","[0.3560096910588494, 0.563747015822865, -0.964...",0,1,0,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...
7097,2021,BackgroundThe literature shows the negative ps...,"['backgroundthe', 'literature', 'negative', 'p...",backgroundthe literature negative psychologica...,3,"[1.1762337684631348, 1.5237650871276855, 0.911...","[-0.02184927097153048, 0.853299522992328, -0.6...",0,1,0,3,1
7098,2021,This study investigated Chinese university stu...,"['investigate', 'chinese', 'university', 'stud...",investigate chinese university student technol...,3,"[0.7362575531005859, 1.1006767749786377, 0.979...","[-1.0004385186465743, -1.0943229847643705, -0....",0,1,0,3,1
7099,2021,People often use concrete spatial terms to rep...,"['people', 'concrete', 'spatial', 'term', 'rep...",people concrete spatial term represent time me...,3,"[1.3095670938491821, 1.082215428352356, 0.9239...","[0.2747089383671882, -1.1793069778938658, -0.5...",0,1,0,2,2
7100,2021,This study departs from existing work on board...,"['depart', 'exist', 'work', 'board', 'gender',...",depart exist work board gender diversity corpo...,3,"[1.3322162628173828, 1.436128854751587, 0.8619...","[0.3250849188114781, 0.4498794951411282, -0.92...",0,1,0,3,1


## Group 1

In [4]:
# Group 1 frame; show how many rows fall into each label of every clustering.
df1 = df1a
for cluster_col in ('Cluster3', 'Cluster4', 'Cluster5'):
    print(df1[cluster_col].value_counts())

Cluster3
0    3101
1     862
2     432
Name: count, dtype: int64
Cluster4
0    2115
3     986
1     862
2     432
Name: count, dtype: int64
Cluster5
0    1650
3     986
4     862
1     465
2     432
Name: count, dtype: int64


In [5]:
###########  BEST so far

# Seed the NumPy, stdlib and torch RNGs before fitting.
np.random.seed(40)
random.seed(40)
torch.manual_seed(40)

# Take documents and labels from the SAME filtered rows: dropping missing
# abstracts from only one of the two lists would leave them misaligned
# (or of different lengths).
labeled_docs = df1[['Abstract_join', 'Cluster3']].dropna(subset=['Abstract_join'])
abstracts = labeled_docs['Abstract_join'].tolist()
cluster_labels = labeled_docs['Cluster3'].astype(int).tolist()

# BERTopic over 2- to 4-gram counts with class-based TF-IDF term weighting.
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit with the cluster labels passed as `y`, then merge down to 4 topics
# (one more than the number of clusters — presumably to leave room for
# BERTopic's outlier topic; confirm).
# NOTE(review): the ids in get_topics() are BERTopic's own topic ids and are
# not guaranteed to coincide with the predefined cluster ids — confirm the
# correspondence before reading the "Cluster N" lines as cluster descriptions.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=4)

# Top terms per topic id.
topic_summary = bertopic_model.get_topics()

# Print the first ten terms for every cluster id that is also a topic id.
for cluster in sorted(df1['Cluster3'].unique()):
    if cluster in topic_summary:
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: work memory, facial expression, individual difference, old adult, control group, executive function, young adult, reaction time, positive negative, provide evidence
Cluster 1: music training, musical training, speech music, phonological awareness, listen music, music language, program note, music speech, music performance, emotional response
Cluster 2: gender difference, spatial ability, role noun, gender information, noun phrase, reflexive pronoun, grammatical gender, genderfair language, stem career, man woman


In [6]:
###########  BEST so far

# Seed the NumPy, stdlib and torch RNGs before fitting.
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)

# Take documents and labels from the SAME filtered rows: dropping missing
# abstracts from only one of the two lists would leave them misaligned
# (or of different lengths).
labeled_docs = df1[['Abstract_join', 'Cluster4']].dropna(subset=['Abstract_join'])
abstracts = labeled_docs['Abstract_join'].tolist()
cluster_labels = labeled_docs['Cluster4'].astype(int).tolist()

# BERTopic over 2- to 4-gram counts with class-based TF-IDF term weighting.
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit with the cluster labels passed as `y`, then merge down to 5 topics
# (one more than the number of clusters — presumably to leave room for
# BERTopic's outlier topic; confirm).
# NOTE(review): the ids in get_topics() are BERTopic's own topic ids and are
# not guaranteed to coincide with the predefined cluster ids — confirm the
# correspondence before reading the "Cluster N" lines as cluster descriptions.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=5)

# Top terms per topic id.
topic_summary = bertopic_model.get_topics()

# Print the first ten terms for every cluster id that is also a topic id.
for cluster in sorted(df1['Cluster4'].unique()):
    if cluster in topic_summary:
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: work memory, individual difference, facial expression, control group, old adult, executive function, reaction time, experiment participant, provide evidence, participant perform
Cluster 1: job satisfaction, prosocial behavior, depressive symptom, social support, physical activity, adult attachment, mental health, attachment style, individual difference, confirmatory factor
Cluster 2: music training, speech music, musical training, piano performance, music performance, music perception, program note, music speech, expressive bodily, bodily movement
Cluster 3: spatial frequency, cortical area, luminance step, figure ground, grid cell, surface quality, color discrimination, contour shape, local contour, border ownership


In [7]:
###########  BEST so far

# Seed the NumPy, stdlib and torch RNGs before fitting.
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)

# Take documents and labels from the SAME filtered rows: dropping missing
# abstracts from only one of the two lists would leave them misaligned
# (or of different lengths).
labeled_docs = df1[['Abstract_join', 'Cluster5']].dropna(subset=['Abstract_join'])
abstracts = labeled_docs['Abstract_join'].tolist()
cluster_labels = labeled_docs['Cluster5'].astype(int).tolist()

# BERTopic over 2- to 4-gram counts with class-based TF-IDF term weighting.
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit with the cluster labels passed as `y`, then merge down to 6 topics
# (one more than the number of clusters — presumably to leave room for
# BERTopic's outlier topic; confirm).
# NOTE(review): the ids in get_topics() are BERTopic's own topic ids and are
# not guaranteed to coincide with the predefined cluster ids — confirm the
# correspondence before reading the "Cluster N" lines as cluster descriptions.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=6)

# Top terms per topic id.
topic_summary = bertopic_model.get_topics()

# Print the first ten terms for every cluster id that is also a topic id.
for cluster in sorted(df1['Cluster5'].unique()):
    if cluster in topic_summary:
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: facial expression, individual difference, work memory, control group, executive function, old adult, depressive symptom, positive negative, young adult, reaction time
Cluster 1: work memory, speech perception, second language, auditory visual, native speaker, speech sound, lexical decision, musical training, eventrelate potential, word recognition
Cluster 2: gender difference, spatial ability, stem career, stem field, engineering mathematic, word pair, woman engineering, angrymale bias, gender gap, science technology
Cluster 3: zebra finch, selfother integration, vocal interaction, wolf dog, social network, fuzzy clustering, social structure, spatial distribution, physical size, social learning
Cluster 4: odor pleasantness, olfactory receptor, body odor, auditory modality, olfactory cognition, fragrance condition, sign disorder, anterior medial, odor identification, pleasantness rating


## Group 2

In [8]:
# Group 2 frame; show how many rows fall into each label of every clustering.
df2 = df2a
for cluster_col in ('Cluster3', 'Cluster4', 'Cluster5'):
    print(df2[cluster_col].value_counts())

Cluster3
0    3387
1    1432
2     945
Name: count, dtype: int64
Cluster4
0    2229
1    1432
3    1158
2     945
Name: count, dtype: int64
Cluster5
3    1595
0    1432
1    1158
2     945
4     634
Name: count, dtype: int64


In [9]:
###########  BEST so far

# Seed the NumPy, stdlib and torch RNGs before fitting.
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)

# Take documents and labels from the SAME filtered rows: dropping missing
# abstracts from only one of the two lists would leave them misaligned
# (or of different lengths).
labeled_docs = df2[['Abstract_join', 'Cluster3']].dropna(subset=['Abstract_join'])
abstracts = labeled_docs['Abstract_join'].tolist()
cluster_labels = labeled_docs['Cluster3'].astype(int).tolist()

# BERTopic over 2- to 4-gram counts with class-based TF-IDF term weighting.
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit with the cluster labels passed as `y`, then merge down to 4 topics
# (one more than the number of clusters — presumably to leave room for
# BERTopic's outlier topic; confirm).
# NOTE(review): the ids in get_topics() are BERTopic's own topic ids and are
# not guaranteed to coincide with the predefined cluster ids — confirm the
# correspondence before reading the "Cluster N" lines as cluster descriptions.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=4)

# Top terms per topic id.
topic_summary = bertopic_model.get_topics()

# Print the first ten terms for every cluster id that is also a topic id.
for cluster in sorted(df2['Cluster3'].unique()):
    if cluster in topic_summary:
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, control group, old adult, work memory, facial expression, structural equation, executive function, confirmatory factor, positive negative, individual difference
Cluster 1: lexical tone, work memory, second language, bilingual child, relative clause, inhibitory control, novel word, language dominance, old adult, speech perception
Cluster 2: academic procrastination, utility value, bedtime procrastination, distal utility, distal utility value, procrastination scale, team procrastination, effort cost, peer attachment, selfregulatory resource


In [10]:
###########  BEST so far

# Seed the NumPy, stdlib and torch RNGs before fitting.
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)

# Take documents and labels from the SAME filtered rows: dropping missing
# abstracts from only one of the two lists would leave them misaligned
# (or of different lengths).
labeled_docs = df2[['Abstract_join', 'Cluster4']].dropna(subset=['Abstract_join'])
abstracts = labeled_docs['Abstract_join'].tolist()
cluster_labels = labeled_docs['Cluster4'].astype(int).tolist()

# BERTopic over 2- to 4-gram counts with class-based TF-IDF term weighting.
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit with the cluster labels passed as `y`, then merge down to 5 topics
# (one more than the number of clusters — presumably to leave room for
# BERTopic's outlier topic; confirm).
# NOTE(review): the ids in get_topics() are BERTopic's own topic ids and are
# not guaranteed to coincide with the predefined cluster ids — confirm the
# correspondence before reading the "Cluster N" lines as cluster descriptions.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=5)

# Top terms per topic id.
topic_summary = bertopic_model.get_topics()

# Print the first ten terms for every cluster id that is also a topic id.
for cluster in sorted(df2['Cluster4'].unique()):
    if cluster in topic_summary:
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, work memory, control group, executive function, old adult, structural equation, facial expression, individual difference, positive negative, confirmatory factor
Cluster 1: entrepreneurial intention, entrepreneurship education, entrepreneurial follower, entrepreneurial education, entrepreneurial orientation, entrepreneurial passion, university student, dark triad, entrepreneurial selfefficacy, student entrepreneurial
Cluster 2: pet dog, dog owner, domestic dog, ot avp, behavior dog, oxtr gene, fear response, avp dog, assistance dog, wolf dog
Cluster 3: rewarddelay impulsivity, adolescent gambling, venue terminal, cognitive distortion, net loss, rapidresponse impulsivity, dysfunctional impulsivity, gambling behavior, impulsivity rewarddelay impulsivity, deposit limit


In [11]:
###########  BEST so far

# Seed the NumPy, stdlib and torch RNGs before fitting.
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)

# Take documents and labels from the SAME filtered rows: dropping missing
# abstracts from only one of the two lists would leave them misaligned
# (or of different lengths).
labeled_docs = df2[['Abstract_join', 'Cluster5']].dropna(subset=['Abstract_join'])
abstracts = labeled_docs['Abstract_join'].tolist()
cluster_labels = labeled_docs['Cluster5'].astype(int).tolist()

# BERTopic over 2- to 4-gram counts with class-based TF-IDF term weighting.
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit with the cluster labels passed as `y`, then merge down to 6 topics
# (one more than the number of clusters — presumably to leave room for
# BERTopic's outlier topic; confirm).
# NOTE(review): the ids in get_topics() are BERTopic's own topic ids and are
# not guaranteed to coincide with the predefined cluster ids — confirm the
# correspondence before reading the "Cluster N" lines as cluster descriptions.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=6)

# Top terms per topic id.
topic_summary = bertopic_model.get_topics()

# Print the first ten terms for every cluster id that is also a topic id.
for cluster in sorted(df2['Cluster5'].unique()):
    if cluster in topic_summary:
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: facial expression, mental health, positive negative, emotion regulation, personality trait, depressive symptom, eat disorder, man woman, social support, gender difference
Cluster 1: structural equation, work engagement, job satisfaction, mediate relationship, confirmatory factor, practical implication, datum collect, equation modeling, structural equation modeling, physical activity
Cluster 2: work memory, old adult, executive function, working memory, child asd, reaction time, control group, young adult, typically develop, cognitive control
Cluster 3: musical instrument, music performance, acoustic feature, musical training, practice session, music listen, musical feature, music training, group music, musical excerpt
Cluster 4: domestic dog, ot avp, avp dog, comt valmet, social behavior, behavior dog, dog owner, pet dog, gaze behavior, japanese dog


## Group 3

In [12]:
# Group 3 frame; show how many rows fall into each label of every clustering.
df3 = df3a
for cluster_col in ('Cluster4', 'Cluster5', 'Cluster8', 'Cluster9'):
    print(df3[cluster_col].value_counts())

Cluster4
1    3029
3    1932
0    1467
2     674
Name: count, dtype: int64
Cluster5
0    3029
1    1932
3    1283
2     674
4     184
Name: count, dtype: int64
Cluster8
3    1683
6    1290
7    1283
2     747
5     674
0     642
1     599
4     184
Name: count, dtype: int64
Cluster9
1    1683
6    1290
3    1283
2     747
5     674
0     599
8     351
7     291
4     184
Name: count, dtype: int64


In [13]:
# Seed the NumPy, stdlib and torch RNGs before fitting.
np.random.seed(40)
random.seed(40)
torch.manual_seed(40)

# Take documents and labels from the SAME filtered rows: dropping missing
# abstracts from only one of the two lists would leave them misaligned
# (or of different lengths).
labeled_docs = df3[['Abstract_join', 'Cluster4']].dropna(subset=['Abstract_join'])
abstracts = labeled_docs['Abstract_join'].tolist()
cluster_labels = labeled_docs['Cluster4'].astype(int).tolist()

# BERTopic over 2- to 4-gram counts with class-based TF-IDF term weighting.
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit with the cluster labels passed as `y`, then merge down to 5 topics
# (one more than the number of clusters — presumably to leave room for
# BERTopic's outlier topic; confirm).
# NOTE(review): the ids in get_topics() are BERTopic's own topic ids and are
# not guaranteed to coincide with the predefined cluster ids — confirm the
# correspondence before reading the "Cluster N" lines as cluster descriptions.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=5)

# Top terms per topic id.
topic_summary = bertopic_model.get_topics()

# Print the first ten terms for every cluster id that is also a topic id.
for cluster in sorted(df3['Cluster4'].unique()):
    if cluster in topic_summary:
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, covid pandemic, physical activity, college student, structural equation, social support, university student, datum collect, entrepreneurship education, depressive symptom
Cluster 1: facial expression, purchase intention, old adult, social medium, native speaker, second language, work memory, significant difference, eye movement, cognitive control
Cluster 2: soccer player, football player, elite athlete, mental health, futsal player, significant difference, sport performance, physical activity, covid pandemic, team sport
Cluster 3: music student, music performance, music listen, acoustic environment, musical experience, music therapy, performance anxiety, aesthetic judgment, instrumental music, musical performance


In [14]:
# Seed the NumPy, stdlib and torch RNGs before fitting.
np.random.seed(40)
random.seed(40)
torch.manual_seed(40)

# Take documents and labels from the SAME filtered rows: dropping missing
# abstracts from only one of the two lists would leave them misaligned
# (or of different lengths).
labeled_docs = df3[['Abstract_join', 'Cluster5']].dropna(subset=['Abstract_join'])
abstracts = labeled_docs['Abstract_join'].tolist()
cluster_labels = labeled_docs['Cluster5'].astype(int).tolist()

# BERTopic over 2- to 4-gram counts with class-based TF-IDF term weighting.
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit with the cluster labels passed as `y`, then merge down to 6 topics
# (one more than the number of clusters — presumably to leave room for
# BERTopic's outlier topic; confirm).
# NOTE(review): the ids in get_topics() are BERTopic's own topic ids and are
# not guaranteed to coincide with the predefined cluster ids — confirm the
# correspondence before reading the "Cluster N" lines as cluster descriptions.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=6)

# Top terms per topic id.
topic_summary = bertopic_model.get_topics()

# Print the first ten terms for every cluster id that is also a topic id.
for cluster in sorted(df3['Cluster5'].unique()):
    if cluster in topic_summary:
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, covid pandemic, college student, physical activity, structural equation, social support, university student, work engagement, datum collect, depressive symptom
Cluster 1: purchase intention, facial expression, old adult, second language, native speaker, work memory, social medium, cognitive control, individual difference, climate change
Cluster 2: soccer player, physical activity, football player, elite athlete, mental health, sport performance, futsal player, significant difference, covid pandemic, goal motive
Cluster 3: music student, music performance, music listen, performance anxiety, old adult, musical experience, musical performance, music therapy, aesthetic judgment, instrumental music
Cluster 4: virtual reality, art psychomotor, psychomotor therapy, art psychomotor therapy, shot performance, motion sickness, vr health, immersive experience, successful aging, vr health experience


In [15]:
# Seed the NumPy, stdlib and torch RNGs before fitting.
np.random.seed(40)
random.seed(40)
torch.manual_seed(40)

# Take documents and labels from the SAME filtered rows: dropping missing
# abstracts from only one of the two lists would leave them misaligned
# (or of different lengths).
labeled_docs = df3[['Abstract_join', 'Cluster8']].dropna(subset=['Abstract_join'])
abstracts = labeled_docs['Abstract_join'].tolist()
cluster_labels = labeled_docs['Cluster8'].astype(int).tolist()

# BERTopic over 2- to 4-gram counts with class-based TF-IDF term weighting.
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit with the cluster labels passed as `y`, then merge down to 9 topics
# (one more than the number of clusters — presumably to leave room for
# BERTopic's outlier topic; confirm).
# NOTE(review): the ids in get_topics() are BERTopic's own topic ids and are
# not guaranteed to coincide with the predefined cluster ids — confirm the
# correspondence before reading the "Cluster N" lines as cluster descriptions.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=9)

# Top terms per topic id.
topic_summary = bertopic_model.get_topics()

# Print the first ten terms for every cluster id that is also a topic id.
for cluster in sorted(df3['Cluster8'].unique()):
    if cluster in topic_summary:
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, covid pandemic, physical activity, social support, structural equation, work engagement, university student, life satisfaction, college student, depressive symptom
Cluster 1: facial expression, old adult, native speaker, negative emotion, individual difference, work memory, social medium, emotion regulation, control group, second language
Cluster 2: entrepreneurship education, entrepreneurial intention, purchase intention, college student, innovation entrepreneurship, structural equation, student entrepreneurial, entrepreneurial selfefficacy, psychological capital, new venture
Cluster 3: soccer player, football player, sport performance, elite athlete, mental health, physical activity, goal motive, futsal player, team sport, significant difference
Cluster 4: music student, music performance, acoustic environment, music listen, musical instrument, musical experience, old adult, frequency band, performance anxiety, music therapy
Cluster 5: communicative act, gen

In [16]:
# Seed the NumPy, stdlib and torch RNGs before fitting.
np.random.seed(40)
random.seed(40)
torch.manual_seed(40)

# Take documents and labels from the SAME filtered rows: dropping missing
# abstracts from only one of the two lists would leave them misaligned
# (or of different lengths).
labeled_docs = df3[['Abstract_join', 'Cluster9']].dropna(subset=['Abstract_join'])
abstracts = labeled_docs['Abstract_join'].tolist()
cluster_labels = labeled_docs['Cluster9'].astype(int).tolist()

# BERTopic over 2- to 4-gram counts with class-based TF-IDF term weighting.
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit with the cluster labels passed as `y`, then merge down to 10 topics
# (one more than the number of clusters — presumably to leave room for
# BERTopic's outlier topic; confirm).
# NOTE(review): the ids in get_topics() are BERTopic's own topic ids and are
# not guaranteed to coincide with the predefined cluster ids — confirm the
# correspondence before reading the "Cluster N" lines as cluster descriptions.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=10)

# Top terms per topic id.
topic_summary = bertopic_model.get_topics()

# Print the first ten terms for every cluster id that is also a topic id.
for cluster in sorted(df3['Cluster9'].unique()):
    if cluster in topic_summary:
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: entrepreneurship education, college student, work engagement, entrepreneurial intention, structural equation, datum collect, psychological capital, job satisfaction, mediate relationship, moderate relationship
Cluster 1: mental health, covid pandemic, physical activity, depression anxiety, anxiety depression, social support, depressive symptom, online survey, perceive stress, coronavirus disease
Cluster 2: native speaker, second language, work memory, visual attention, old adult, control group, word recognition, individual difference, clause chain, significant difference
Cluster 3: social medium, purchase intention, facial expression, climate change, emotion recognition, emotion regulation, structural equation, perceive value, gender stereotype, datum collect
Cluster 4: mental health, executive function, depressive symptom, child adolescent, young child, parent child, social anxiety, child asd, autism spectrum, college student
Cluster 5: soccer player, elite athlete, footbal

## Group 4

In [17]:
# Group 4 frame; show how many rows fall into each label of every clustering.
df4 = df4a
for cluster_col in ('Cluster4', 'Cluster5', 'Cluster9', 'Cluster10'):
    print(df4[cluster_col].value_counts())

Cluster4
0    3407
2    3265
3    3042
1    2461
Name: count, dtype: int64
Cluster5
0    3265
3    3042
1    2461
2    2057
4    1350
Name: count, dtype: int64
Cluster9
0    3042
2    2193
5    1972
4    1350
1    1036
3    1021
8     647
7     646
6     268
Name: count, dtype: int64
Cluster10
3    2266
2    2193
5    1972
9    1350
0    1036
1    1021
4     776
8     647
7     646
6     268
Name: count, dtype: int64


In [18]:
###########  BEST so far

# Seed the NumPy, stdlib and torch RNGs before fitting.
np.random.seed(40)
random.seed(40)
torch.manual_seed(40)

# Take documents and labels from the SAME filtered rows: dropping missing
# abstracts from only one of the two lists would leave them misaligned
# (or of different lengths).
labeled_docs = df4[['Abstract_join', 'Cluster4']].dropna(subset=['Abstract_join'])
abstracts = labeled_docs['Abstract_join'].tolist()
cluster_labels = labeled_docs['Cluster4'].astype(int).tolist()

# BERTopic over 2- to 4-gram counts with class-based TF-IDF term weighting.
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit with the cluster labels passed as `y`, then merge down to 5 topics
# (one more than the number of clusters — presumably to leave room for
# BERTopic's outlier topic; confirm).
# NOTE(review): the ids in get_topics() are BERTopic's own topic ids and are
# not guaranteed to coincide with the predefined cluster ids — confirm the
# correspondence before reading the "Cluster N" lines as cluster descriptions.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=5)

# Top terms per topic id.
topic_summary = bertopic_model.get_topics()

# Print the first ten terms for every cluster id that is also a topic id.
for cluster in sorted(df4['Cluster4'].unique()):
    if cluster in topic_summary:
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: structural equation, college student, datum collect, entrepreneurial intention, equation modeling, mediate relationship, job satisfaction, structural equation modeling, purchase intention, practical implication
Cluster 1: mental health, college student, physical activity, social support, covid pandemic, sleep quality, quality life, depressive symptom, anxiety depression, university student
Cluster 2: old adult, facial expression, emotion recognition, significant difference, second language, work memory, young adult, executive function, control group, individual difference
Cluster 3: covid vaccine, covid vaccination, vaccination intention, vaccine uptake, covid vaccination intention, vaccine hesitancy, information need, covid vaccine uptake, fear covid, vaccine information


In [19]:
###########  BEST so far

# Seed the NumPy, stdlib and torch RNGs before fitting.
np.random.seed(40)
random.seed(40)
torch.manual_seed(40)

# Take documents and labels from the SAME filtered rows: dropping missing
# abstracts from only one of the two lists would leave them misaligned
# (or of different lengths).
labeled_docs = df4[['Abstract_join', 'Cluster5']].dropna(subset=['Abstract_join'])
abstracts = labeled_docs['Abstract_join'].tolist()
cluster_labels = labeled_docs['Cluster5'].astype(int).tolist()

# BERTopic over 2- to 4-gram counts with class-based TF-IDF term weighting.
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit with the cluster labels passed as `y`, then merge down to 6 topics
# (one more than the number of clusters — presumably to leave room for
# BERTopic's outlier topic; confirm).
# NOTE(review): the ids in get_topics() are BERTopic's own topic ids and are
# not guaranteed to coincide with the predefined cluster ids — confirm the
# correspondence before reading the "Cluster N" lines as cluster descriptions.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=6)

# Top terms per topic id.
topic_summary = bertopic_model.get_topics()

# Print the first ten terms for every cluster id that is also a topic id.
for cluster in sorted(df4['Cluster5'].unique()):
    if cluster in topic_summary:
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, college student, covid pandemic, structural equation, social support, datum collect, physical activity, social medium, mediate relationship, mediating role
Cluster 1: old adult, facial expression, work memory, emotion recognition, executive function, significant difference, young adult, second language, control group, cognitive function
Cluster 2: video game, school bullying, cyberbullye perpetration, moral disengagement, cyberbullye victimization, school student, bully victimization, social medium, prosocial behavior, mental health
Cluster 3: academic procrastination, video addiction, short video addiction, physical activity, short video, work procrastination, time management, college student, procrastination behavior, procrastination scale
Cluster 4: covid vaccine, covid vaccination, vaccination intention, vaccine uptake, covid vaccination intention, vaccine hesitancy, information need, covid vaccine uptake, fear covid, vaccine information


In [20]:
###########  BEST so far

# Seed the NumPy, stdlib and torch RNGs before fitting.
np.random.seed(40)
random.seed(40)
torch.manual_seed(40)

# Take documents and labels from the SAME filtered rows: dropping missing
# abstracts from only one of the two lists would leave them misaligned
# (or of different lengths).
labeled_docs = df4[['Abstract_join', 'Cluster9']].dropna(subset=['Abstract_join'])
abstracts = labeled_docs['Abstract_join'].tolist()
cluster_labels = labeled_docs['Cluster9'].astype(int).tolist()

# BERTopic over 2- to 4-gram counts with class-based TF-IDF term weighting.
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit with the cluster labels passed as `y`, then merge down to 10 topics
# (one more than the number of clusters — presumably to leave room for
# BERTopic's outlier topic; confirm).
# NOTE(review): the ids in get_topics() are BERTopic's own topic ids and are
# not guaranteed to coincide with the predefined cluster ids — confirm the
# correspondence before reading the "Cluster N" lines as cluster descriptions.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=10)

# Top terms per topic id.
topic_summary = bertopic_model.get_topics()

# Print the first ten terms for every cluster id that is also a topic id.
for cluster in sorted(df4['Cluster9'].unique()):
    if cluster in topic_summary:
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, social support, physical activity, college student, covid pandemic, depressive symptom, quality life, social medium, sleep quality, anxiety depression
Cluster 1: structural equation, work engagement, college student, job satisfaction, datum collect, covid pandemic, emotional intelligence, foreign language, mediate relationship, equation modeling
Cluster 2: old adult, facial expression, emotion recognition, work memory, significant difference, second language, executive function, sign language, young adult, control group
Cluster 3: entrepreneurial intention, purchase intention, structural equation, social medium, college student, entrepreneurship education, equation modeling, significant positive, structural equation modeling, datum collect
Cluster 4: music education, music performance, music listen, music training, music therapy, music teacher, performance anxiety, flow state, public performance, musical instrument
Cluster 5: academic procrastination, video ad

In [21]:
###########  BEST so far

# Seed the NumPy, stdlib and torch RNGs before fitting.
np.random.seed(40)
random.seed(40)
torch.manual_seed(40)

# Take documents and labels from the SAME filtered rows: dropping missing
# abstracts from only one of the two lists would leave them misaligned
# (or of different lengths).
labeled_docs = df4[['Abstract_join', 'Cluster10']].dropna(subset=['Abstract_join'])
abstracts = labeled_docs['Abstract_join'].tolist()
cluster_labels = labeled_docs['Cluster10'].astype(int).tolist()

# BERTopic over 2- to 4-gram counts with class-based TF-IDF term weighting.
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit with the cluster labels passed as `y`, then merge down to 11 topics
# (one more than the number of clusters — presumably to leave room for
# BERTopic's outlier topic; confirm).
# NOTE(review): the ids in get_topics() are BERTopic's own topic ids and are
# not guaranteed to coincide with the predefined cluster ids — confirm the
# correspondence before reading the "Cluster N" lines as cluster descriptions.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=11)

# Top terms per topic id.
topic_summary = bertopic_model.get_topics()

# Print the first ten terms for every cluster id that is also a topic id.
for cluster in sorted(df4['Cluster10'].unique()):
    if cluster in topic_summary:
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, college student, covid pandemic, social support, physical activity, depressive symptom, structural equation, university student, mediate relationship, quality life
Cluster 1: old adult, facial expression, emotion recognition, second language, work memory, significant difference, young adult, executive function, child language, heritage language
Cluster 2: purchase intention, structural equation, green innovation, consumer purchase, perceive value, social medium, equation modeling, consumer purchase intention, structural equation modeling, short video
Cluster 3: entrepreneurial intention, entrepreneurship education, college student, student entrepreneurial, entrepreneurial performance, entrepreneurial behavior, entrepreneurial selfefficacy, entrepreneurial orientation, psychological capital, innovation entrepreneurship
Cluster 4: social medium, gender stereotype, fake news, sexual orientation, sexual minority, prosocial behavior, social norm, autonomous vehicle