In [1]:
###' ################################################################################
###'
###' IMPORT LIBRARIES
###'
###'

import numpy as np
import pandas as pd
from pathlib import Path
import random
import torch
from keybert import KeyBERT

import ast
import umap.umap_ as umap
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
from sklearn.manifold import SpectralEmbedding

### LDA
import gensim
import gensim.corpora as corpora
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Phrases

### BERT
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer




In [2]:
# Load dataset
# Reads the four per-group clustering CSVs into df1a..df4a (one dataframe per file,
# in the order listed in `embedding_files`).
# NOTE(review): absolute, machine-specific path — consider making it configurable.
data_dir = Path(r"C:\Users\Hyemi\Python\TopicModeling\Data")
data_dir.mkdir(parents=True, exist_ok=True)

embedding_files = [
    "articles_4_clustering1.csv",
    "articles_4_clustering2.csv",
    "articles_4_clustering3.csv",
    "articles_4_clustering4.csv"
]

embedding_file_paths = [data_dir / file for file in embedding_files]

# Fail fast when any input is missing. Silently skipping a file would shift the
# positions in `embedding_dfs`, so df2a..df4a could be bound to the wrong group's
# data (or the unpacking below would fail with an unrelated error).
missing_files = [str(path) for path in embedding_file_paths if not path.exists()]
if missing_files:
    raise FileNotFoundError(f"Missing embedding file(s): {', '.join(missing_files)}")

embedding_dfs = [pd.read_csv(file_path) for file_path in embedding_file_paths]

# One dataframe per group, in file order.
df1a, df2a, df3a, df4a = embedding_dfs

In [3]:
# Display df3a (bound to embedding_dfs[2] in the loading cell) to inspect its columns.
df3a

Unnamed: 0,Year,Abstract,Abstract_tokens,Abstract_join,Year_Group,Embeddings,Embeddings_S,Cluster3,Cluster4,Cluster5,Cluster8,Cluster9
0,2020,We describe a corpus of speech taking place be...,"['describe', 'corpus', 'speech', 'place', 'kor...",describe corpus speech place korean mother chi...,3,"[1.3425672054290771, 1.4400174617767334, 1.108...","[0.34810734772791113, 0.4677801044194296, 0.57...",0,0,3,7,3
1,2020,Although compassion in healthcare differs in i...,"['compassion', 'healthcare', 'differ', 'import...",compassion healthcare differ important way com...,3,"[1.5338462591171265, 1.530231237411499, 0.9366...","[0.7735476754291247, 0.8830654594875341, -0.47...",0,0,3,7,3
2,2020,This study explored the effect of learning str...,"['explore', 'learn', 'strategy', 'student', 'o...",explore learn strategy student organization ca...,3,"[1.4609277248382568, 1.5704612731933594, 0.914...","[0.6113632411120875, 1.0682582925833488, -0.60...",0,0,3,7,3
3,2020,"Using the Grounded theory, we took Chinese ent...","['ground', 'theory', 'chinese', 'entrepreneur'...",ground theory chinese entrepreneur object cons...,3,"[1.3701508045196533, 1.3691604137420654, 0.899...","[0.4094584220353579, 0.1416004922137856, -0.69...",0,0,4,4,4
4,2020,"With the outbreak of the COVID- crisis, the pu...","['outbreak', 'covid', 'crisis', 'public', 'get...",outbreak covid crisis public getting epidemicr...,3,"[1.3461201190948486, 1.4608646631240845, 0.855...","[0.3560096910588494, 0.563747015822865, -0.964...",0,1,0,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...
7097,2021,BackgroundThe literature shows the negative ps...,"['backgroundthe', 'literature', 'negative', 'p...",backgroundthe literature negative psychologica...,3,"[1.1762337684631348, 1.5237650871276855, 0.911...","[-0.02184927097153048, 0.853299522992328, -0.6...",0,1,0,3,1
7098,2021,This study investigated Chinese university stu...,"['investigate', 'chinese', 'university', 'stud...",investigate chinese university student technol...,3,"[0.7362575531005859, 1.1006767749786377, 0.979...","[-1.0004385186465743, -1.0943229847643705, -0....",0,1,0,3,1
7099,2021,People often use concrete spatial terms to rep...,"['people', 'concrete', 'spatial', 'term', 'rep...",people concrete spatial term represent time me...,3,"[1.3095670938491821, 1.082215428352356, 0.9239...","[0.2747089383671882, -1.1793069778938658, -0.5...",0,1,0,2,2
7100,2021,This study departs from existing work on board...,"['depart', 'exist', 'work', 'board', 'gender',...",depart exist work board gender diversity corpo...,3,"[1.3322162628173828, 1.436128854751587, 0.8619...","[0.3250849188114781, 0.4498794951411282, -0.92...",0,1,0,3,1


## Group 1

In [4]:
# Group 1: bind the working dataframe and print the size of every cluster
# for each precomputed clustering solution.
df1 = df1a
for cluster_col in ('Cluster3', 'Cluster4', 'Cluster5'):
    print(df1[cluster_col].value_counts())

Cluster3
0    3101
1     862
2     432
Name: count, dtype: int64
Cluster4
0    2115
3     986
1     862
2     432
Name: count, dtype: int64
Cluster5
0    1650
3     986
4     862
1     465
2     432
Name: count, dtype: int64


In [5]:
###########  BEST so far
# Group 1 / `Cluster3`: extract 2-4 word key phrases with BERTopic, passing the
# precomputed cluster labels as the `y` target of fit_transform.

np.random.seed(40)
random.seed(40)
torch.manual_seed(40)

# Drop rows with a missing abstract from the documents AND the labels together.
# Dropping them from the abstracts alone would leave `y` longer than, and
# misaligned with, the document list.
labeled = df1.dropna(subset=['Abstract_join'])
abstracts = labeled['Abstract_join'].tolist()
cluster_labels = labeled['Cluster3'].astype(int).tolist()

# Apply BERTopic with Predefined Cluster Labels
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit BERTopic using predefined cluster labels, then reduce to (number of clusters + 1)
# topics — presumably leaving room for BERTopic's outlier topic; TODO confirm.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=4)
# `topics` is the pre-reduction assignment; the reduced one is bertopic_model.topics_.

# Get Top Words per Cluster
topic_summary = bertopic_model.get_topics()

# Print the top-10 phrases for every cluster label that is also a topic id.
# NOTE(review): `topic_summary` is keyed by BERTopic topic ids, which are not
# guaranteed to equal the `Cluster3` labels supplied via `y` — cross-tabulate
# bertopic_model.topics_ against cluster_labels before reading these lines as
# per-cluster keywords.
for cluster in sorted(set(cluster_labels)):
    if cluster in topic_summary:  # Ensure the cluster exists in the BERTopic output
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: facial expression, individual difference, positive negative, old adult, control group, work memory, prosocial behavior, emotion regulation, mental health, negative emotion
Cluster 1: work memory, executive function, control group, individual difference, old adult, working memory, speech perception, reaction time, second language, young adult
Cluster 2: gender difference, spatial ability, stem career, stem field, engineering mathematic, woman engineering, gender gap, science technology, science technology engineering, science technology engineering mathematic


In [22]:
###########  BEST so far
# Group 1 / `Cluster4`: extract 2-4 word key phrases with BERTopic, passing the
# precomputed cluster labels as the `y` target of fit_transform.

np.random.seed(42)
random.seed(42)
torch.manual_seed(42)

# Drop rows with a missing abstract from the documents AND the labels together.
# Dropping them from the abstracts alone would leave `y` longer than, and
# misaligned with, the document list.
labeled = df1.dropna(subset=['Abstract_join'])
abstracts = labeled['Abstract_join'].tolist()
cluster_labels = labeled['Cluster4'].astype(int).tolist()

# Apply BERTopic with Predefined Cluster Labels
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit BERTopic using predefined cluster labels, then reduce to (number of clusters + 1)
# topics — presumably leaving room for BERTopic's outlier topic; TODO confirm.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=5)
# `topics` is the pre-reduction assignment; the reduced one is bertopic_model.topics_.

# Get Top Words per Cluster
topic_summary = bertopic_model.get_topics()

# Print the top-10 phrases for every cluster label that is also a topic id.
# NOTE(review): `topic_summary` is keyed by BERTopic topic ids, which are not
# guaranteed to equal the `Cluster4` labels supplied via `y` — cross-tabulate
# bertopic_model.topics_ against cluster_labels before reading these lines as
# per-cluster keywords.
for cluster in sorted(set(cluster_labels)):
    if cluster in topic_summary:  # Ensure the cluster exists in the BERTopic output
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: facial expression, old adult, healthy control, individual difference, control group, experiment participant, reaction time, emotional expression, spatial frequency, work memory
Cluster 1: speech perception, work memory, second language, auditory visual, native speaker, native language, speech sound, target word, individual difference, provide evidence
Cluster 2: young child, prosocial behavior, executive function, work memory, control group, child adult, depressive symptom, individual difference, group child, mother child
Cluster 3: physical activity, job satisfaction, psychological wellbeing, life satisfaction, academic achievement, structural equation, social support, confirmatory factor, individual difference, gender difference


In [None]:
###########  BEST so far
# Group 1 / `Cluster5`: extract 2-4 word key phrases with BERTopic, passing the
# precomputed cluster labels as the `y` target of fit_transform.

np.random.seed(42)
random.seed(42)
torch.manual_seed(42)

# Drop rows with a missing abstract from the documents AND the labels together.
# Dropping them from the abstracts alone would leave `y` longer than, and
# misaligned with, the document list.
labeled = df1.dropna(subset=['Abstract_join'])
abstracts = labeled['Abstract_join'].tolist()
cluster_labels = labeled['Cluster5'].astype(int).tolist()

# Apply BERTopic with Predefined Cluster Labels
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit BERTopic using predefined cluster labels, then reduce to (number of clusters + 1)
# topics — presumably leaving room for BERTopic's outlier topic; TODO confirm.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=6)
# `topics` is the pre-reduction assignment; the reduced one is bertopic_model.topics_.

# Get Top Words per Cluster
topic_summary = bertopic_model.get_topics()

# Print the top-10 phrases for every cluster label that is also a topic id.
# NOTE(review): `topic_summary` is keyed by BERTopic topic ids, which are not
# guaranteed to equal the `Cluster5` labels supplied via `y` — cross-tabulate
# bertopic_model.topics_ against cluster_labels before reading these lines as
# per-cluster keywords.
for cluster in sorted(set(cluster_labels)):
    if cluster in topic_summary:  # Ensure the cluster exists in the BERTopic output
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

## Group 2

In [8]:
# Group 2: bind the working dataframe and print the size of every cluster
# for each precomputed clustering solution.
df2 = df2a
for cluster_col in ('Cluster3', 'Cluster4', 'Cluster5'):
    print(df2[cluster_col].value_counts())

Cluster3
0    3387
1    1432
2     945
Name: count, dtype: int64
Cluster4
0    2229
1    1432
3    1158
2     945
Name: count, dtype: int64
Cluster5
3    1595
0    1432
1    1158
2     945
4     634
Name: count, dtype: int64


In [9]:
###########  BEST so far
# Group 2 / `Cluster3`: extract 2-4 word key phrases with BERTopic, passing the
# precomputed cluster labels as the `y` target of fit_transform.

np.random.seed(42)
random.seed(42)
torch.manual_seed(42)

# Drop rows with a missing abstract from the documents AND the labels together.
# Dropping them from the abstracts alone would leave `y` longer than, and
# misaligned with, the document list.
labeled = df2.dropna(subset=['Abstract_join'])
abstracts = labeled['Abstract_join'].tolist()
cluster_labels = labeled['Cluster3'].astype(int).tolist()

# Apply BERTopic with Predefined Cluster Labels
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit BERTopic using predefined cluster labels, then reduce to (number of clusters + 1)
# topics — presumably leaving room for BERTopic's outlier topic; TODO confirm.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=4)
# `topics` is the pre-reduction assignment; the reduced one is bertopic_model.topics_.

# Get Top Words per Cluster
topic_summary = bertopic_model.get_topics()

# Print the top-10 phrases for every cluster label that is also a topic id.
# NOTE(review): `topic_summary` is keyed by BERTopic topic ids, which are not
# guaranteed to equal the `Cluster3` labels supplied via `y` — cross-tabulate
# bertopic_model.topics_ against cluster_labels before reading these lines as
# per-cluster keywords.
for cluster in sorted(set(cluster_labels)):
    if cluster in topic_summary:  # Ensure the cluster exists in the BERTopic output
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, work memory, old adult, control group, individual difference, executive function, structural equation, facial expression, positive negative, confirmatory factor
Cluster 1: entrepreneurial intention, entrepreneurship education, entrepreneurial education, entrepreneurial follower, entrepreneurial orientation, entrepreneurial passion, dark triad, university student, student entrepreneurial, entrepreneurial selfefficacy
Cluster 2: academic procrastination, utility value, bedtime procrastination, distal utility, distal utility value, team procrastination, procrastination scale, effort cost, peer attachment, selfregulatory resource


In [10]:
###########  BEST so far
# Group 2 / `Cluster4`: extract 2-4 word key phrases with BERTopic, passing the
# precomputed cluster labels as the `y` target of fit_transform.

np.random.seed(42)
random.seed(42)
torch.manual_seed(42)

# Drop rows with a missing abstract from the documents AND the labels together.
# Dropping them from the abstracts alone would leave `y` longer than, and
# misaligned with, the document list.
labeled = df2.dropna(subset=['Abstract_join'])
abstracts = labeled['Abstract_join'].tolist()
cluster_labels = labeled['Cluster4'].astype(int).tolist()

# Apply BERTopic with Predefined Cluster Labels
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit BERTopic using predefined cluster labels, then reduce to (number of clusters + 1)
# topics — presumably leaving room for BERTopic's outlier topic; TODO confirm.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=5)
# `topics` is the pre-reduction assignment; the reduced one is bertopic_model.topics_.

# Get Top Words per Cluster
topic_summary = bertopic_model.get_topics()

# Print the top-10 phrases for every cluster label that is also a topic id.
# NOTE(review): `topic_summary` is keyed by BERTopic topic ids, which are not
# guaranteed to equal the `Cluster4` labels supplied via `y` — cross-tabulate
# bertopic_model.topics_ against cluster_labels before reading these lines as
# per-cluster keywords.
for cluster in sorted(set(cluster_labels)):
    if cluster in topic_summary:  # Ensure the cluster exists in the BERTopic output
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, control group, work memory, facial expression, individual difference, structural equation, positive negative, executive function, old adult, confirmatory factor
Cluster 1: entrepreneurial intention, entrepreneurship education, entrepreneurial follower, entrepreneurial education, entrepreneurial orientation, entrepreneurial passion, university student, dark triad, entrepreneurial selfefficacy, student entrepreneurial
Cluster 2: pet dog, dog owner, ot avp, domestic dog, behavior dog, oxtr gene, fear response, wolf dog, assistance dog, avp dog
Cluster 3: mobile advertising, intention purchase, purchase intention, design perception, plot story, price information, mobile shopping, consumer purchase, supplementary product, brand extension


In [11]:
###########  BEST so far
# Group 2 / `Cluster5`: extract 2-4 word key phrases with BERTopic, passing the
# precomputed cluster labels as the `y` target of fit_transform.

np.random.seed(42)
random.seed(42)
torch.manual_seed(42)

# Drop rows with a missing abstract from the documents AND the labels together.
# Dropping them from the abstracts alone would leave `y` longer than, and
# misaligned with, the document list.
labeled = df2.dropna(subset=['Abstract_join'])
abstracts = labeled['Abstract_join'].tolist()
cluster_labels = labeled['Cluster5'].astype(int).tolist()

# Apply BERTopic with Predefined Cluster Labels
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit BERTopic using predefined cluster labels, then reduce to (number of clusters + 1)
# topics — presumably leaving room for BERTopic's outlier topic; TODO confirm.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=6)
# `topics` is the pre-reduction assignment; the reduced one is bertopic_model.topics_.

# Get Top Words per Cluster
topic_summary = bertopic_model.get_topics()

# Print the top-10 phrases for every cluster label that is also a topic id.
# NOTE(review): `topic_summary` is keyed by BERTopic topic ids, which are not
# guaranteed to equal the `Cluster5` labels supplied via `y` — cross-tabulate
# bertopic_model.topics_ against cluster_labels before reading these lines as
# per-cluster keywords.
for cluster in sorted(set(cluster_labels)):
    if cluster in topic_summary:  # Ensure the cluster exists in the BERTopic output
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, depressive symptom, facial expression, positive negative, confirmatory factor, personality trait, social support, individual difference, psychometric property, man woman
Cluster 1: work memory, old adult, executive function, working memory, child asd, reaction time, control group, young adult, typically develop, cognitive control
Cluster 2: job satisfaction, structural equation, practical implication, mediate relationship, datum collect, work engagement, physical activity, transformational leadership, job demand, positively relate
Cluster 3: mobile advertising, intention purchase, purchase intention, design perception, plot story, internet slang, price information, mobile shopping, consumer purchase, supplementary product
Cluster 4: domestic dog, ot avp, comt valmet, avp dog, social behavior, behavior dog, dog owner, pet dog, gaze behavior, japanese dog


## Group 3

In [None]:
# Group 3: bind the working dataframe and print the size of every cluster
# for each precomputed clustering solution.
df3 = df3a
for cluster_col in ('Cluster4', 'Cluster5', 'Cluster8', 'Cluster9'):
    print(df3[cluster_col].value_counts())

In [13]:
# Group 3 / `Cluster4`: extract 2-4 word key phrases with BERTopic, passing the
# precomputed cluster labels as the `y` target of fit_transform.
np.random.seed(40)
random.seed(40)
torch.manual_seed(40)

# Drop rows with a missing abstract from the documents AND the labels together.
# Dropping them from the abstracts alone would leave `y` longer than, and
# misaligned with, the document list.
labeled = df3.dropna(subset=['Abstract_join'])
abstracts = labeled['Abstract_join'].tolist()
cluster_labels = labeled['Cluster4'].astype(int).tolist()

# Apply BERTopic with Predefined Cluster Labels
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit BERTopic using predefined cluster labels, then reduce to (number of clusters + 1)
# topics — presumably leaving room for BERTopic's outlier topic; TODO confirm.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=5)
# `topics` is the pre-reduction assignment; the reduced one is bertopic_model.topics_.

# Get Top Words per Cluster
topic_summary = bertopic_model.get_topics()

# Print the top-10 phrases for every cluster label that is also a topic id.
# NOTE(review): `topic_summary` is keyed by BERTopic topic ids, which are not
# guaranteed to equal the `Cluster4` labels supplied via `y` — cross-tabulate
# bertopic_model.topics_ against cluster_labels before reading these lines as
# per-cluster keywords.
for cluster in sorted(set(cluster_labels)):
    if cluster in topic_summary:  # Ensure the cluster exists in the BERTopic output
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, covid pandemic, physical activity, college student, structural equation, social support, datum collect, university student, work engagement, mediate relationship
Cluster 1: native speaker, old adult, second language, work memory, cognitive control, control group, reaction time, executive function, read comprehension, phonological awareness
Cluster 2: soccer player, football player, elite athlete, mental health, physical activity, futsal player, sport performance, significant difference, covid pandemic, goal motive
Cluster 3: sample size, cognitive diagnostic, item response, cognitive diagnosis, monte carlo, parameter estimate, classification accuracy, item parameter, cognitive diagnosis model, diagnosis model


In [14]:
# Group 3 / `Cluster5`: extract 2-4 word key phrases with BERTopic, passing the
# precomputed cluster labels as the `y` target of fit_transform.
np.random.seed(40)
random.seed(40)
torch.manual_seed(40)

# Drop rows with a missing abstract from the documents AND the labels together.
# Dropping them from the abstracts alone would leave `y` longer than, and
# misaligned with, the document list.
labeled = df3.dropna(subset=['Abstract_join'])
abstracts = labeled['Abstract_join'].tolist()
cluster_labels = labeled['Cluster5'].astype(int).tolist()

# Apply BERTopic with Predefined Cluster Labels
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit BERTopic using predefined cluster labels, then reduce to (number of clusters + 1)
# topics — presumably leaving room for BERTopic's outlier topic; TODO confirm.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=6)
# `topics` is the pre-reduction assignment; the reduced one is bertopic_model.topics_.

# Get Top Words per Cluster
topic_summary = bertopic_model.get_topics()

# Print the top-10 phrases for every cluster label that is also a topic id.
# NOTE(review): `topic_summary` is keyed by BERTopic topic ids, which are not
# guaranteed to equal the `Cluster5` labels supplied via `y` — cross-tabulate
# bertopic_model.topics_ against cluster_labels before reading these lines as
# per-cluster keywords.
for cluster in sorted(set(cluster_labels)):
    if cluster in topic_summary:  # Ensure the cluster exists in the BERTopic output
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, covid pandemic, physical activity, college student, structural equation, social support, university student, datum collect, work engagement, depressive symptom
Cluster 1: purchase intention, old adult, facial expression, native speaker, social medium, second language, control group, climate change, cognitive control, eye movement
Cluster 2: soccer player, football player, physical activity, elite athlete, mental health, sport performance, significant difference, futsal player, covid pandemic, goal motive
Cluster 3: sample size, cognitive diagnostic, cognitive diagnosis, monte carlo, item response, parameter estimate, classification accuracy, item parameter, cognitive diagnosis model, diagnosis model
Cluster 4: tourism experience, tourism development, poverty alleviation, tourism product, tourism industry, visually impair, tourist satisfaction, sustainable tourism, place attachment, cultural heritage


In [15]:
# Group 3 / `Cluster8`: extract 2-4 word key phrases with BERTopic, passing the
# precomputed cluster labels as the `y` target of fit_transform.
np.random.seed(40)
random.seed(40)
torch.manual_seed(40)

# Drop rows with a missing abstract from the documents AND the labels together.
# Dropping them from the abstracts alone would leave `y` longer than, and
# misaligned with, the document list.
labeled = df3.dropna(subset=['Abstract_join'])
abstracts = labeled['Abstract_join'].tolist()
cluster_labels = labeled['Cluster8'].astype(int).tolist()

# Apply BERTopic with Predefined Cluster Labels
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit BERTopic using predefined cluster labels, then reduce to (number of clusters + 1)
# topics — presumably leaving room for BERTopic's outlier topic; TODO confirm.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=9)
# `topics` is the pre-reduction assignment; the reduced one is bertopic_model.topics_.

# Get Top Words per Cluster
topic_summary = bertopic_model.get_topics()

# Print the top-10 phrases for every cluster label that is also a topic id.
# NOTE(review): `topic_summary` is keyed by BERTopic topic ids, which are not
# guaranteed to equal the `Cluster8` labels supplied via `y` — cross-tabulate
# bertopic_model.topics_ against cluster_labels before reading these lines as
# per-cluster keywords.
for cluster in sorted(set(cluster_labels)):
    if cluster in topic_summary:  # Ensure the cluster exists in the BERTopic output
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, covid pandemic, physical activity, structural equation, social support, college student, university student, control group, negative emotion, depressive symptom
Cluster 1: native speaker, phonological awareness, second language, read comprehension, word recognition, target word, phonological processing, word order, clause chain, language comprehension
Cluster 2: soccer player, football player, sport performance, elite athlete, mental health, goal motive, physical activity, futsal player, team sport, significant difference
Cluster 3: music student, music performance, acoustic environment, music listen, musical instrument, musical experience, old adult, performance anxiety, music therapy, instrumental music
Cluster 4: entrepreneurship education, entrepreneurial intention, college student, innovation entrepreneurship, student entrepreneurial, entrepreneurial selfefficacy, new venture, psychological capital, entrepreneurial psychology, innovation behavior
Cluster 

In [16]:
# Group 3 / `Cluster9`: extract 2-4 word key phrases with BERTopic, passing the
# precomputed cluster labels as the `y` target of fit_transform.
np.random.seed(40)
random.seed(40)
torch.manual_seed(40)

# Drop rows with a missing abstract from the documents AND the labels together.
# Dropping them from the abstracts alone would leave `y` longer than, and
# misaligned with, the document list.
labeled = df3.dropna(subset=['Abstract_join'])
abstracts = labeled['Abstract_join'].tolist()
cluster_labels = labeled['Cluster9'].astype(int).tolist()

# Apply BERTopic with Predefined Cluster Labels
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit BERTopic using predefined cluster labels, then reduce to (number of clusters + 1)
# topics — presumably leaving room for BERTopic's outlier topic; TODO confirm.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=10)
# `topics` is the pre-reduction assignment; the reduced one is bertopic_model.topics_.

# Get Top Words per Cluster
topic_summary = bertopic_model.get_topics()

# Print the top-10 phrases for every cluster label that is also a topic id.
# NOTE(review): `topic_summary` is keyed by BERTopic topic ids, which are not
# guaranteed to equal the `Cluster9` labels supplied via `y` — cross-tabulate
# bertopic_model.topics_ against cluster_labels before reading these lines as
# per-cluster keywords.
for cluster in sorted(set(cluster_labels)):
    if cluster in topic_summary:  # Ensure the cluster exists in the BERTopic output
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, covid pandemic, physical activity, social support, depression anxiety, anxiety depression, depressive symptom, online survey, psychological wellbeing, university student
Cluster 1: mental health, executive function, child adolescent, parent child, covid pandemic, young child, emotion regulation, college student, structural equation, control group
Cluster 2: entrepreneurship education, entrepreneurial intention, college student, structural equation, datum collect, moderate relationship, work engagement, psychological capital, practical implication, purchase intention
Cluster 3: facial expression, native speaker, second language, work memory, visual attention, eye movement, individual difference, old adult, control group, emotion recognition
Cluster 4: soccer player, elite athlete, football player, sport performance, futsal player, mental health, significant difference, physical activity, team sport, covid pandemic
Cluster 5: music student, music performance, ac

## Group 4

In [17]:
# Group 4: bind the working dataframe and print the size of every cluster
# for each precomputed clustering solution.
df4 = df4a
for cluster_col in ('Cluster4', 'Cluster5', 'Cluster9', 'Cluster10'):
    print(df4[cluster_col].value_counts())

Cluster4
0    3407
2    3265
3    3042
1    2461
Name: count, dtype: int64
Cluster5
0    3265
3    3042
1    2461
2    2057
4    1350
Name: count, dtype: int64
Cluster9
0    3042
2    2193
5    1972
4    1350
1    1036
3    1021
8     647
7     646
6     268
Name: count, dtype: int64
Cluster10
3    2266
2    2193
5    1972
9    1350
0    1036
1    1021
4     776
8     647
7     646
6     268
Name: count, dtype: int64


In [18]:
###########  BEST so far
# Group 4 / `Cluster4`: extract 2-4 word key phrases with BERTopic, passing the
# precomputed cluster labels as the `y` target of fit_transform.

np.random.seed(40)
random.seed(40)
torch.manual_seed(40)

# Drop rows with a missing abstract from the documents AND the labels together.
# Dropping them from the abstracts alone would leave `y` longer than, and
# misaligned with, the document list.
labeled = df4.dropna(subset=['Abstract_join'])
abstracts = labeled['Abstract_join'].tolist()
cluster_labels = labeled['Cluster4'].astype(int).tolist()

# Apply BERTopic with Predefined Cluster Labels
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit BERTopic using predefined cluster labels, then reduce to (number of clusters + 1)
# topics — presumably leaving room for BERTopic's outlier topic; TODO confirm.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=5)
# `topics` is the pre-reduction assignment; the reduced one is bertopic_model.topics_.

# Get Top Words per Cluster
topic_summary = bertopic_model.get_topics()

# Print the top-10 phrases for every cluster label that is also a topic id.
# NOTE(review): `topic_summary` is keyed by BERTopic topic ids, which are not
# guaranteed to equal the `Cluster4` labels supplied via `y` — cross-tabulate
# bertopic_model.topics_ against cluster_labels before reading these lines as
# per-cluster keywords.
for cluster in sorted(set(cluster_labels)):
    if cluster in topic_summary:  # Ensure the cluster exists in the BERTopic output
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, college student, covid pandemic, social support, physical activity, structural equation, university student, datum collect, control group, mediate relationship
Cluster 1: entrepreneurial intention, purchase intention, structural equation, entrepreneurship education, college student, innovation performance, equation modeling, consumer purchase, significant positive, datum collect
Cluster 2: covid vaccine, covid vaccination, vaccination intention, vaccine uptake, vaccine hesitancy, covid vaccination intention, information need, covid vaccine uptake, fear covid, vaccine information
Cluster 3: psychedelic experience, attribution consciousness, archetype symbol, time travel, increase attribution, time travel past, category comprise, increase attribution consciousness, neardeath experience, travel past


In [19]:
###########  BEST so far
# Group 4 / `Cluster5`: extract 2-4 word key phrases with BERTopic, passing the
# precomputed cluster labels as the `y` target of fit_transform.

np.random.seed(40)
random.seed(40)
torch.manual_seed(40)

# Drop rows with a missing abstract from the documents AND the labels together.
# Dropping them from the abstracts alone would leave `y` longer than, and
# misaligned with, the document list.
labeled = df4.dropna(subset=['Abstract_join'])
abstracts = labeled['Abstract_join'].tolist()
cluster_labels = labeled['Cluster5'].astype(int).tolist()

# Apply BERTopic with Predefined Cluster Labels
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit BERTopic using predefined cluster labels, then reduce to (number of clusters + 1)
# topics — presumably leaving room for BERTopic's outlier topic; TODO confirm.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=6)
# `topics` is the pre-reduction assignment; the reduced one is bertopic_model.topics_.

# Get Top Words per Cluster
topic_summary = bertopic_model.get_topics()

# Print the top-10 phrases for every cluster label that is also a topic id.
# NOTE(review): `topic_summary` is keyed by BERTopic topic ids, which are not
# guaranteed to equal the `Cluster5` labels supplied via `y` — cross-tabulate
# bertopic_model.topics_ against cluster_labels before reading these lines as
# per-cluster keywords.
for cluster in sorted(set(cluster_labels)):
    if cluster in topic_summary:  # Ensure the cluster exists in the BERTopic output
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, college student, covid pandemic, structural equation, social support, datum collect, social medium, physical activity, mediate relationship, university student
Cluster 1: old adult, facial expression, emotion recognition, work memory, executive function, significant difference, young adult, second language, reaction time, control group
Cluster 2: elite athlete, soccer player, mental toughness, mental health, athlete burnout, athlete leadership, sport performance, significant difference, team sport, match performance
Cluster 3: academic procrastination, video addiction, short video addiction, physical activity, short video, work procrastination, time management, college student, procrastination behavior, procrastination scale
Cluster 4: covid vaccine, covid vaccination, vaccination intention, vaccine uptake, covid vaccination intention, vaccine hesitancy, information need, covid vaccine uptake, fear covid, vaccine information


In [20]:
###########  BEST so far
# Group 4 / `Cluster9`: extract 2-4 word key phrases with BERTopic, passing the
# precomputed cluster labels as the `y` target of fit_transform.

np.random.seed(40)
random.seed(40)
torch.manual_seed(40)

# Drop rows with a missing abstract from the documents AND the labels together.
# Dropping them from the abstracts alone would leave `y` longer than, and
# misaligned with, the document list.
labeled = df4.dropna(subset=['Abstract_join'])
abstracts = labeled['Abstract_join'].tolist()
cluster_labels = labeled['Cluster9'].astype(int).tolist()

# Apply BERTopic with Predefined Cluster Labels
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit BERTopic using predefined cluster labels, then reduce to (number of clusters + 1)
# topics — presumably leaving room for BERTopic's outlier topic; TODO confirm.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=10)
# `topics` is the pre-reduction assignment; the reduced one is bertopic_model.topics_.

# Get Top Words per Cluster
topic_summary = bertopic_model.get_topics()

# Print the top-10 phrases for every cluster label that is also a topic id.
# NOTE(review): `topic_summary` is keyed by BERTopic topic ids, which are not
# guaranteed to equal the `Cluster9` labels supplied via `y` — cross-tabulate
# bertopic_model.topics_ against cluster_labels before reading these lines as
# per-cluster keywords.
for cluster in sorted(set(cluster_labels)):
    if cluster in topic_summary:  # Ensure the cluster exists in the BERTopic output
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, social support, college student, physical activity, depressive symptom, covid pandemic, quality life, sleep quality, social medium, anxiety depression
Cluster 1: entrepreneurial intention, structural equation, purchase intention, social medium, datum collect, practical implication, equation modeling, moderate relationship, moderate role, mediate relationship
Cluster 2: old adult, facial expression, emotion recognition, executive function, work memory, second language, significant difference, young adult, control group, sign language
Cluster 3: foreign language, college student, online learning, university student, structural equation, academic performance, efl teacher, preservice teacher, student learn, english foreign
Cluster 4: music education, music performance, music listen, music training, music therapy, music teacher, performance anxiety, mental health, music performance anxiety, public performance
Cluster 5: athlete burnout, mental toughness, soccer pla

In [21]:
###########  BEST so far
# Group 4 / `Cluster10`: extract 2-4 word key phrases with BERTopic, passing the
# precomputed cluster labels as the `y` target of fit_transform.

np.random.seed(40)
random.seed(40)
torch.manual_seed(40)

# Drop rows with a missing abstract from the documents AND the labels together.
# Dropping them from the abstracts alone would leave `y` longer than, and
# misaligned with, the document list.
labeled = df4.dropna(subset=['Abstract_join'])
abstracts = labeled['Abstract_join'].tolist()
cluster_labels = labeled['Cluster10'].astype(int).tolist()

# Apply BERTopic with Predefined Cluster Labels
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit BERTopic using predefined cluster labels, then reduce to (number of clusters + 1)
# topics — presumably leaving room for BERTopic's outlier topic; TODO confirm.
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)
bertopic_model.reduce_topics(abstracts, nr_topics=11)
# `topics` is the pre-reduction assignment; the reduced one is bertopic_model.topics_.

# Get Top Words per Cluster
topic_summary = bertopic_model.get_topics()

# Print the top-10 phrases for every cluster label that is also a topic id.
# NOTE(review): `topic_summary` is keyed by BERTopic topic ids, which are not
# guaranteed to equal the `Cluster10` labels supplied via `y` — cross-tabulate
# bertopic_model.topics_ against cluster_labels before reading these lines as
# per-cluster keywords.
for cluster in sorted(set(cluster_labels)):
    if cluster in topic_summary:  # Ensure the cluster exists in the BERTopic output
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, college student, physical activity, social support, depressive symptom, covid pandemic, old adult, quality life, social medium, life satisfaction
Cluster 1: college student, entrepreneurial intention, structural equation, covid pandemic, work engagement, datum collect, entrepreneurship education, university student, mediate relationship, job satisfaction
Cluster 2: purchase intention, social medium, structural equation, green innovation, consumer purchase, climate change, equation modeling, sustainable development, perceive value, consumer purchase intention
Cluster 3: facial expression, emotion recognition, old adult, second language, significant difference, young adult, speech perception, neural network, heritage speaker, individual difference
Cluster 4: parent child, young child, mental health, parenting style, preschool child, early childhood, social support, executive function, child asd, control group
Cluster 5: athlete burnout, soccer player, mental tou