In [1]:
###' ################################################################################
###'
###' IMPORT LIBRARIES
###'
###'

import numpy as np
import pandas as pd
from pathlib import Path
import random
import torch
from keybert import KeyBERT

import ast
import umap.umap_ as umap
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
from sklearn.manifold import SpectralEmbedding

### LDA
import gensim
import gensim.corpora as corpora
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Phrases

### BERT
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer




In [2]:
# Load dataset
data_dir = Path(r"C:\Users\Hyemi\Python\TopicModeling\Data")
data_dir.mkdir(parents=True, exist_ok=True) 

embedding_files = [
    "articles_4_clustering1.csv",
    "articles_4_clustering2.csv",
    "articles_4_clustering3.csv",
    "articles_4_clustering4.csv"
]

embedding_file_paths = [data_dir / file for file in embedding_files]


embedding_dfs = []
for file_path in embedding_file_paths:
    if file_path.exists():  # Check if the file exists before reading
        embedding_dfs.append(pd.read_csv(file_path))

df1a = embedding_dfs[0]
df2a = embedding_dfs[1]
df3a = embedding_dfs[2]
df4a = embedding_dfs[3]

## Group 1

In [6]:
df1 = df1a
df1['Cluster3'].value_counts()

Cluster3
0    3101
1     862
2     432
Name: count, dtype: int64

In [5]:
###########  BEST so far

np.random.seed(40)
random.seed(40)
torch.manual_seed(40) 

abstracts = df1['Abstract_join'].dropna().tolist()
cluster_labels = df1['Cluster3'].astype(int).tolist() 

# Apply BERTopic with Predefined Cluster Labels
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit BERTopic using predefined cluster labels
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)  
bertopic_model.reduce_topics(abstracts, nr_topics=4)  

# Store topics in dataframe
#df['topic'] = topics

# Get Top Words per Cluster
topic_summary = bertopic_model.get_topics()

# Print Cluster Topics
for cluster in sorted(df1['Cluster3'].unique()):
    if cluster in topic_summary:  # Ensure the cluster exists in the BERTopic output
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: individual difference, facial expression, work memory, control group, old adult, executive function, young adult, reaction time, positive negative, provide evidence
Cluster 1: music training, musical training, speech music, phonological awareness, listen music, music language, music perception, program note, music speech, music performance
Cluster 2: gender difference, spatial ability, gender information, noun phrase, grammatical gender, reflexive pronoun, stem career, stem field, genderfair language, engineering mathematic


In [7]:
topic_summary

{-1: [('work memory', 0.0020841158290305305),
  ('individual difference', 0.0013988292859779714),
  ('personality trait', 0.0010687225104195164),
  ('young adult', 0.0010544423958250503),
  ('control group', 0.0010368124306495991),
  ('old adult', 0.000996158218213982),
  ('eye movement', 0.0009729771159973683),
  ('provide evidence', 0.0008858833234556463),
  ('problem solve', 0.0008494044092251906),
  ('social interaction', 0.0008237780669456124)],
 0: [('individual difference', 0.0015546148615394864),
  ('facial expression', 0.0014649797669357694),
  ('work memory', 0.0013882477030026845),
  ('control group', 0.0013000090146234784),
  ('old adult', 0.0012724540617116789),
  ('executive function', 0.001177777603076478),
  ('young adult', 0.000991585782142074),
  ('reaction time', 0.0009532845479000437),
  ('positive negative', 0.0009290529799957143),
  ('provide evidence', 0.000865744321314817)],
 1: [('music training', 0.007419032852345174),
  ('musical training', 0.0059576774479091

## Group 2

In [8]:
df2 = df2a
df2['Cluster3'].value_counts()

Cluster3
0    3387
1    1432
2     945
Name: count, dtype: int64

In [16]:
###########  BEST so far

np.random.seed(42)
random.seed(42)
torch.manual_seed(42) 

abstracts = df2['Abstract_join'].dropna().tolist()
cluster_labels = df2['Cluster3'].astype(int).tolist()  

# Apply BERTopic with Predefined Cluster Labels
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit BERTopic using predefined cluster labels
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)  
bertopic_model.reduce_topics(abstracts, nr_topics=4)  

# Store topics in dataframe
#df['topic'] = topics

# Get Top Words per Cluster
topic_summary = bertopic_model.get_topics()

# Print Cluster Topics
for cluster in sorted(df2['Cluster3'].unique()):
    if cluster in topic_summary:  # Ensure the cluster exists in the BERTopic output
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, work memory, old adult, control group, individual difference, facial expression, structural equation, executive function, positive negative, young adult
Cluster 1: academic procrastination, utility value, bedtime procrastination, distal utility, distal utility value, procrastination scale, team procrastination, effort cost, peer attachment, selfregulatory resource
Cluster 2: political ideology, national identification, perceive threat, sacred value, collective action, action intention, political attitude, social exclusion, attitude immigrant, electoral support


## GROUP3

In [19]:
df3 = df3a
df3['Cluster9'].value_counts()

Cluster9
1    1683
6    1290
3    1283
2     747
5     674
0     599
8     351
7     291
4     184
Name: count, dtype: int64

In [14]:
np.random.seed(40)
random.seed(40)
torch.manual_seed(40) 

abstracts = df3['Abstract_join'].dropna().tolist()
cluster_labels = df3['Cluster7'].astype(int).tolist()  

# Apply BERTopic with Predefined Cluster Labels
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit BERTopic using predefined cluster labels
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)  
bertopic_model.reduce_topics(abstracts, nr_topics=8)  

# Store topics in dataframe
#df['topic'] = topics

# Get Top Words per Cluster
topic_summary = bertopic_model.get_topics()

# Print Cluster Topics
for cluster in sorted(df3['Cluster7'].unique()):
    if cluster in topic_summary:  # Ensure the cluster exists in the BERTopic output
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, covid pandemic, college student, structural equation, physical activity, social support, university student, datum collect, emotion regulation, work engagement
Cluster 1: old adult, facial expression, native speaker, body image, significant difference, work memory, control group, young adult, virtual reality, eye movement
Cluster 2: soccer player, football player, sport performance, elite athlete, futsal player, physical activity, mental health, team sport, goal motive, significant difference
Cluster 3: purchase intention, climate change, social medium, proenvironmental behavior, social networking, structural equation, influence consumer, green product, sustainable development, environmental concern
Cluster 4: communicative act, sexual orientation, man woman, relationship quality, sexual behavior, lesbian woman, parenting intention, sexual satisfaction, woman man, sexual arousal
Cluster 5: sample size, cognitive diagnostic, cognitive diagnosis, item response, 

In [17]:
np.random.seed(40)
random.seed(40)
torch.manual_seed(40) 

abstracts = df3['Abstract_join'].dropna().tolist()
cluster_labels = df3['Cluster8'].astype(int).tolist()  

# Apply BERTopic with Predefined Cluster Labels
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit BERTopic using predefined cluster labels
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)  
bertopic_model.reduce_topics(abstracts, nr_topics=9)  

# Store topics in dataframe
#df['topic'] = topics

# Get Top Words per Cluster
topic_summary = bertopic_model.get_topics()

# Print Cluster Topics
for cluster in sorted(df3['Cluster8'].unique()):
    if cluster in topic_summary:  # Ensure the cluster exists in the BERTopic output
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, covid pandemic, physical activity, social support, structural equation, university student, work engagement, college student, depressive symptom, life satisfaction
Cluster 1: facial expression, old adult, native speaker, work memory, individual difference, control group, second language, negative emotion, social medium, eventrelate potential
Cluster 2: entrepreneurship education, entrepreneurial intention, college student, purchase intention, innovation entrepreneurship, student entrepreneurial, entrepreneurial selfefficacy, structural equation, psychological capital, new venture
Cluster 3: soccer player, football player, sport performance, mental health, elite athlete, goal motive, futsal player, significant difference, physical activity, training session
Cluster 4: music student, music performance, acoustic environment, music listen, musical instrument, old adult, performance anxiety, musical experience, music therapy, musical performance
Cluster 5: communic

In [18]:
np.random.seed(40)
random.seed(40)
torch.manual_seed(40) 

abstracts = df3['Abstract_join'].dropna().tolist()
cluster_labels = df3['Cluster9'].astype(int).tolist()  

# Apply BERTopic with Predefined Cluster Labels
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit BERTopic using predefined cluster labels
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)  
bertopic_model.reduce_topics(abstracts, nr_topics=10)  

# Store topics in dataframe
#df['topic'] = topics

# Get Top Words per Cluster
topic_summary = bertopic_model.get_topics()

# Print Cluster Topics
for cluster in sorted(df3['Cluster9'].unique()):
    if cluster in topic_summary:  # Ensure the cluster exists in the BERTopic output
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, covid pandemic, physical activity, social support, depression anxiety, online survey, depressive symptom, anxiety depression, coronavirus disease, perceive stress
Cluster 1: entrepreneurship education, college student, entrepreneurial intention, structural equation, datum collect, work engagement, psychological capital, job satisfaction, moderate relationship, confirmatory factor
Cluster 2: facial expression, native speaker, second language, word recognition, visual attention, work memory, emotion recognition, individual difference, emotion regulation, clause chain
Cluster 3: mental health, executive function, parent child, depressive symptom, child adolescent, young child, social anxiety, significant difference, child asd, emotion regulation
Cluster 4: social medium, purchase intention, climate change, datum collect, perceive value, stereotype threat, gender stereotype, social interaction, structural equation, communicative act
Cluster 5: music student, music

## GROUP4

In [12]:
df4 = df4a
df4['Cluster9'].value_counts()

Cluster9
0    3042
2    2193
5    1972
4    1350
1    1036
3    1021
8     647
7     646
6     268
Name: count, dtype: int64

In [15]:
###########  BEST so far

np.random.seed(40)
random.seed(40)
torch.manual_seed(40) 

abstracts = df4['Abstract_join'].dropna().tolist()
cluster_labels = df4['Cluster9'].astype(int).tolist()  

# Apply BERTopic with Predefined Cluster Labels
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words='english')
bertopic_model = BERTopic(vectorizer_model=vectorizer_model, ctfidf_model=ClassTfidfTransformer())

# Fit BERTopic using predefined cluster labels
topics, _ = bertopic_model.fit_transform(abstracts, y=cluster_labels)  
bertopic_model.reduce_topics(abstracts, nr_topics=10)  

# Store topics in dataframe
#df['topic'] = topics

# Get Top Words per Cluster
topic_summary = bertopic_model.get_topics()

# Print Cluster Topics
for cluster in sorted(df4['Cluster9'].unique()):
    if cluster in topic_summary:  # Ensure the cluster exists in the BERTopic output
        print(f"Cluster {cluster}: {', '.join([word[0] for word in topic_summary[cluster][:10]])}")

Cluster 0: mental health, physical activity, college student, social support, depressive symptom, covid pandemic, quality life, sleep quality, social medium, anxiety depression
Cluster 1: structural equation, work engagement, foreign language, college student, datum collect, job satisfaction, covid pandemic, online learning, mediate relationship, university student
Cluster 2: purchase intention, social medium, structural equation, consumer purchase, facial expression, emotion recognition, climate change, perceive value, behavioral intention, consumer purchase intention
Cluster 3: old adult, second language, work memory, executive function, child language, native speaker, heritage language, read comprehension, control group, significant difference
Cluster 4: entrepreneurial intention, entrepreneurship education, college student, green innovation, student entrepreneurial, innovation performance, business model, entrepreneurial performance, structural equation, significant positive
Cluste