### Dynamic Topic Modeling for PLOS topics
This notebook presents the analysis of pychology-related topics published in PLOS over time.


In [15]:
import pandas as pd
#%ls
data = pd.read_csv(r'C:\Users\karol\projects\plosAnalysis\plosOnePsychologyMetrics.csv')
# #data.head()

In [16]:
data.head()

Unnamed: 0,id,title,abstract,journal,publication_date,received_date,accepted_date,counter_total_all,counter_total_month,citation_count
0,10.1371/journal.pone.0236792,The link between childhood psychological maltr...,"['\nBased on Attachment Theory, the Barlett an...",PLOS ONE,2020-09-03T00:00:00Z,2020-01-09T00:00:00Z,2020-07-06T00:00:00Z,2465,48,14.0
1,10.1371/journal.pone.0280457,"Need for affect, need for cognition, and the d...",['\nThe last decade has witnessed a significan...,PLOS ONE,2023-02-09T00:00:00Z,2021-10-08T00:00:00Z,2023-01-03T00:00:00Z,465,25,0.0
2,10.1371/journal.pone.0192907,The dominance of introspective measures and wh...,"['\nThe behavioral sciences, including most of...",PLOS ONE,2018-02-15T00:00:00Z,2016-11-16T00:00:00Z,2018-02-01T00:00:00Z,2430,17,27.0
3,10.1371/journal.pone.0224326,Does the psychological profile influence the p...,['\nStress control as well as other psychologi...,PLOS ONE,2019-11-12T00:00:00Z,2019-05-14T00:00:00Z,2019-10-11T00:00:00Z,2639,17,7.0
4,10.1371/journal.pone.0245671,The effect of perceived interracial competitio...,['\nThere remains a dearth of research on caus...,PLOS ONE,2021-01-29T00:00:00Z,2020-07-18T00:00:00Z,2021-01-05T00:00:00Z,2861,22,4.0


In [19]:
data[data['title'].duplicated(keep=False)].sort_values('title').head(8)
#drop duplicates
data = data.drop_duplicates('title')
#drop irrelevant titles
data = data[~data['title'].str.contains('Issue Image | Vol.')]

In [21]:
#import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = set(stopwords.words("english"))

def remove_stopwords(text) -> str:
   #Remove stopwords from text """
    filtered_words = [word for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)
data.loc[:, 'textClean'] = data['title'].apply(remove_stopwords)
data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\karol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,title,abstract,journal,publication_date,received_date,accepted_date,counter_total_all,counter_total_month,citation_count,textClean
0,10.1371/journal.pone.0236792,The link between childhood psychological maltr...,"['\nBased on Attachment Theory, the Barlett an...",PLOS ONE,2020-09-03T00:00:00Z,2020-01-09T00:00:00Z,2020-07-06T00:00:00Z,2465,48,14.0,link childhood psychological maltreatment cybe...
1,10.1371/journal.pone.0280457,"Need for affect, need for cognition, and the d...",['\nThe last decade has witnessed a significan...,PLOS ONE,2023-02-09T00:00:00Z,2021-10-08T00:00:00Z,2023-01-03T00:00:00Z,465,25,0.0,"Need affect, need cognition, desire independence"
2,10.1371/journal.pone.0192907,The dominance of introspective measures and wh...,"['\nThe behavioral sciences, including most of...",PLOS ONE,2018-02-15T00:00:00Z,2016-11-16T00:00:00Z,2018-02-01T00:00:00Z,2430,17,27.0,dominance introspective measures implies: exam...
3,10.1371/journal.pone.0224326,Does the psychological profile influence the p...,['\nStress control as well as other psychologi...,PLOS ONE,2019-11-12T00:00:00Z,2019-05-14T00:00:00Z,2019-10-11T00:00:00Z,2639,17,7.0,psychological profile influence position promi...
4,10.1371/journal.pone.0245671,The effect of perceived interracial competitio...,['\nThere remains a dearth of research on caus...,PLOS ONE,2021-01-29T00:00:00Z,2020-07-18T00:00:00Z,2021-01-05T00:00:00Z,2861,22,4.0,effect perceived interracial competition psych...


In [29]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [22]:
#bert topic is very slow so selecting only 10% of data first
# Determine the number of rows to select (10% of the total rows)
subset_size = int(len(data) * 0.1)

# Randomly select 10% of the rows
random_subset = data.sample(n=subset_size, random_state=42)  # Setting random_state for reproducibility

# Create a new DataFrame with the selected rows
subset10 = pd.DataFrame(random_subset)

In [38]:
#docs = data['textClean']
# If you are passing strings (dates) instead of integers, then BERTopic will try to automatically 
# detect which datetime format your strings have. Unfortunately, this will not always work if 
# they are in an unexpected format. We can use datetime_format to pass the format the timestamps have:
timestamps = subset10.publication_date.to_list()
titles = subset10.textClean.to_list()

# import random
# docs_list = list(docs)

# # Determine the size of the subset (10% of the data)
# subset_size = int(len(docs_list) * 0.1)

# # Randomly select 10% of the data
# random_subset = random.sample(docs_list, subset_size)



In [39]:
from bertopic import BERTopic

topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(titles)

Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████| 44/44 [00:21<00:00,  2.06it/s]
2023-07-28 16:33:09,266 - BERTopic - Transformed documents to Embeddings
2023-07-28 16:33:26,487 - BERTopic - Reduced dimensionality
2023-07-28 16:33:26,788 - BERTopic - Clustered reduced embeddings


In [40]:
# From these topics, we are going to generate the topic representations at each 
# timestamp for each topic. We do this by simply calling 
# topics_over_time and passing the titles, 
# the corresponding timestamps, and the related topics:
topics_over_time = topic_model.topics_over_time(titles, timestamps, nr_bins=20)

20it [00:00, 20.42it/s]


In [41]:
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

In [42]:
freq = topic_model.get_topic_info(); freq.head(5)
#try to 

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,612,-1_social_learning_among_cognitive,"[social, learning, among, cognitive, study, effects, behavior, using, children, task]","[Quality life associated factors among youth substance use Northwest Ethiopia: Using structural equation modeling, Training spiking neuronal network model visual-motor cortex play virtual racket-ball game using reinforcement learning, association disordered eating health-related quality life among children adolescents: systematic review population-based studies]"
1,0,84,0_covid19_pandemic_burnout_health,"[covid19, pandemic, burnout, health, psychological, distress, mental, among, healthcare, care]","[Burnout coping strategies among resident physicians Indonesian tertiary referral hospital COVID-19 pandemic, Job burnout among Israeli healthcare workers first months COVID-19 pandemic: role emotion regulation strategies psychological distress, Mental health, risk perception, coping strategies among healthcare workers Egypt COVID-19 pandemic]"
2,1,54,1_touch_perception_body_vestibular,"[touch, perception, body, vestibular, postural, orientation, binocular, ownership, visuomotor, hand]","[Viewing Pain Happy Faces Elicited Similar Changes Postural Body Sway, Direction Specific Biases Human Visual Vestibular Heading Perception, Changes perception upright body orientation age]"
3,2,47,2_intergroup_social_ingroup_identity,"[intergroup, social, ingroup, identity, humanization, peace, outgroup, aversion, leadership, increases]","[activating diversity alleviate, increase intergroup bias? ingroup projection perspective, Priming attachment security outgroup humanization: mediation role intergroup emotions, humanitarian aid humanization: outgroup, ingroup, helping increases humanization]"
4,3,46,3_stress_coping_depression_life,"[stress, coping, depression, life, study, wellbeing, adults, crosssectional, perceived, symptoms]","[Trier Social Stress Test Trier Social Stress Test groups: Qualitative investigations, Stress Effects Mood, HPA Axis, Autonomic Response: Comparison Three Psychosocial Stress Paradigms, Coexistence Coping Resources Specific Coping Styles Stress: Evidence Full Information Item Bifactor Analysis]"


In [49]:
# Print the number of topics identified
topic_freq = topic_model.get_topic_freq()
num_topics = len(topic_freq)
print("Number of topics identified:", num_topics)

Number of topics identified: 28


In [43]:
#I need to clean the data a bit better, investigate the -1 and 0 topics
#freq.info()
# Filter rows where 'Topic' is equal to -1
filtered_df = freq[freq['Topic'] == -1]
pd.set_option('display.max_colwidth', None)
print(filtered_df['Representative_Docs'])

0    [Quality life associated factors among youth substance use Northwest Ethiopia: Using structural equation modeling, Training spiking neuronal network model visual-motor cortex play virtual racket-ball game using reinforcement learning, association disordered eating health-related quality life among children adolescents: systematic review population-based studies]
Name: Representative_Docs, dtype: object


In [10]:
#remove stopwords
topic_model.get_topic(0)

[('covid19', 0.09156264287658607),
 ('pandemic', 0.058357277356814216),
 ('health', 0.02872718673942675),
 ('mental', 0.023546152663434246),
 ('crosssectional', 0.020759867517955565),
 ('ethiopia', 0.01854502618712158),
 ('among', 0.0181721462492378),
 ('knowledge', 0.017488218815033155),
 ('infection', 0.01700636844453755),
 ('practices', 0.016859114715565074)]

In [44]:
topic_model.visualize_hierarchy(top_n_topics=20)

In [48]:
#topic_model.visualize_topics(top_n_topics=10)
topic_model.visualize_barchart(top_n_topics=10)

In [12]:
#topic_model.reduce_topics(docs, nr_topics=10)
#https://www.vennify.ai/bertopic-topic-modeling/
predict new topic

ValueError: All arrays must be of the same length

In [None]:
#https://maartengr.github.io/BERTopic/faq.html#why-does-it-take-so-long-to-import-bertopic

In [None]:
topic_model.visualize_hierarchy(top_n_topics=10)

# New Section