In [None]:
!pip install berteley
!pip install --upgrade scikit-learn

# Runtime > Restart Runtime

In [None]:
from berteley import models
from berteley import preprocessing
import pandas as pd

# Progress bars for Jupyter notebooks: apply force_tty=True to alive_progress's
# global configuration (presumably so the bars render in notebook output —
# confirm against the alive_progress documentation).
from alive_progress import config_handler
config_handler.set_global(force_tty=True)

# Retrieving the data

Here we have a dataset of 135 scientific articles published by one of our co-authors, Dr. Daniela Ushizima. With this corpus and BERTeley, we can try to build a good summary of the work she has published throughout her career.

In [None]:
# Optional workaround (disabled by default): override locale.getpreferredencoding
# so that it always returns "UTF-8". Uncomment both lines below to enable it.
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"

If the cell below does not work, uncomment and run the cell above, then run the cell below again.

In [None]:
# Alternative CSV from the same dani-lbnl/IDEAL GitHub repository (disabled):
#url = 'https://media.githubusercontent.com/media/dani-lbnl/IDEAL/master/data/computervision_springer_articles.csv'
# CSV used by this notebook: the raw file is read straight from GitHub by pd.read_csv.
url = 'https://raw.githubusercontent.com/dani-lbnl/IDEAL/master/data/dani_english_filtered_articles.csv'

In [None]:
# Download the article metadata CSV into a DataFrame.
articles = pd.read_csv(url)

# Preview the first rows. `head` has to be *called*: a bare `articles.head`
# only displays the bound-method repr rather than a compact preview.
articles.head()

<bound method NDFrame.head of                                                  title  \
0          A texture approach to leukocyte recognition   
1    Evaluation of three algorithms for the segment...   
2    Xi-cam: a versatile interface for data visuali...   
3    Segmentation of subcellular compartments combi...   
4    SAR imagery segmentation by statistical region...   
..                                                 ...   
130  Tracking cell dynamics from time-lapse LSM ima...   
131  Quantitative Microscopy Applied to Cytology an...   
132  CO2 Sequestion and Storage: From Raw Micro-CT ...   
133  CHARACTERIZATION OF MRI BRAIN SCANS ASSOCIATED...   
134         Front propagation using fast marching in R   

                                                  link  \
0    https://scholar.google.com/citations?view_op=v...   
1    https://scholar.google.com/citations?view_op=v...   
2    https://scholar.google.com/citations?view_op=v...   
3    https://scholar.google.com/citations

In [None]:
# Keep only the two text columns that feed the topic model and drop every
# article that is missing a title or a description. Built in one expression
# from `articles`, so re-running the cell always gives the same `df`.
df = articles[["title", "description"]].dropna()

print("- Number of records: " + str(len(df)))
# Report one total instead of a per-column Series: deep=True also counts the
# memory held by the strings in the object columns (the default does not),
# and .sum() collapses the result to a single number that fits the label.
print("- Size in memory (bytes): " + str(df.memory_usage(deep=True).sum()))

- Number of records: 135
- Size in memory (bytes): Index           128
title          1080
description    1080
dtype: int64


In [None]:
# Build one text record per article: the title and the abstract joined by a
# single space.
titles, abstracts = (df[column].to_list() for column in ("title", "description"))

raw_text = [" ".join(pair) for pair in zip(titles, abstracts)]

# Preprocessing



In [None]:
# Run BERTeley's preprocessing over the combined title+abstract strings.
# allow_abbrev=True and show_progress=True are forwarded as given; presumably
# they keep abbreviations and display a progress bar — confirm against the
# BERTeley documentation.
clean_docs = preprocessing.preprocess(raw_text, allow_abbrev=True, show_progress=True)

Removing short strings |████████████████████████████████████████| 10/10 [100%] i


# Topic Modeling

In [None]:
# IPython help syntax: show the signature and docstring of models.fit.
?models.fit

In [None]:
# Fit the BERTeley topic model on the preprocessed documents. The call is
# unpacked into six values: topics, probabilities, topic_sizes, topic_model,
# topic_words and metrics.
# NOTE(review): no random seed is set in the visible cells; if models.fit is
# stochastic, results may differ between runs — confirm.
topics, probabilities, topic_sizes, topic_model, topic_words, metrics = models.fit(
    clean_docs,
    embedding_model="scibert",
    n_gram_range="unigram",
    verbose=True)

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

2023-10-23 19:48:29,307 - BERTopic - Transformed documents to Embeddings
2023-10-23 19:48:31,434 - BERTopic - Reduced dimensionality
2023-10-23 19:48:31,444 - BERTopic - Clustered reduced embeddings


In [None]:
# Display the metrics value returned by models.fit.
metrics

{'Coherence': 0.41144072951361627, 'Diversity': 1.0}

In [None]:
# Draw BERTeley's bar charts from the `topics` and `topic_model` values
# returned by models.fit.
models.create_barcharts(topics, topic_model)