In [40]:
import numpy as np
import pandas as pd
from pathlib import Path
import nltk
from nltk.tokenize import word_tokenize
import spacy
import re
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
# https://maartengr.github.io/BERTopic/getting_started/quickstart/quickstart.html#visualizations

In [2]:
df = pd.read_csv('processedraw_data.csv')

In [3]:
documents = df['raw_processed']

In [6]:
from sentence_transformers import SentenceTransformer

# loading sentence transformer model
embedding_model = SentenceTransformer("all-mpnet-base-v2")
embeddings = embedding_model.encode(documents, show_progress_bar=True)

  from tqdm.autonotebook import tqdm, trange


In [7]:
from umap import UMAP

# dimensionality reduction
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine'
)

reduced_embeddings = umap_model.fit_transform(embeddings)

In [16]:
from hdbscan import HDBSCAN

hdbscan_model = HDBSCAN(
    min_cluster_size=30,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=False
)

In [17]:
from bertopic import BERTopic
from keybert import KeyBERT

In [20]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

keybert = KeyBERTInspired()
mmr = MaximalMarginalRelevance(diversity=0.5)

representation_model = {
    "KeyBERT": keybert,
    "MMR": mmr
}

In [23]:
# KeyBERT for keyword extraction
kw_model = KeyBERT(model=embedding_model)

# BERTopic model
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,

    calculate_probabilities=False,
    top_n_words=10,
    nr_topics='auto',
    verbose=True
)


In [24]:
topics, probabilities = topic_model.fit_transform(documents)

2024-10-17 18:56:44,907 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2024-10-17 19:01:30,548 - BERTopic - Embedding - Completed ✓
2024-10-17 19:01:30,548 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-10-17 19:01:35,300 - BERTopic - Dimensionality - Completed ✓
2024-10-17 19:01:35,300 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-10-17 19:01:35,345 - BERTopic - Cluster - Completed ✓
2024-10-17 19:01:35,346 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-10-17 19:02:02,058 - BERTopic - Representation - Completed ✓
2024-10-17 19:02:02,059 - BERTopic - Topic reduction - Reducing number of topics
2024-10-17 19:21:57,075 - BERTopic - Topic reduction - Reduced number of topics from 9 to 9


In [25]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,154,-1_the_drawer_val_to,"[the, drawer, val, to, in, openrndr, it, and, ...","[openrndr, kotlin, vector2, texelsize, texture...","[drawer, openrndr, and, import, width, nimport...",[('I wanted to see if I could make something d...
1,0,376,0_openrndr_gradle_jar_files,"[openrndr, gradle, jar, files, modules, caches...","[openrndr, kotlin, openal, gradle, intellij, a...","[openrndr, gradle, jar, modules, caches, org, ...","[(""I recently did a 30 minute session for Gith..."
2,1,278,1_the_to_you_it,"[the, to, you, it, for, in, fun, and, var, https]","[audiorenderer, audio, visuals, program, code,...","[for, var, https, al10, buffersize, null, is, ...","[(""For me it's always something in progress. D..."
3,2,146,2_the_to_in_buffer,"[the, to, in, buffer, and, double, it, vec2, o...","[shader, shaders, computeshader, openrndr, com...","[to, buffer, vec2, https, openrndr, float, upl...",[('This is a post inspired by a conversation w...
4,3,107,3_drawer_org_openrndr_colorrgba,"[drawer, org, openrndr, colorrgba, val, the, m...","[drawer, kotlinx, kotlin, draw, kotlinjsr223jv...","[drawer, openrndr, mantas, kotlin, application...",[('After simplifying the program to this ``` i...
5,4,56,4_drawer_val_org_openrndr,"[drawer, val, org, openrndr, shape, the, circl...","[shapecontour, contours, shapes, contour, svg,...","[drawer, org, openrndr, circle, svg, contour, ...",[('## 9. Hatching Here goes a very basic examp...
6,5,53,5_val_vector3_org_openrndr,"[val, vector3, org, openrndr, drawer, vertexbu...","[vertexbuffer, vertexformat, vertices, vertex,...","[vector3, openrndr, drawer, vertexbuffer, colo...",[('I just wrote a simple program drawing 300 t...
7,6,51,6_video_the_to_it,"[video, the, to, it, is, program, videoplayer,...","[videowriter, ffmpeg, videoplayerffmpeg, openr...","[video, videoplayer, screenshots, and, frame, ...","[('Hi, I thought it might be cool to be able t..."
8,7,34,7_div_upload_style_jpeg,"[div, upload, style, jpeg, border, 500x500, fl...","[png, jpeg, div, 14px, transform, 50px, 8px, c...","[div, upload, style, jpeg, border, 500x500, fl...",[('![apps2023.Wobble2-2023-03-17-22.49.05|500x...


In [32]:
topic_model.get_topic(1, full=True)['KeyBERT']

[('audiorenderer', 0.40667045),
 ('audio', 0.34634003),
 ('visuals', 0.3202147),
 ('program', 0.2940559),
 ('code', 0.26761162),
 ('pitch', 0.24674153),
 ('fftdivide', 0.24099207),
 ('samples', 0.20508073),
 ('vorbistrack', 0.19205865),
 ('how', 0.1916078)]

In [42]:
import spacy

In [43]:
nlp = spacy.load("en_core_web_sm")
tokenized_documents = [[token.text for token in nlp(doc)] for doc in documents]

In [46]:
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel

dictionary = corpora.Dictionary(tokenized_documents)
topics_ = topic_model.get_topics()
top_words = [[word for word, _ in topic[:10]] for topic in topics_.values()]

coherence_model = CoherenceModel(
    topics=top_words,
    texts=tokenized_documents,
    dictionary=dictionary,
    coherence='c_v'
)

coherence_score = coherence_model.get_coherence()
coherence_score_per_topic = coherence_model.get_coherence_per_topic()

In [48]:
coherence_score_per_topic
# high coherence = more interpretable and human-like topics
# low coherence = less interpretable, possibly arbitrary topics

[0.4508482900409277,
 0.41964053306293553,
 0.5889594728888626,
 0.5668361225250242,
 0.2711547820269795,
 0.3342403084180756,
 0.32631314987727034,
 0.6630681720333291,
 0.6246774735372693]

In [49]:
def calculate_topic_diversity(topics, top_n_words=10):
    # flatten all top words across topics
    top_words = [word for topic in topics for word, _ in topic[:top_n_words]]
    # calculate the unique words ratio
    unique_words = len(set(top_words))
    total_words = len(top_words)
    return unique_words / total_words

topic_diversity = calculate_topic_diversity(list(topics_.values()), top_n_words=10)

In [50]:
topic_diversity

0.5666666666666667

In [51]:
topic_model.visualize_topics()

In [52]:
embeddings = embedding_model.encode(documents, show_progress_bar=True)

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

In [53]:
reduced_embeddings = umap_model.fit_transform(embeddings)

In [54]:
topic_model.visualize_documents(documents, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)

In [56]:
topic_model.visualize_heatmap()

In [75]:
# Clean your text data (if you haven't done so)
df['raw_cleaned'] = df['raw'].apply(lambda row: re.sub(r"http\S+", "", row).lower())  # Remove URLs
df['raw_cleaned'] = df['raw_cleaned'].apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row.split())))  # Remove mentions
df['raw_cleaned'] = df['raw_cleaned'].apply(lambda row: " ".join(re.sub("[^a-zA-Z]+", " ", row).split()))  # Keep only letters

In [86]:
df['created_at'] = pd.to_datetime(df['created_at'], format='ISO8601')
df['created_at_year'] = df['created_at'].dt.to_period('Y').dt.to_timestamp()  # Aggregating by year
topics_over_time = topic_model.topics_over_time(docs_list, df['created_at_year'])


6it [00:00, 14.71it/s]


In [87]:
topic_model.visualize_topics_over_time(topics_over_time)

In [88]:
topics_per_class = topic_model.topics_per_class(documents, classes=df['category_id'])

14it [00:00, 17.67it/s]


In [89]:
topic_model.visualize_topics_per_class(topics_per_class)