In [1]:
import hdbscan
import numpy as np
import pandas as pd
import plotly.express as px
import torch
from transformers import AutoTokenizer, AutoModel
from umap.umap_ import UMAP


class TopicModelingPipeline:
    """Topic-modeling pipeline: BERT embeddings -> UMAP reduction -> HDBSCAN clusters.

    The pipeline reads text from the ``cleaned_text`` column of the DataFrame
    it was constructed with, so that column must exist before
    ``generate_umap_embeddings`` or ``visualize_clusters`` is called.
    """

    def __init__(self, df):
        """Keep a reference to ``df`` and load the ``bert-base-uncased`` tokenizer and model.

        Args:
            df: DataFrame holding the documents to model.
        """
        self.df = df
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.model = AutoModel.from_pretrained("bert-base-uncased")

    def _embed_texts(self, texts, batch_size):
        """Return a 2-D array with one row of pooled model output per text."""
        pooled_batches = []
        # Inference only: disabling autograd avoids keeping a graph per forward pass.
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch = texts[start:start + batch_size]
                inputs = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
                # Element 1 of the model output is the pooled sentence-level
                # vector (the same element the pipeline has always used).
                pooled = self.model(**inputs)[1]
                pooled_batches.append(pooled.detach().cpu().numpy())
        # Stack (batch, hidden) chunks into a single (n_texts, hidden) matrix;
        # UMAP rejects the 3-D input produced by a list of (1, hidden) arrays.
        return np.vstack(pooled_batches)

    def generate_umap_embeddings(self, n_components=5, n_neighbors=15, metric='cosine', min_dist=0.1,
                                 batch_size=32, random_state=None):
        """Embed ``df['cleaned_text']`` with BERT and reduce the result with UMAP.

        Args:
            n_components: Number of UMAP output dimensions.
            n_neighbors: UMAP neighbourhood size.
            metric: Distance metric handed to UMAP.
            min_dist: UMAP minimum distance between embedded points.
            batch_size: Number of texts sent through the model per forward pass.
            random_state: Optional seed forwarded to UMAP for reproducible layouts.

        Returns:
            The array returned by ``UMAP.fit_transform``, one row per DataFrame row.

        Raises:
            ValueError: If there are no texts to embed or ``batch_size`` < 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        # Missing values become empty strings so every row still gets an
        # embedding and the output stays aligned with the DataFrame rows.
        texts = self.df['cleaned_text'].fillna('').astype(str).tolist()
        if not texts:
            raise ValueError("df['cleaned_text'] is empty; nothing to embed")

        embeddings = self._embed_texts(texts, batch_size)

        # Perform dimensionality reduction with UMAP
        umap_embeddings = UMAP(
            n_components=n_components,
            n_neighbors=n_neighbors,
            metric=metric,
            min_dist=min_dist,
            random_state=random_state,
        ).fit_transform(embeddings)

        # Visualize the first two UMAP dimensions; a 1-D reduction has no
        # second axis to plot, so the scatter is skipped in that case.
        if n_components >= 2:
            fig = px.scatter(x=umap_embeddings[:, 0], y=umap_embeddings[:, 1])
            fig.show()

        return umap_embeddings

    def generate_hdbscan_clusters(self, umap_embeddings, min_cluster_size=10, min_samples=1, cluster_selection_epsilon=0.5):
        """Cluster the reduced embeddings with HDBSCAN.

        Args:
            umap_embeddings: Array of reduced embeddings to cluster.
            min_cluster_size: Forwarded to ``hdbscan.HDBSCAN``.
            min_samples: Forwarded to ``hdbscan.HDBSCAN``.
            cluster_selection_epsilon: Forwarded to ``hdbscan.HDBSCAN``.

        Returns:
            The fitted clusterer's ``labels_`` (one label per input row).
        """
        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            cluster_selection_epsilon=cluster_selection_epsilon
        ).fit(umap_embeddings)

        return clusterer.labels_

    def visualize_clusters(self, umap_embeddings, cluster_labels):
        """Scatter the first two embedding dimensions, coloured by cluster label.

        Args:
            umap_embeddings: 2-D array with at least two columns, one row per
                DataFrame row.
            cluster_labels: One cluster label per row of ``umap_embeddings``.

        Raises:
            ValueError: If the embeddings have fewer than two columns or the
                inputs do not have one entry per DataFrame row.
        """
        coords = np.asarray(umap_embeddings)
        labels = np.asarray(cluster_labels)
        # Positional values, so a non-default DataFrame index cannot misalign
        # the hover text with the embedding rows.
        texts = self.df['cleaned_text'].to_numpy()

        if coords.ndim != 2 or coords.shape[1] < 2:
            raise ValueError("umap_embeddings must be 2-D with at least two columns")
        if not (len(coords) == len(labels) == len(texts)):
            raise ValueError(
                "umap_embeddings, cluster_labels and df['cleaned_text'] must have the same length"
            )

        plot_df = pd.DataFrame({
            'x': coords[:, 0],
            'y': coords[:, 1],
            # Cluster ids are categories, not magnitudes: string labels make
            # plotly use a discrete colour per cluster instead of a gradient.
            'cluster': labels.astype(str),
            'cleaned_text': texts,
        })
        fig = px.scatter(plot_df, x='x', y='y', color='cluster', hover_data=['cleaned_text'])
        fig.show()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
import sys

# The quoted "__file__" is a literal file name, not the module attribute, so it
# is resolved against the current working directory; stripping the last path
# component therefore yields the working directory itself.
notebook_dir = os.path.split(os.path.abspath("__file__"))[0]
# One level above the notebook directory is treated as the project root.
project_dir = os.path.split(notebook_dir)[0]

# Make the project root importable, appending it at most once.
if sys.path.count(project_dir) == 0:
    sys.path.append(project_dir)


In [4]:
# NOTE(review): machine-specific absolute path; this cell only runs on a machine
# where this exact file exists.
raw_tweets_csv = r"C:\Users\johna\anaconda3\envs\twitter-analytics-env\twitter_issues_dashboard\twitter_issues_dashboard\data\01_raw\tweets_details2023-03-15_20-43-36.csv"
# Load the raw tweet export into the working DataFrame.
df = pd.read_csv(raw_tweets_csv)

In [5]:
# Add a 'cleaned_text' column by passing each value of 'text' through TextCleaner.
from data_processing.preprocess_tweets import TextCleaner

cleaner = TextCleaner()
clean_tweet = cleaner.clean_text  # bound method applied to every row below
df['cleaned_text'] = df['text'].apply(clean_tweet)

In [6]:
# Build the pipeline around the cleaned tweets; the constructor loads the
# bert-base-uncased tokenizer and model.
topicmodel = TopicModelingPipeline(df=df)

Downloading (…)okenizer_config.json: 100%|██████████████████████████████████████████| 28.0/28.0 [00:00<00:00, 4.00kB/s]
Downloading (…)lve/main/config.json: 100%|█████████████████████████████████████████████| 570/570 [00:00<00:00, 142kB/s]
Downloading (…)solve/main/vocab.txt: 100%|███████████████████████████████████████████| 232k/232k [00:00<00:00, 912kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████████████████████████████████████| 466k/466k [00:00<00:00, 1.38MB/s]
Downloading pytorch_model.bin: 100%|████████████████████████████████████████████████| 440M/440M [00:27<00:00, 16.3MB/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This I

In [None]:
# Embed the cleaned tweets and reduce them with UMAP, spelling out the
# method's default settings so the configuration is visible at the call site.
embeddings = topicmodel.generate_umap_embeddings(
    n_components=5,
    n_neighbors=15,
    metric='cosine',
    min_dist=0.1,
)