In [35]:
## Libraries 
import numpy as np
import pandas as pd
import psycopg2
import torch
from bertopic import BERTopic
import openai
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
from scipy.cluster import hierarchy as sch
import pickle
import ast
import re
import random
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer

## DATA PREPARATION AND PREPROCESSING

In [8]:
# Load Data 
# Load TDM data
df1 = pd.read_csv("####.csv")
# Select full_text column and convert to list
tdm_docs = df1["full_text"].drop_duplicates().to_list()

In [10]:
# Load WCS data
df3 = pd.read_csv("####.csv")
# Select full_text column and convert to list
wcs_docs = df3["Body"].to_list()

In [11]:
# Loading Reddit data and creating dataframe
# Connect to SupaBase database of reddit tweets
conn = psycopg2.connect(
    dbname="####",
    user="#####",
    password="####",
    host="####",
    port="####"
    )

# Create a cursor
cur = conn.cursor()

# Define the SQL query
query = """
SELECT 
    rs.*, 
    COALESCE(rs.text || ' ' || STRING_AGG(rc.body, ' '), rs.text) AS full_text
FROM 
    reddit_submissions rs
LEFT JOIN 
    reddit_comments rc
ON 
    rs.id = rc.modified_link_id
GROUP BY 
    rs.id;
"""

# Execute the query
# Use pandas to execute the query and load the data into a DataFrame
df2 = pd.read_sql(query, conn)

# Close the connection
conn.close()

# Remove null and na values
df2_notnull = df2[df2["full_text"].notnull()]
df2_notna = df2_notnull[df2_notnull["full_text"].notna()]

# Remove empty cells
df2_nonempty = df2_notna.loc[df2_notna["full_text"] != ""]

# Remove tabs, newlines and multiple spaces
df2_nonempty['full_text'] = df2_nonempty['full_text'].str.replace(r'\s+', ' ', regex=True)

# Create list of reddit documents
reddit_docs = df2_nonempty["full_text"].to_list()

In [13]:
# Keywords for relevance
keywords = ['wildfire', 'forest fire', 'wildland fire', 'firestorm', 'ember attack', 'fire behavior', 'firebreak', 'fire line', 'backfire', 'controlled burn', 'prescribed fire', 'fuel reduction', 'defensible space', 'fire danger', 'fire risk', 'fire weather', 'fire suppression', 'fire management', 'fire containment', 'fire perimeter', 'hotshot crew', 'smokejumper', 'firefighter', 'fire crew', 'incident commander', 'incident management team', 'fire retardant', 'helitack', 'airtanker', 'fire lookout', 'fire tower', 'burn scar', 'burned area emergency response', 'fire ecology', 'crosscut saw', 'pulaski tool', 'fire shelter', 'fire behavior analyst', 'ignition source', 'extinguishment', 'flame length', 'spot fire', 'ground fire', 'crown fire', 'fire whirl', 'topography', 'wind direction', 'wind speed', 'relative humidity', 'temperature', 'drought', 'vegetation type', 'dead fuel', 'live fuel moisture', 'fire risk assessment', 'fire danger rating', 'fire spread', 'fire behavior prediction', 'ignition probability', 'evacuation', 'shelter in place', 'emergency response', 'incident command post', 'incident base', 'hotspot', 'mop-up', 'burnout', 'wildfire adaptation', 'smoke management', 'fire ecology', 'fire effects', 'fire history', 'fire regime', 'firewise', 'fire-adapted communities', 'community wildfire protection plan', 'land management planning', 'resource management', 'fire prevention', 'fire investigation', 'fire detection', 'early warning systems', 'fire risk communication', 'fire education', 'firewise landscaping', 'ember-resistant construction', 'fire-resistant materials', 'fireproofing', 'fuel reduction projects', 'mechanical thinning', 'prescribed burning', 'vegetation management', 'controlled burns', 'indigenous land management practices', 'fire-adapted ecosystems', 'fire-adapted species', 'fire-dependent ecosystems', 'ecological resilience', 'post-fire recovery', 'regeneration', 'salvage logging', 'invasive species management', 'erosion control', 'watershed protection', 'riparian restoration', 'habitat restoration', 'reforestation', 'regrowth', 'soil stabilization', 'burned area rehabilitation', 'hydrophobic soil', 'fire-induced erosion', 'seeding', 'mulching', 'contour felling', 'silt fencing', 'check dams', 'straw wattles', 'culvert cleaning', 'debris removal', 'channel stabilization', 'water bars', 'hazard tree removal', 'barrier installation', 'erosion matting', 'streambank stabilization', 'riprap installation', 'bioengineering', 'fuel break construction', 'vegetative barriers', 'strategic fuel breaks', 'fuel ladder management', 'thinning projects', 'brush clearing', 'prescribed grazing', 'livestock management', 'green infrastructure', 'urban interface management', 'firewise landscaping', 'ember-resistant roofing', 'fire-safe construction', 'defensible space creation', 'safe zones', 'firewise community design', 'community preparedness', 'evacuation planning', 'emergency response coordination', 'mutual aid agreements', 'resource sharing', 'incident command structure', 'joint information center (jic)', 'agency cooperation', 'interagency coordination', 'incident action plan', 'incident objectives', 'resources ordering', 'logistics support', 'air operations', 'ground operations', 'fire behavior analysis', 'fire modeling', 'weather forecasting', 'fire danger predictions', 'fire spread modeling', 'fire behavior simulations', 'fire history analysis', 'burn severity mapping', 'remote sensing', 'aerial reconnaissance', 'satellite imagery', 'unmanned aerial vehicles (uavs)', 'fire detection systems', 'early warning technology', 'fire lookout towers', 'fire cameras', 'lightning detection systems', 'fire meteorology', 'fire danger rating systems', 'fire danger indices', 'fire risk assessment tools', 'fire behavior prediction models', 'fire spread algorithms', 'fire suppression tactics', 'structure protection', 'fire line construction', 'fire hose deployment', 'water supply', 'pump operations', 'hose lays', 'sprinkler systems', 'foam application', 'retardant drops', 'back burning', 'flanking operations', 'parallel attack', 'direct attack', 'indirect attack', 'burnout operations', 'firing operations', 'holding actions', 'control lines', 'anchor points', 'escape routes', 'safety zones', 'lookouts', 'communication protocols', 'firefighter safety', 'incident safety', 'incident stress management', 'personal protective equipment (ppe)', 'fire shelters', 'heat stress prevention', 'medivac procedures', 'fire behavior training', 'incident command training', 'fire suppression training', 'equipment maintenance', 'fuel management', 'vehicle maintenance', 'resource management', 'budgeting', 'grant funding', 'community outreach', 'public information', 'media relations', 'education programs', 'training exercises', 'tabletop drills', 'field simulations', 'after-action reviews', 'lessons learned', 'incident debriefing', 'incident documentation', 'data collection', 'reporting systems', 'performance metrics', 'incident critique', 'incident analysis', 'risk assessment', 'incident review', 'resource evaluation', 'resource allocation', 'mutual aid agreements', 'interagency cooperation', 'incident command structure', 'incident objectives', 'strategic planning', 'operational planning', 'tactical planning', 'incident response planning', 'wildfire', 'fuels treatment', 'prescribed burn', 'controlled burn', 'firewise', 'wui', 'rx fire']

In [14]:
# Get relevant documents 
rel_tdm_docs = [doc for doc in tdm_docs if sum(keyword in doc.lower() for keyword in keywords) >=2]
rel_reddit_docs = [doc for doc in reddit_docs if sum(keyword in doc.lower() for keyword in keywords) >=2]
rel_wcs_docs = [doc for doc in wcs_docs if sum(keyword in doc.lower() for keyword in keywords) >=2]

In [16]:
## Split into sentences, paragraphs, docs 

# Function for splitting sentences into specified chunk size
def chunk_sentences(text, chunk_size=5):
    # Tokenize the document into sentences
    sentences = sent_tokenize(text)
    
    # Group sentences into chunks of `chunk_size`
    chunks = [' '.join(sentences[i:i+chunk_size]) for i in range(0, len(sentences), chunk_size)]
    
    return chunks


In [17]:
# Tokenize by sentence 
from nltk.tokenize import sent_tokenize
tdm_sentences = [sent_tokenize(doc) for doc in rel_tdm_docs]
tdm_sentences = [sentence for doc in tdm_sentences for sentence in doc]
reddit_sentences = [sent_tokenize(doc) for doc in rel_reddit_docs]
reddit_sentences = [sentence for doc in reddit_sentences for sentence in doc]
wcs_sentences = [sent_tokenize(doc) for doc in rel_wcs_docs]
wcs_sentences = [sentence for doc in wcs_sentences for sentence in doc]

In [18]:
# Tokenize by 5 sentences 
tdm_para = [chunk_sentences(doc, chunk_size=5) for doc in rel_tdm_docs]
tdm_para = [sentence for doc in tdm_para for sentence in doc]
reddit_para = [chunk_sentences(doc, chunk_size=5) for doc in rel_reddit_docs]
reddit_para = [sentence for doc in reddit_para for sentence in doc]
wcs_para = [chunk_sentences(doc, chunk_size=5) for doc in rel_wcs_docs]
wcs_para = [sentence for doc in wcs_para for sentence in doc]

In [23]:

# Function for cleaning chunks
def clean_text(text):
    # Remove punctuation and quotations
    cleaned_text = re.sub(r'[^\w\s]', '', text)
    # Convert text to lowercase
    cleaned_text = cleaned_text.lower()
    return cleaned_text



In [25]:

# Combine all sentences
all_sentences = tdm_sentences + reddit_sentences + wcs_sentences

# Combine all paragraphs 
all_para = tdm_para + reddit_para + wcs_para

# Combine all docs 
all_docs = rel_tdm_docs + rel_reddit_docs + rel_wcs_docs


In [26]:
# Clean all sentences
all_sentences = [clean_text(sent) for sent in all_sentences]

# Clean all paragraphs 
all_para = [clean_text(para) for para in all_para]



In [132]:
# Clean all docs 
all_docs = [clean_text(doc) for doc in all_docs]

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Set of English stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(document):
    # Tokenize the document into words
    words = word_tokenize(document)
    # Filter out the stopwords
    filtered_words = [word for word in words if word.lower() not in stop_words]
    # Join the filtered words back into a string
    filtered_document = ' '.join(filtered_words)
    return filtered_document

all_docs = [remove_stopwords(doc) for doc in all_docs]

In [28]:
# Pre-calculate embeddings
sent_embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
para_embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Calculate embeddings
sent_embeddings = sent_embedding_model.encode(all_sentences, show_progress_bar=True)
para_embeddings = para_embedding_model.encode(all_para, show_progress_bar=True)


Batches:   0%|          | 0/10971 [00:00<?, ?it/s]

Batches:   0%|          | 0/2248 [00:00<?, ?it/s]

Batches:   0%|          | 0/136 [00:00<?, ?it/s]

In [134]:
# Document level embedding space
doc_embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
doc_embeddings = doc_embedding_model.encode(all_docs, show_progress_bar=True)

Batches:   0%|          | 0/136 [00:00<?, ?it/s]

In [29]:
# Clean sentence level
# Clean all sentences
tdm_sentences = [clean_text(sent) for sent in tdm_sentences]
reddit_sentences = [clean_text(sent) for sent in reddit_sentences]
wcs_sentences = [clean_text(sent) for sent in wcs_sentences]

# Clean paragraph level
tdm_para = [clean_text(para) for para in tdm_para]
reddit_para = [clean_text(para) for para in reddit_para]
wcs_para = [clean_text(para) for para in wcs_para]



In [148]:
# Clean document level 
tdm_docs = [clean_text(doc) for doc in rel_tdm_docs]
reddit_docs = [clean_text(doc) for doc in rel_reddit_docs]
wcs_docs = [clean_text(doc) for doc in rel_wcs_docs]

## MODEL IMPLEMENTATION

In [36]:
# Create consistent model
umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
# Control the number of topics
sent_hdbscan_model = HDBSCAN(min_cluster_size=50, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
para_hdbscan_model = HDBSCAN(min_cluster_size=20, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
doc_hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# Improve default representation
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 3))

In [37]:
# Set up representation models
# KeyBERT
keybert_model = KeyBERTInspired()

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# GPT-3.5
# Credentials 
open_ai_key = "####"
project_id = "####"
# Clients 
client = openai.OpenAI(api_key=open_ai_key, project=project_id)
prompt = """
I have a topic that contains the following documents: 
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""

openai_model = OpenAI(client, model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)

# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    "OpenAI": openai_model, 
    "MMR": mmr_model,
}

#### SENTENCE MODELS

In [42]:
# Topic Model TDM
sent_model_1 = BERTopic(

  # Pipeline models
  embedding_model=sent_embedding_model,
  umap_model=umap_model,
  hdbscan_model=sent_hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Topic Model Reddit
sent_model_2 = BERTopic(

  # Pipeline models
  embedding_model=sent_embedding_model,
  umap_model=umap_model,
  hdbscan_model=sent_hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Topic Model WCS
sent_model_3 = BERTopic(

  # Pipeline models
  embedding_model=sent_embedding_model,
  umap_model=umap_model,
  hdbscan_model=sent_hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Fit topic model 
sent_tdm_topics, sent_tdm_probs = sent_model_1.fit_transform(tdm_sentences)
sent_reddit_topics, sent_reddit_probs = sent_model_2.fit_transform(reddit_sentences)
sent_wcs_topics, sent_wcs_probs = sent_model_3.fit_transform(wcs_sentences)

2024-07-14 11:46:17,514 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4381 [00:00<?, ?it/s]

2024-07-14 11:49:25,919 - BERTopic - Embedding - Completed ✓
2024-07-14 11:49:25,920 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-14 11:51:44,879 - BERTopic - Dimensionality - Completed ✓
2024-07-14 11:51:44,883 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

Batches:   0%|          | 0/6449 [00:00<?, ?it/s]

2024-07-14 12:01:40,256 - BERTopic - Embedding - Completed ✓
2024-07-14 12:01:40,257 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-14 12:05:32,968 - BERTopic - Dimensionality - Completed ✓
2024-07-14 12:05:32,971 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

Batches:   0%|          | 0/143 [00:00<?, ?it/s]

2024-07-14 12:13:02,593 - BERTopic - Embedding - Completed ✓
2024-07-14 12:13:02,593 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-14 12:13:08,556 - BERTopic - Dimensionality - Completed ✓
2024-07-14 12:13:08,557 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-14 12:13:08,758 - BERTopic - Cluster - Completed ✓
2024-07-14 12:13:08,761 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 4/4 [00:02<00:00,  1.57it/s]
2024-07-14 12:13:17,547 - BERTopic - Representation - Completed ✓


#### PARAGRAPH MODELS

In [43]:
# Topic Model TDM
para_model_1 = BERTopic(

  # Pipeline models
  embedding_model=para_embedding_model,
  umap_model=umap_model,
  hdbscan_model=para_hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Topic Model Reddit
para_model_2 = BERTopic(

  # Pipeline models
  embedding_model=para_embedding_model,
  umap_model=umap_model,
  hdbscan_model=para_hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Topic Model WCS
para_model_3 = BERTopic(

  # Pipeline models
  embedding_model=para_embedding_model,
  umap_model=umap_model,
  hdbscan_model=para_hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Fit topic model 
para_tdm_topics, para_tdm_probs = para_model_1.fit_transform(tdm_para)
para_reddit_topics, para_reddit_probs = para_model_2.fit_transform(reddit_para)
para_wcs_topics, para_wcs_probs = para_model_3.fit_transform(wcs_para)

2024-07-14 12:13:17,657 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/910 [00:00<?, ?it/s]

2024-07-14 12:15:03,358 - BERTopic - Embedding - Completed ✓
2024-07-14 12:15:03,358 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-14 12:15:17,050 - BERTopic - Dimensionality - Completed ✓
2024-07-14 12:15:17,051 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

Batches:   0%|          | 0/1308 [00:00<?, ?it/s]

2024-07-14 12:21:25,547 - BERTopic - Embedding - Completed ✓
2024-07-14 12:21:25,548 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-14 12:21:45,200 - BERTopic - Dimensionality - Completed ✓
2024-07-14 12:21:45,202 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

Batches:   0%|          | 0/31 [00:00<?, ?it/s]

2024-07-14 12:25:16,471 - BERTopic - Embedding - Completed ✓
2024-07-14 12:25:16,471 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-14 12:25:20,857 - BERTopic - Dimensionality - Completed ✓
2024-07-14 12:25:20,858 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-14 12:25:20,885 - BERTopic - Cluster - Completed ✓
2024-07-14 12:25:20,888 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 9/9 [00:04<00:00,  1.86it/s]
2024-07-14 12:25:31,864 - BERTopic - Representation - Completed ✓


#### DOCUMENT MODELS

In [164]:
def chunk_document(document, char_limit=15000):
    """
    Chunk a document by a character limit, approximating a token limit.
    """
    chunks = []
    current_index = 0
    while current_index < len(document):
        # Ensure the chunk does not exceed the document's length
        end_index = min(current_index + char_limit, len(document))
        chunks.append(document[current_index:end_index])
        current_index += char_limit
    return chunks

In [165]:
# Apply chunk_document to each document due to OpenAI limits
tdm_docs = [chunk for doc in tdm_docs for chunk in chunk_document(doc)]
reddit_docs = [chunk for doc in reddit_docs for chunk in chunk_document(doc)]
wcs_docs = [chunk for doc in wcs_docs for chunk in chunk_document(doc)]

In [166]:
# Topic Model TDM
doc_model_1 = BERTopic(

  # Pipeline models
  embedding_model=doc_embedding_model,
  umap_model=umap_model,
  hdbscan_model=doc_hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Topic Model Reddit
doc_model_2 = BERTopic(

  # Pipeline models
  embedding_model=doc_embedding_model,
  umap_model=umap_model,
  hdbscan_model=doc_hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Topic Model WCS
doc_model_3 = BERTopic(

  # Pipeline models
  embedding_model=doc_embedding_model,
  umap_model=umap_model,
  hdbscan_model=doc_hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Fit topic model 
doc_tdm_topics, doc_tdm_probs = doc_model_1.fit_transform(tdm_docs)
doc_reddit_topics, doc_reddit_probs = doc_model_2.fit_transform(reddit_docs)
doc_wcs_topics, doc_wcs_probs = doc_model_3.fit_transform(wcs_docs)

2024-07-15 13:25:08,905 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/106 [00:00<?, ?it/s]

2024-07-15 13:25:37,564 - BERTopic - Embedding - Completed ✓
2024-07-15 13:25:37,564 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-15 13:25:49,353 - BERTopic - Dimensionality - Completed ✓
2024-07-15 13:25:49,354 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-15 13:25:49,488 - BERTopic - Cluster - Completed ✓
2024-07-15 13:25:49,492 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 84/84 [02:32<00:00,  1.82s/it]
2024-07-15 13:28:59,777 - BERTopic - Representation - Completed ✓
2024-07-15 13:29:03,691 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/103 [00:00<?, ?it/s]

2024-07-15 13:29:30,623 - BERTopic - Embedding - Completed ✓
2024-07-15 13:29:30,624 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-15 13:29:40,348 - BERTopic - Dimensionality - Completed ✓
2024-07-15 13:29:40,349 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-15 13:29:40,440 - BERTopic - Cluster - Completed ✓
2024-07-15 13:29:40,442 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 63/63 [03:14<00:00,  3.09s/it]
2024-07-15 13:33:23,368 - BERTopic - Representation - Completed ✓
2024-07-15 13:33:26,408 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

2024-07-15 13:33:29,508 - BERTopic - Embedding - Completed ✓
2024-07-15 13:33:29,509 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-15 13:33:30,296 - BERTopic - Dimensionality - Completed ✓
2024-07-15 13:33:30,296 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-15 13:33:30,304 - BERTopic - Cluster - Completed ✓
2024-07-15 13:33:30,306 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 3/3 [00:02<00:00,  1.28it/s]
2024-07-15 13:33:38,204 - BERTopic - Representation - Completed ✓


#### POSTPROCESSING

In [45]:
# Sentence Labels

# Editing representations to indicate data source
sent_models = {
    "sent_model_1": ("TDM", sent_model_1),
    "sent_model_2": ("Reddit", sent_model_2),
    "sent_model_3": ("WCS", sent_model_3)
}

new_labels = []
for model_name, (tag, model) in sent_models.items():
    # Retrieve current topic labels
    current_labels = model.get_topic_info()['OpenAI'].to_list()
    # Prepend the tag to each label
    new_labels = ({i: f"{tag} - {label}" for i, label in enumerate(current_labels, start=-1)}) # Assuming topic IDs start from -1
    
    # Update the topic model with new labels
    model.set_topic_labels(new_labels)


# Setup labels for merged model
sent_initial = ["Outlier topic"]

# Get topics from each model, skipping the outlier topic 
sent_topics_1 = [f"{i} {topic}" for i, topic in enumerate(sent_model_1.get_topic_info()['CustomName'].to_list()[1:], start=0)]
sent_topics_2 = [f"{i+len(sent_topics_1)} {topic}" for i, topic in enumerate(sent_model_2.get_topic_info()['CustomName'].to_list()[1:], start=0)]
sent_topics_3 = [f"{i+len(sent_topics_1)+len(sent_topics_2)} {topic}" for i, topic in enumerate(sent_model_3.get_topic_info()['CustomName'].to_list()[1:], start=0)]

# Combine all labels 
sent_labels = [f"-1 {sent_initial[0]}"] + sent_topics_1 + sent_topics_2 + sent_topics_3

In [186]:
# Paragraph Labels

# Editing representations to indicate data source
para_models = {
    "para_model_1": ("TDM", para_model_1),
    "para_model_2": ("Reddit", para_model_2),
    "para_model_3": ("WCS", para_model_3)
}

para_labels = []
for model_name, (tag, model) in para_models.items():
    # Retrieve current topic labels
    current_labels = model.get_topic_info()['OpenAI'].to_list()
    # Prepend the tag to each label
    para_labels = ({i: f"{tag} - {label}" for i, label in enumerate(current_labels, start=-1)}) # Assuming topic IDs start from -1
    
    # Update the topic model with new labels
    model.set_topic_labels(para_labels)


# Setup labels for merged model
para_initial = ["Outlier topic"]

# Get topics from each model, skipping the outlier topic 
para_topics_1 = [f"{i} {topic}" for i, topic in enumerate(para_model_1.get_topic_info()['CustomName'].to_list()[1:], start=0)]
para_topics_2 = [f"{i+len(para_topics_1)} {topic}" for i, topic in enumerate(para_model_2.get_topic_info()['CustomName'].to_list()[1:], start=0)]
para_topics_3 = [f"{i+len(para_topics_1)+len(para_topics_2)} {topic}" for i, topic in enumerate(para_model_3.get_topic_info()['CustomName'].to_list()[1:], start=0)]

# Combine all labels 
para_labels = [f"-1 {para_initial[0]}"] + para_topics_1 + para_topics_2 + para_topics_3

In [168]:
# Document Labels

# Editing representations to indicate data source
doc_models = {
    "doc_model_1": ("TDM", doc_model_1),
    "doc_model_2": ("Reddit", doc_model_2),
    "doc_model_3": ("WCS", doc_model_3)
}

doc_labels = []
for model_name, (tag, model) in doc_models.items():
    # Retrieve current topic labels
    current_labels = model.get_topic_info()['OpenAI'].to_list()
    # Prepend the tag to each label
    doc_labels = ({i: f"{tag} - {label}" for i, label in enumerate(current_labels, start=-1)}) # Assuming topic IDs start from -1
    
    # Update the topic model with new labels
    model.set_topic_labels(doc_labels)


# Setup labels for merged model
doc_initial = ["Outlier topic"]

# Get topics from each model, skipping the outlier topic 
doc_topics_1 = [f"{i} {topic}" for i, topic in enumerate(doc_model_1.get_topic_info()['CustomName'].to_list()[1:], start=0)]
doc_topics_2 = [f"{i+len(doc_topics_1)} {topic}" for i, topic in enumerate(doc_model_2.get_topic_info()['CustomName'].to_list()[1:], start=0)]
doc_topics_3 = [f"{i+len(doc_topics_1)+len(doc_topics_2)} {topic}" for i, topic in enumerate(doc_model_3.get_topic_info()['CustomName'].to_list()[1:], start=0)]

# Combine all labels 
doc_labels = [f"-1 {doc_initial[0]}"] + doc_topics_1 + doc_topics_2 + doc_topics_3

#### SAVING MODELS

In [49]:
# Save sentence models 
from bertopic.backend import OpenAIBackend

embedding_model = OpenAIBackend(client, "gpt-3.5-turbo")
sent_model_1.save("####", serialization="pytorch", save_ctfidf=True, save_embedding_model=sent_embedding_model)
sent_model_2.save("####", serialization="pytorch", save_ctfidf=True, save_embedding_model=sent_embedding_model)
sent_model_3.save("####", serialization="pytorch", save_ctfidf=True, save_embedding_model=sent_embedding_model)

In [50]:
# Save paragraph models 
from bertopic.backend import OpenAIBackend

embedding_model = OpenAIBackend(client, "gpt-3.5-turbo")
para_model_1.save("####", serialization="pytorch", save_ctfidf=True, save_embedding_model=para_embedding_model)
para_model_2.save("####", serialization="pytorch", save_ctfidf=True, save_embedding_model=para_embedding_model)
para_model_3.save("####", serialization="pytorch", save_ctfidf=True, save_embedding_model=para_embedding_model)

In [169]:
# Save document models
embedding_model = OpenAIBackend(client, "gpt-3.5-turbo")
doc_model_1.save("####", serialization="pytorch", save_ctfidf=True, save_embedding_model=doc_embedding_model)
doc_model_2.save("####", serialization="pytorch", save_ctfidf=True, save_embedding_model=doc_embedding_model)
doc_model_3.save("####", serialization="pytorch", save_ctfidf=True, save_embedding_model=doc_embedding_model)

#### SAVING REPRESENTATIVE DOCS FOR THE DATAFRAME

In [53]:
# Setup representative docs for merged model
sent_initial_rep_doc = ["Outlier documents"]

# Get topics from each model, skipping the outlier topic 
sent_topics_1_rep_doc = [f"{i} {topic}" for i, topic in enumerate(sent_model_1.get_topic_info()['Representative_Docs'].to_list()[1:], start=0)]
sent_topics_2_rep_doc = [f"{i+len(sent_topics_1)} {topic}" for i, topic in enumerate(sent_model_2.get_topic_info()['Representative_Docs'].to_list()[1:], start=0)]
sent_topics_3_rep_doc = [f"{i+len(sent_topics_1)+len(sent_topics_2)} {topic}" for i, topic in enumerate(sent_model_3.get_topic_info()['Representative_Docs'].to_list()[1:], start=0)]

# Combine all labels 
sent_rep_docs = [f"-1 {sent_initial_rep_doc[0]}"] + sent_topics_1_rep_doc + sent_topics_2_rep_doc + sent_topics_3_rep_doc

In [243]:
# Setup representative docs for merged model
para_initial_rep_doc = ["Outlier documents"]

# Get topics from each model, skipping the outlier topic 
para_topics_1_rep_doc = [f"{topic}" for i, topic in enumerate(para_model_1.get_topic_info()['Representative_Docs'].to_list()[1:], start=0)]
para_topics_2_rep_doc = [f"{topic}" for i, topic in enumerate(para_model_2.get_topic_info()['Representative_Docs'].to_list()[1:], start=0)]
para_topics_3_rep_doc = [f"{topic}" for i, topic in enumerate(para_model_3.get_topic_info()['Representative_Docs'].to_list()[1:], start=0)]

# Combine all labels 
para_rep_docs = [f"{para_initial_rep_doc[0]}"] + para_topics_1_rep_doc + para_topics_2_rep_doc + para_topics_3_rep_doc

In [176]:
# Setup representative docs for merged model
doc_initial_rep_doc = ["Outlier documents"]

# Get topics from each model, skipping the outlier topic 
doc_topics_1_rep_doc = [f"{i} {topic}" for i, topic in enumerate(doc_model_1.get_topic_info()['Representative_Docs'].to_list()[1:], start=0)]
doc_topics_2_rep_doc = [f"{i+len(doc_topics_1)} {topic}" for i, topic in enumerate(doc_model_2.get_topic_info()['Representative_Docs'].to_list()[1:], start=0)]
doc_topics_3_rep_doc = [f"{i+len(doc_topics_1)+len(doc_topics_2)} {topic}" for i, topic in enumerate(doc_model_3.get_topic_info()['Representative_Docs'].to_list()[1:], start=0)]

# Combine all labels 
doc_rep_docs = [f"-1 {doc_initial_rep_doc[0]}"] + doc_topics_1_rep_doc + doc_topics_2_rep_doc + doc_topics_3_rep_doc

#### CREATE MERGED MODELS

In [58]:
# Sentence merged model
sent_merged_model = BERTopic.merge_models([sent_model_1, sent_model_2, sent_model_3], min_similarity = 10.0, embedding_model=sent_embedding_model)

# Paragraph merged model 
para_merged_model = BERTopic.merge_models([para_model_1, para_model_2, para_model_3], min_similarity = 10.0, embedding_model=para_embedding_model)

In [177]:
# document merged model 
doc_merged_model = BERTopic.merge_models([doc_model_1, doc_model_2, doc_model_3], min_similarity = 10.0, embedding_model=doc_embedding_model)

In [77]:
# Set merged model labels 
# Setence model
sent_merged_model.set_topic_labels(sent_labels)


In [188]:
# Paragraph model
para_merged_model.set_topic_labels(para_labels)

In [193]:
# Set document model labels 
doc_merged_model.set_topic_labels(doc_labels)

In [61]:
# Create dataframes

# Sentence model
sent_df = sent_merged_model.get_topic_info()
sent_df["Representative_Docs"] = sent_rep_docs # reapply representative docs
sent_df.to_csv("####.csv")



In [244]:
# Paragraph model
para_df = para_merged_model.get_topic_info()
para_df["Representative_Docs"] = para_rep_docs # reapply representative docs
para_df.to_csv("####.csv")

In [245]:
para_df.head()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,OpenAI,MMR,Representative_Docs
0,-1,28640,-1_said_state_people_fires,-1 Outlier topic,"[said, state, people, fires, county, forest, n...","[wildfires, wildfire, fires, evacuation, droug...",[Wildfires and Risk Mitigation],"[said, state, people, fires, county, forest, n...",Outlier documents
1,0,611,0_maui_hawaii_lahaina_hawaiian,0 TDM - ['Maui Wildfire Mitigation Summit'],"[maui, hawaii, lahaina, hawaiian, island, hono...","[maui wildfires, maui wildfire, maui fires, ma...",[Maui Wildfire Mitigation Summit],"[maui, hawaii, lahaina, hawaiian, island, hono...","['we are maui strong', 'our factcheck sources ..."
2,1,503,1_police_court_man_trial,1 TDM - ['Legal battle over murder conviction'],"[police, court, man, trial, officers, prosecut...","[defendants, accused, prosecutors, conviction,...",[Legal battle over murder conviction],"[police, court, man, trial, officers, prosecut...",['new water crisis drag show threats nurses st...
3,2,458,2_home_house_flames_lost,2 TDM - ['Neighborhood Destruction Recovery'],"[home, house, flames, lost, saw, just, said, f...","[neighbor, neighbors, houses, homes, lived, ho...",[Neighborhood Destruction Recovery],"[home, house, flames, lost, saw, just, said, f...",['good friends and neighbors houses were demol...
4,3,294,3_forest_forests_forest management_funding,3 TDM - ['Wildfire Prevention Funding Allocati...,"[forest, forests, forest management, funding, ...","[reduce wildfire, forest management, wildfires...",[Wildfire Prevention Funding Allocation],"[forest, forests, forest management, funding, ...",['she also attributed the departments success ...


In [179]:
# Document model
doc_df = doc_merged_model.get_topic_info()
doc_df["Representative_Docs"] = doc_rep_docs # reapply representative docs
doc_df.to_csv("####.csv")

#### CREATE FULL HIERARCHICAL VISUALIZATION

In [86]:
import plotly.io as pio

# Sentence model
sent_fig = sent_merged_model.visualize_hierarchy(custom_labels=True)
pio.write_image(sent_fig, '####.png')  # Save the figure as a PNG file

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [88]:
# Sentence model
para_fig = para_merged_model.visualize_hierarchy(custom_labels=True)
pio.write_image(para_fig, '####.png')  # Save

In [195]:
# Document model
doc_fig = doc_merged_model.visualize_hierarchy(custom_labels=True)
pio.write_image(doc_fig, '####.png')  # Save

#### UMAP VISUALIZATION

#### SENTENCE LEVEL

In [113]:
## Sentence Visual 
# Umap settings
sent_umap_embeddings = UMAP(n_neighbors=15, min_dist=0.0, n_components=300, metric='cosine', random_state=42).fit_transform(sent_merged_model.topic_embeddings_)

In [114]:
# Cluster the embeddings
sent_hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom')
sent_cluster_labels = sent_hdbscan_model.fit_predict(sent_umap_embeddings)

In [115]:
print(f"\nHDBSCAN found {len(set(sent_cluster_labels))-1} meta-clusters\n")


HDBSCAN found 41 meta-clusters



In [180]:
import plotly.graph_objs as go

# Function to assign shapes based on keywords in labels
def assign_shapes(labels):
    shapes = []
    for label in labels:
        if "TDM" in label:
            shapes.append('circle')
        elif "Reddit" in label:
            shapes.append('square')
        elif "WCS" in label:
            shapes.append('triangle-up')
        else:
            shapes.append('circle')  # Default shape
    return shapes

# Assign shapes based on the sent_labels
shapes = assign_shapes(sent_labels)

# Create a scatter plot
sent_embed_fig = go.Figure(data=go.Scatter(
    x=sent_umap_embeddings[:, 0],
    y=sent_umap_embeddings[:, 1],
    mode='markers',
    text=sent_labels,  # This will be shown on hover
    marker=dict(
        size=8,
        color=sent_cluster_labels,  # Assign a unique color to each cluster
        colorscale='Rainbow',  # Color scale to use
        colorbar=dict(title='Cluster'),
        line=dict(width=1, color='DarkSlateGrey'),
        opacity=0.7, 
        symbol=shapes  # Use the assigned shapes
    )
))

# Update layout to add title and hover mode
sent_embed_fig.update_layout(
    title="Visualization of Topic Embeddings for Sentence Model",
    hovermode='closest',
    width=1000, 
    height=1000
)

# Show the figure
sent_embed_fig.show()

In [181]:
# Sentence model save

import plotly.offline as pyo

pyo.plot(sent_embed_fig, filename='####', auto_open=False)

'/Users/jeffreysachs/Documents/ProjectsEnvs/bert_embed/data/sent_embedding_visualization.html'

#### PARAGRAPH MODEL

In [127]:
# Paragraph Visual 

# Umap settings
para_umap_embeddings = UMAP(n_neighbors=15, min_dist=0.0, n_components=300, metric='cosine', random_state=42).fit_transform(para_merged_model.topic_embeddings_)

In [128]:
# Cluster the embeddings
para_hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom')
para_cluster_labels = para_hdbscan_model.fit_predict(para_umap_embeddings)

In [129]:
print(f"\nHDBSCAN found {len(set(para_cluster_labels))-1} meta-clusters\n")


HDBSCAN found 21 meta-clusters



In [190]:
import plotly.graph_objs as go

# Assign shapes based on the sent_labels
shapes = assign_shapes(para_labels)

# Create a scatter plot
para_embed_fig = go.Figure(data=go.Scatter(
    x=para_umap_embeddings[:, 0],
    y=para_umap_embeddings[:, 1],
    mode='markers',
    text=para_labels,  # This will be shown on hover
    marker=dict(
        size=8,
        color=para_cluster_labels,  # Assign a unique color to each cluster
        colorscale='Rainbow',  # Color scale to use
        colorbar=dict(title='Cluster'),
        line=dict(width=1, color='DarkSlateGrey'),
        opacity=0.7, 
        symbol=shapes  # Use the assigned shapes
    )
))

# Update layout to add title and hover mode
para_embed_fig.update_layout(
    title="Visualization of Topic Embeddings for Paragraph Model",
    hovermode='closest',
    width=1000, 
    height=1000
)

# Show the figure
para_embed_fig.show()

In [215]:
# colorbar_dict = dict(
#     title='Cluster',
#     tickvals=list(range(21)),  # Assuming clusters are labeled from 0 to 20
#     ticktext=[str(i) for i in range(21)]  # Label each tick with the cluster number
# )

# # Update the marker dictionary in your scatter plot to include the customized colorbar
# marker_dict = dict(
#     size=8,
#     color=para_cluster_labels,  # Assign a unique color to each cluster
#     colorscale='Rainbow',  # Color scale to use
#     colorbar=colorbar_dict,
#     line=dict(width=1, color='DarkSlateGrey'),
#     opacity=0.7, 
#     symbol=shapes  # Use the assigned shapes
# )

# # Use the updated marker_dict in your scatter plot
# para_embed_fig2 = go.Figure(data=go.Scatter(
#     x=para_umap_embeddings[:, 0],
#     y=para_umap_embeddings[:, 1],
#     mode='markers',
#     text=para_labels,
#     marker=marker_dict
# ))


# # Update layout to add title and hover mode
# para_embed_fig2.update_layout(
#     title="Visualization of Topic Embeddings for Paragraph Model",
#     hovermode='closest',
#     width=1000, 
#     height=1000
# )

# # Show the figure
# para_embed_fig2.show()

In [191]:
# paragraph model save

import plotly.offline as pyo

pyo.plot(para_embed_fig, filename='####', auto_open=False)

'/Users/jeffreysachs/Documents/ProjectsEnvs/bert_embed/data/para_embedding_visualization.html'

#### DOCUMENTS

In [202]:
# Document Level Visual 
doc_umap_embeddings = UMAP(n_neighbors=5, min_dist=0.0, n_components=100, metric='cosine', random_state=42).fit_transform(doc_merged_model.topic_embeddings_)

In [206]:
# Cluster the embeddings
doc_hdbscan_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom')
doc_cluster_labels = doc_hdbscan_model.fit_predict(doc_umap_embeddings)

In [207]:
print(f"\nHDBSCAN found {len(set(doc_cluster_labels))-1} meta-clusters\n")


HDBSCAN found 16 meta-clusters



In [268]:
# Assign shapes based on the sent_labels
shapes = assign_shapes(doc_labels)

# Create a scatter plot
doc_embed_fig = go.Figure(data=go.Scatter(
    x=doc_umap_embeddings[:, 0],
    y=doc_umap_embeddings[:, 1],
    mode='markers',
    text=doc_labels,  # This will be shown on hover
    marker=dict(
        size=8,
        color=doc_cluster_labels,  # Assign a unique color to each cluster
        colorscale='Rainbow',  # Color scale to use
        colorbar=dict(title='Cluster'),
        line=dict(width=1, color='DarkSlateGrey'),
        opacity=0.7, 
        symbol=shapes  # Use the assigned shapes
    )
))

# Update layout to add title and hover mode
doc_embed_fig.update_layout(
    title="Visualization of Topic Embeddings for Document Model",
    hovermode='closest',
    width=1000, 
    height=1000
)

# Show the figure
doc_embed_fig.show()

In [269]:
# Saving figure
pyo.plot(doc_embed_fig, filename='####', auto_open=False)

'/Users/jeffreysachs/Documents/ProjectsEnvs/bert_embed/data/doc_embedding_visualization.html'

#### SUMMARIZING CLUSTERS 


In [246]:
## Add clusters to dataframes

para_df["Cluster"] = para_cluster_labels

In [248]:
## Save df
para_df.to_csv("####.csv")

In [247]:
para_df.head()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,OpenAI,MMR,Representative_Docs,Cluster
0,-1,28640,-1_said_state_people_fires,-1 Outlier topic,"[said, state, people, fires, county, forest, n...","[wildfires, wildfire, fires, evacuation, droug...",[Wildfires and Risk Mitigation],"[said, state, people, fires, county, forest, n...",Outlier documents,15
1,0,611,0_maui_hawaii_lahaina_hawaiian,0 TDM - ['Maui Wildfire Mitigation Summit'],"[maui, hawaii, lahaina, hawaiian, island, hono...","[maui wildfires, maui wildfire, maui fires, ma...",[Maui Wildfire Mitigation Summit],"[maui, hawaii, lahaina, hawaiian, island, hono...","['we are maui strong', 'our factcheck sources ...",15
2,1,503,1_police_court_man_trial,1 TDM - ['Legal battle over murder conviction'],"[police, court, man, trial, officers, prosecut...","[defendants, accused, prosecutors, conviction,...",[Legal battle over murder conviction],"[police, court, man, trial, officers, prosecut...",['new water crisis drag show threats nurses st...,9
3,2,458,2_home_house_flames_lost,2 TDM - ['Neighborhood Destruction Recovery'],"[home, house, flames, lost, saw, just, said, f...","[neighbor, neighbors, houses, homes, lived, ho...",[Neighborhood Destruction Recovery],"[home, house, flames, lost, saw, just, said, f...",['good friends and neighbors houses were demol...,17
4,3,294,3_forest_forests_forest management_funding,3 TDM - ['Wildfire Prevention Funding Allocati...,"[forest, forests, forest management, funding, ...","[reduce wildfire, forest management, wildfires...",[Wildfire Prevention Funding Allocation],"[forest, forests, forest management, funding, ...",['she also attributed the departments success ...,20


In [302]:
def summarize_cluster(cluster, df, openai_client):
    """
    Summarize a cluster of topics without exceeding token limits.
    """

    
    # text = " ".join(df[df["Cluster"] == cluster]["Representative_Docs"])[0:50000]

    keywords = [word for keywords in df[df["Cluster"] == cluster]["KeyBERT"] for word in keywords]

    def create_summarization_request(keywords, openai_client):
        return openai_client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": f"""
                        I have a cluster of topics that are represented by the following keywords: {keywords}

                        Based on the information above, generate a short and descriptive cluster label of at most 5 words. Make sure it is in the following format:
                        <cluster label>
                        """}
            ],
            temperature=0.5,
            seed=1356
        )

    # Use the OpenAI summarization model to summarize the topic
    completion = create_summarization_request(keywords, openai_client)

    return completion.choices[0].message.content

In [303]:
cluster_labels = {} # Get all cluster labels
for i in range(-1, 21):
    cluster_labels[i] = summarize_cluster(i, para_df, client)
    

In [304]:
cluster_labels.items()

dict_items([(-1, 'Wildfire Prevention and Management'), (0, 'Wildland Firefighter Training and Careers'), (1, 'Climate Change Impact and Responses'), (2, 'Misinformation and Content Policy Enforcement'), (3, 'Political conflicts and conspiracy theories'), (4, 'Water Conservation and Management'), (5, 'Intense battle and supernatural beings'), (6, 'Fire Emblem Gameplay and Diversity'), (7, 'Air Quality and Wildfires Cluster'), (8, 'Wildfire Evacuation and Closure Information'), (9, 'Legal Justice and Social Issues'), (10, 'Public Health Measures and Impacts'), (11, 'California Environmental Policy and Elections'), (12, 'Wildlife Conservation and Habitat Management'), (13, 'Local Oregon Reporters and Writers'), (14, 'Urban Housing and Transportation Challenges'), (15, 'Wildfire Management and Response'), (16, 'Forest Carbon Offset Solutions'), (17, 'Wildfire Response and Impact'), (18, 'Wildfire Weather Events Cluster'), (19, 'Giant Sequoia Forest Conservation'), (20, 'Wildfire Managemen

In [318]:
totals = []
tdm_counts = []
reddit_counts = []
wcs_counts = []
for i in range(-1, 21):
    totals.append(para_df[(para_df['Cluster'] == i)]['Count'].sum())
    tdm_counts.append(para_df[(para_df['Cluster'] == i) & (para_df['CustomName'].str.contains('TDM'))]['Count'].sum())
    reddit_counts.append(para_df[(para_df['Cluster'] == i) & (para_df['CustomName'].str.contains('Reddit'))]['Count'].sum())
    wcs_counts.append(para_df[(para_df['Cluster'] == i) & (para_df['CustomName'].str.contains('WCS'))]['Count'].sum())

In [327]:
cluster_percentages = {}
for i in range(22):
    cluster_percentages[i] = {
        "Total":int((totals[i]/sum(totals)*100)), 
        "TDM": int((tdm_counts[i]/totals[i])*100), 
        "Reddit": int((reddit_counts[i]/totals[i])*100), 
        "WCS": int((wcs_counts[i]/totals[i])*100)
                               }

In [335]:
labels_with_percentages = [
    f"{key + 1} - {value} <br> Total: {cluster_percentages[key + 1]['Total']}% - TDM: {cluster_percentages[key + 1]['TDM']}% - Reddit: {cluster_percentages[key + 1]['Reddit']}% - WCS: {cluster_percentages[key + 1]['WCS']}%"
    for key, value in cluster_labels.items()
]

In [333]:
labels_with_percentages

['0 - Wildfire Prevention and Management \n Total: 10% - TDM: 59% - Reddit: 39% - WCS: 0%',
 '1 - Wildland Firefighter Training and Careers \n Total: 9% - TDM: 4% - Reddit: 95% - WCS: 0%',
 '2 - Climate Change Impact and Responses \n Total: 12% - TDM: 9% - Reddit: 90% - WCS: 0%',
 '3 - Misinformation and Content Policy Enforcement \n Total: 0% - TDM: 0% - Reddit: 100% - WCS: 0%',
 '4 - Political conflicts and conspiracy theories \n Total: 1% - TDM: 6% - Reddit: 93% - WCS: 0%',
 '5 - Water Conservation and Management \n Total: 2% - TDM: 90% - Reddit: 9% - WCS: 0%',
 '6 - Intense battle and supernatural beings \n Total: 1% - TDM: 0% - Reddit: 100% - WCS: 0%',
 '7 - Fire Emblem Gameplay and Diversity \n Total: 0% - TDM: 0% - Reddit: 100% - WCS: 0%',
 '8 - Air Quality and Wildfires Cluster \n Total: 1% - TDM: 73% - Reddit: 26% - WCS: 0%',
 '9 - Wildfire Evacuation and Closure Information \n Total: 0% - TDM: 100% - Reddit: 0% - WCS: 0%',
 '10 - Legal Justice and Social Issues \n Total: 1% -

In [343]:
# Assign shapes based on the sent_labels
shapes = assign_shapes(para_labels)

colorbar_dict = dict(
    # title='Cluster',
    tickvals=list(range(-1, 21)),  
    ticktext= labels_with_percentages # Label each tick with the cluster number
)

# Update the marker dictionary in your scatter plot to include the customized colorbar
marker_dict = dict(
    size=8,
    color=para_cluster_labels,  # Assign a unique color to each cluster
    colorscale='Rainbow',  # Color scale to use
    colorbar=colorbar_dict,
    line=dict(width=1, color='DarkSlateGrey'),
    opacity=0.7, 
    symbol=shapes  # Use the assigned shapes
)

# Use the updated marker_dict in your scatter plot
para_embed_fig2 = go.Figure(data=go.Scatter(
    x=para_umap_embeddings[:, 0],
    y=para_umap_embeddings[:, 1],
    mode='markers',
    text=[f"Cluster: {cluster + 1} - Topic: {label}" for cluster, label in zip(para_cluster_labels, para_labels)],
    marker=marker_dict
))


# Update layout to add title and hover mode
para_embed_fig2.update_layout(
    title="Visualization of Topic Embeddings for Paragraph Model",
    hovermode='closest',
    width=1300, 
    height=1100
)

# Show the figure
para_embed_fig2.show()

In [344]:
# paragraph model save

import plotly.offline as pyo

pyo.plot(para_embed_fig2, filename='####', auto_open=False)

'/Users/jeffreysachs/Documents/ProjectsEnvs/bert_embed/data/para_embedding_visualization_with_labels.html'

In [364]:
def group_labels(text):
    if "TDM" in text:
        return "TDM"
    elif "Reddit" in text:
        return "Reddit"
    elif "WCS" in text:
        return "WCS"
    elif "Outlier" in text:
        return "Outlier"

In [366]:
para_df['Group'] = para_df['CustomName'].apply(group_labels)

In [368]:
# Calculate the total counts for each group
group_totals = para_df.groupby('Group')['Count'].transform('sum')

In [381]:
# Convert raw number to percentages
topic_percentages = round((para_df['Count'].values / group_totals) * 100, 1)

In [392]:
# Lower outlier 
topic_percentages[0] = 0

In [439]:
# Extract 'Count' values from para_df to use as sizes. You might need to adjust this line to match your DataFrame structure.
max_size = 200
marker_sizes = np.minimum(topic_percentages*10, max_size)

# Update the marker dictionary in your scatter plot to include dynamic sizes based on 'Count'
marker_dict = dict(
    size=marker_sizes,  # Use 'Count' for marker sizes
    color=para_cluster_labels,  # Assign a unique color to each cluster
    colorscale='Rainbow',  # Color scale to use
    colorbar=colorbar_dict,
    line=dict(width=1, color='DarkSlateGrey'),
    opacity=0.7,
    symbol=shapes  # Use the assigned shapes
)

# Use the updated marker_dict in your scatter plot
para_embed_fig2 = go.Figure(data=go.Scatter(
    x=para_umap_embeddings[:, 0],
    y=para_umap_embeddings[:, 1],
    mode='markers',
    text=[f"Cluster: {cluster + 1} - Topic: {label} - Vol: {topic_p} % - Count: {count}" for cluster, label, topic_p, count in zip(para_cluster_labels, para_labels, topic_percentages, para_df['Count'])],
    marker=marker_dict
))

para_embed_fig2.add_shape(
    type="rect",
    x0=14,  # x coordinate of the bottom left corner
    y0=1.45,  # y coordinate of the bottom left corner
    x1=14.9,  # x coordinate of the top right corner
    y1=1.7,  # y coordinate of the top right corner
    line=dict(
        color="RoyalBlue",
        width=2,
    ),
    fillcolor="LightSkyBlue",
    opacity=0.3,  # Adjust opacity here for transparency
)

para_embed_fig2.add_shape(
    type="rect",
    x0=14,  # x coordinate of the bottom left corner
    y0=0.3,  # y coordinate of the bottom left corner
    x1=14.95,  # x coordinate of the top right corner
    y1=0.8,  # y coordinate of the top right corner
    line=dict(
        color="RoyalBlue",
        width=2,
    ),
    fillcolor="LightSkyBlue",
    opacity=0.3,  # Adjust opacity here for transparency
)

para_embed_fig2.add_shape(
    type="rect",
    x0=13.5,  # x coordinate of the bottom left corner
    y0=0.9,  # y coordinate of the bottom left corner
    x1=15.3,  # x coordinate of the top right corner
    y1=1.2,  # y coordinate of the top right corner
    line=dict(
        color="RoyalBlue",
        width=2,
    ),
    fillcolor="LightSkyBlue",
    opacity=0.3,  # Adjust opacity here for transparency
)

# Add an annotation inside the rectangle
para_embed_fig2.add_annotation(
    x=14.35,  # x coordinate of the annotation
    y=1.22,  # y coordinate of the annotation
    text="TDM Majority: Incidents, Challenges, Problems, Findings, Strategies",  # Annotation text
    showarrow=False,
    font=dict(
        size=14,
        color="Navy"
    ),
    borderwidth=2,
    borderpad=10,
)

# Add an annotation inside the rectangle
para_embed_fig2.add_annotation(
    x=14.5,  # x coordinate of the annotation
    y=0.815,  # y coordinate of the annotation
    text="WCS Majority: Initiatives, Grants, Research, Projects, Policies",  # Annotation text
    showarrow=False,
    font=dict(
        size=14,
        color="Navy"
    ),
    borderwidth=2,
    borderpad=10,
)

# Add an annotation inside the rectangle
para_embed_fig2.add_annotation(
    x=14.5,  # x coordinate of the annotation
    y=1.68,  # y coordinate of the annotation
    text="Reddit Majority: Information Seeking, Recommendations, Sharing",  # Annotation text
    showarrow=False,
    font=dict(
        size=14,
        color="Navy"
    ),
    borderwidth=2,
    borderpad=10,
)

# Update layout to add title and hover mode
para_embed_fig2.update_layout(
    title="Visualization of Topic Embeddings for Paragraph Model",
    hovermode='closest',
    width=1300, 
    height=1100,
    xaxis=dict(range=[13, 15.5]),  # Set x-axis range
    yaxis=dict(range=[0.25, 1.7])  # Set y-axis range
)

# Show the figure
para_embed_fig2.show()

In [440]:
# paragraph model save

import plotly.offline as pyo

pyo.plot(para_embed_fig2, filename='####', auto_open=False)

'/Users/jeffreysachs/Documents/ProjectsEnvs/bert_embed/data/para_embedding_visualization_with_boxes.html'