In [None]:
import pandas as pd
import numpy as np
import sqlite3
from tqdm.auto import tqdm
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models import Nmf
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import HdpModel
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim import corpora
import regex as re
from spacy.lang.en.stop_words import STOP_WORDS
import nltk
from nltk.tokenize import sent_tokenize
import spacy
import itertools
from joblib import Parallel, delayed
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from itertools import product
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import time
import matplotlib.pyplot as plt
from bertopic.representation import MaximalMarginalRelevance
import pickle

In [None]:
#Clustered Diseases db from k-means
# Read the whole `clustered_diseases` table into a DataFrame.
con = sqlite3.connect("clustered_diseases.db")
try:
    df = pd.read_sql_query("SELECT * from clustered_diseases", con)
finally:
    # Close the SQLite handle even when the query raises.
    con.close()

df

In [None]:
def clean_wiki_text(raw_text):
    """Strip Wikipedia boilerplate and extraction artifacts from one article.

    The value is converted with ``str()`` first, then the substitutions below
    are applied in order. Newlines are deliberately kept because later cells
    split the text into paragraphs on ``\\n``.

    Returns the cleaned text as a string.
    """
    # Filter Wikipedia-related elements
    text = re.sub(r'^From Wikipedia, the free encyclopedia', '', str(raw_text), flags=re.MULTILINE)
    text = re.sub(r'Retrieved from.*\s.*', '', text)
    text = re.sub(r'\[.*?\]', ' ', text)  # bracketed spans such as [1] or [edit]
    # With DOTALL, everything from a line starting with one of these headings
    # to the end of the article is replaced by a single space.
    for heading in ('References', 'Citations', 'Footnotes'):
        text = re.sub(rf'^{heading}.*', ' ', text, flags=re.MULTILINE | re.DOTALL)
    # Two passes of double-space -> single-space (kept exactly as before).
    text = re.sub(r'  ', ' ', text)
    text = re.sub(r'  ', ' ', text)
    text = re.sub(r'[\xa0\u200b\u202f]', ' ', text)  # Removes extra artifacts
    text = re.sub(r'\b[A-Za-z]\.', ' ', text)  # Removes letters in a list like a.
    return text


def remove_stop_words(text):
    """Lower-case whitespace-separated words and drop those found in STOP_WORDS."""
    return " ".join(
        word.lower() for word in text.split() if word.lower() not in STOP_WORDS
    )


# Clean every article once, then derive the stop-word-free column from the
# cleaned text. Assigning whole columns replaces the per-row df.at writes.
df["text"] = [clean_wiki_text(raw) for raw in tqdm(df["text"], total=len(df))]

#Create a column with stopwords removed
df["text_no_SW"] = df["text"].map(remove_stop_words)

df

In [None]:
# Download NLTK's 'punkt' resource; sent_tokenize is called in split_paragraphs below.
# NOTE(review): newer NLTK releases may need 'punkt_tab' instead - confirm against the installed version.
nltk.download('punkt')

def split_paragraphs(text):
    """Split text on newlines and glue short pieces together.

    Blank lines are ignored. Consecutive non-empty lines are joined with a
    single space until ``sent_tokenize`` reports at least two sentences for
    the accumulated text; that text then becomes one paragraph. Whatever is
    left over at the end is emitted as a final paragraph.

    Returns a list of paragraph strings.
    """
    stripped_lines = (line.strip() for line in text.split('\n'))

    merged = []
    pending = ''
    for piece in stripped_lines:
        if not piece:
            continue
        pending = f"{pending} {piece}" if pending else piece
        if len(sent_tokenize(pending)) < 2:
            # Still too short (e.g. a header line): keep accumulating.
            continue
        merged.append(pending.strip())
        pending = ''

    if pending:
        merged.append(pending.strip())

    return merged

In [None]:
# One record per paragraph, carrying the article's title and k-means cluster.
rows = [
    {
        'title' : row['title'],
        'cluster' : row['cluster'],
        'paragraph' : para
    }
    for _, row in df.iterrows()
    for para in split_paragraphs(row['text'])
]

paragraph_df = pd.DataFrame(rows)
paragraph_df.shape

In [None]:
# Analyze a single sentence
def analyze_sentence(sent):
    """Collect lemmas by part of speech for one sentence.

    ``sent`` is iterated for tokens exposing ``lemma_`` and ``pos_`` and must
    also provide ``ents`` and ``noun_chunks``.

    Returns a 7-tuple of lists:
    (nouns, adjectives, verbs, lemmas, nav, entities, noun_chunks) where
    nouns covers NOUN/PROPN, adjectives covers ADJ/ADV, verbs covers VERB/AUX,
    lemmas holds every token's lemma, and nav holds the lemmas of all three
    groups in token order. Entities and noun chunks are converted with str().
    """
    noun_tags = ("NOUN", "PROPN")
    modifier_tags = ("ADJ", "ADV")
    verb_tags = ("VERB", "AUX")

    nouns, adjectives, verbs, lemmas, nav = [], [], [], [], []

    for tok in sent:
        lemma = tok.lemma_
        lemmas.append(lemma)

        tag = tok.pos_
        if tag in noun_tags:
            bucket = nouns
        elif tag in modifier_tags:
            bucket = adjectives
        elif tag in verb_tags:
            bucket = verbs
        else:
            # Other parts of speech only contribute to `lemmas`.
            continue

        bucket.append(lemma)
        nav.append(lemma)

    entities = [str(ent) for ent in sent.ents]
    chunks = [str(chunk) for chunk in sent.noun_chunks]
    return (nouns, adjectives, verbs, lemmas, nav, entities, chunks)



# Rejoin the individual parts of the sentences
def resentence(words):
    """Encode a list of per-sentence word lists as one string.

    Words within a sentence are joined with "|" and sentences with "#".
    "." is not used as the sentence separator because it can also be part of
    an entity; "#" is less likely to be. Empty sentences and empty words are
    dropped.
    """
    encoded_sentences = []
    for sent_words in words:
        if len(sent_words) == 0:
            continue
        non_empty = [w for w in sent_words if len(w) > 0]
        encoded_sentences.append("|".join(non_empty))
    return "#".join(encoded_sentences)


# Load spaCy's "en_core_web_lg" pipeline into the notebook namespace.
# Note: process_paragraph (next cell) loads its own copy and does not use this object.
nlp = spacy.load("en_core_web_lg")

In [None]:
def process_paragraph(i, text, STOP_WORDS):
    """Run spaCy over one paragraph and return its linguistic features.

    Parameters
    ----------
    i : value stored under the "index" key of the result.
    text : paragraph text to analyse.
    STOP_WORDS : container of lower-case stop words used for "para_no_SW".

    Returns
    -------
    dict with the "#"/"|"-encoded nouns, adjectives, verbs, lemmas, nav,
    entities and noun_chunks (see ``resentence``), the counts ``no_tokens``,
    ``no_sentences`` and ``no_noun_chunks``, and ``para_no_SW`` (the paragraph
    with stop words removed, original casing kept).
    """
    import spacy

    # NOTE(review): the pipeline is loaded on every call, which dominates the
    # run time. Consider nlp.pipe(..., n_process=...) or a per-worker cache.
    nlp = spacy.load("en_core_web_lg")
    doc = nlp(text)

    nouns, adjectives, verbs, lemmas, nav = [], [], [], [], []
    entities, noun_chunks = [], []

    for sentence in doc.sents:
        (sent_nouns, sent_adjectives, sent_verbs, sent_lemmas,
         sent_nav, sent_entities, sent_noun_chunks) = analyze_sentence(sentence)

        nouns.append(sent_nouns)
        adjectives.append(sent_adjectives)
        verbs.append(sent_verbs)
        nav.append(sent_nav)
        lemmas.append(sent_lemmas)
        entities.append(sent_entities)
        noun_chunks.append(sent_noun_chunks)

    # Count items directly from the per-sentence lists. Splitting the encoded
    # string on "|" alone fused the items either side of every "#" sentence
    # separator and reported 1 for an empty paragraph. Empty strings are
    # skipped to match what resentence() keeps.
    no_tokens = sum(1 for sent in lemmas for lemma in sent if len(lemma) > 0)
    no_noun_chunks = sum(1 for sent in noun_chunks for chunk in sent if len(chunk) > 0)

    data = {
        "index": i,
        "nouns": resentence(nouns),
        "adjectives": resentence(adjectives),
        "verbs": resentence(verbs),
        "lemmas": resentence(lemmas),
        "nav": resentence(nav),
        "entities": resentence(entities),
        "noun_chunks": resentence(noun_chunks),
        "no_tokens": no_tokens,
        "no_sentences": len(lemmas),
        "no_noun_chunks": no_noun_chunks,
        "para_no_SW": " ".join([
            word for word in text.split() if word.lower() not in STOP_WORDS
        ])
    }

    return data

In [None]:
from joblib import Parallel, delayed

# Fan the paragraphs out over all available cores (n_jobs=-1); each task
# returns the feature dict built by process_paragraph.
paragraph_iter = tqdm(paragraph_df.iterrows(), total=len(paragraph_df))
tasks = (
    delayed(process_paragraph)(i, row['paragraph'], STOP_WORDS)
    for i, row in paragraph_iter
)
results = Parallel(n_jobs=-1)(tasks)

In [None]:
# Collect the per-paragraph dicts and index them by the "index" value each carries.
result_df = pd.DataFrame(results)
result_df = result_df.set_index("index")
result_df.shape

In [None]:
# Persist the per-paragraph features. The connection is closed afterwards so
# the SQLite file handle is not left open for the rest of the session.
sql = sqlite3.connect("para_lemmas.db")
try:
    result_df.to_sql("para_lemmas", sql, if_exists="replace")
finally:
    sql.close()

In [None]:
# DataFrame.update only overwrites columns that already exist in the caller,
# so it never added the new feature columns (nav, lemmas, no_sentences, ...).
# Join on the shared index instead. Dropping any previously joined copies
# first keeps the cell safe to re-run.
paragraph_df = (
    paragraph_df
    .drop(columns=result_df.columns, errors="ignore")
    .join(result_df)
)

In [None]:
# Persist the paragraph table. The connection is closed afterwards so the
# SQLite file handle is not left open for the rest of the session.
sql = sqlite3.connect("para_df.db")
try:
    paragraph_df.to_sql("para_df", sql, if_exists="replace")
finally:
    sql.close()

In [None]:
# Preview the first rows of the paragraph table.
paragraph_df.head()

In [None]:
def filter_words(text):
    """Split a "|"/"#"-encoded string into lower-cased words.

    Pieces of length 0 or 1 and pieces whose lower-cased form is in
    STOP_WORDS are dropped. Returns a list of strings.
    """
    kept = []
    for piece in re.split(r'\||\#', text):
        lowered = piece.lower()
        if len(piece) > 1 and lowered not in STOP_WORDS:
            kept.append(lowered)
    return kept

def get_gensim_text(paragraph_df, cluster):
    """Build tokenised documents for one cluster.

    Keeps the rows whose 'cluster' equals ``cluster`` and returns, per row,
    the filtered words of 'nav' followed by those of 'entities'. A missing
    (NA) cell contributes no words.
    """
    in_cluster = paragraph_df[paragraph_df['cluster'] == cluster]

    def words_from(cell):
        return filter_words(cell) if pd.notna(cell) else []

    return [
        words_from(row['nav']) + words_from(row['entities'])
        for _, row in in_cluster.iterrows()
    ]

In [None]:
def coherence_score(model, text, dictionary):
    """Return the 'c_v' coherence of ``model`` on the tokenised ``text``.

    The gensim CoherenceModel is built with ``processes=1``.
    """
    scorer = CoherenceModel(
        model=model,
        texts=text,
        dictionary=dictionary,
        coherence='c_v',
        processes=1,
    )
    return scorer.get_coherence()

In [None]:
def combine_filtered_text(row):
    """Return the filtered 'nav' words followed by the filtered 'entities' words.

    A missing (NA) cell contributes no words.
    """
    combined = []
    for column in ('nav', 'entities'):
        cell = row[column]
        if pd.notna(cell):
            combined.extend(filter_words(cell))
    return combined

# Row-wise word lists built from the nav and entities columns.
filtered_text = paragraph_df.apply(combine_filtered_text, axis=1)
paragraph_df['filtered_text'] = filtered_text

filtered_text[0]

In [None]:
def filter_lemmas(row):
    """Return the filtered words of the row's 'lemmas' cell, or [] when it is NA."""
    encoded = row['lemmas']
    if pd.notna(encoded):
        return filter_words(encoded)
    return []

# The column name previously carried a trailing space ('filtered_lemmas '),
# so the lookup on the next line raised KeyError.
paragraph_df['filtered_lemmas'] = paragraph_df.apply(filter_lemmas, axis=1)

paragraph_df['filtered_lemmas'][0]

# BERTopic

In [None]:
def calculate_topic_diversity(model, top_n=10):
    """Share of unique words among the top words of all topics.

    Takes the first ``top_n`` (word, weight) pairs of every topic returned by
    ``model.get_topics()``, skipping the outlier topic -1, and returns
    unique words / total words. Returns 0 when there are no words.
    """
    pooled_words = [
        word
        for topic_id, words in model.get_topics().items()
        if topic_id != -1  # exclude outliers
        for word, _ in words[:top_n]
    ]
    if not pooled_words:
        return 0
    return len(set(pooled_words)) / len(pooled_words)

In [None]:
def get_top_words(model, topic_id, n=3):
    """Return the first ``n`` words of a topic joined by ", ".

    ``model.get_topic(topic_id)`` is expected to yield (word, weight) pairs.
    A falsy result gives the string "No Topic".
    """
    words = model.get_topic(topic_id)
    if not words:
        return "No Topic"
    return ', '.join(word for word, _ in words[:n])

In [None]:
def merge_short_paragraphs(df, max_sentences=2):
    """Merge paragraphs with at most ``max_sentences`` sentences into a neighbour.

    Expects the columns 'title', 'index', 'doc_str_lem' and 'no_sentences'.
    Rows are grouped by title with a stable sort, so paragraphs of one
    article keep their incoming order. A short paragraph is merged with the
    adjacent paragraph of the same title that has fewer sentences (the
    previous one wins ties). A paragraph already absorbed by the row before
    it is not offered as a "previous" neighbour.

    Merging updates 'doc_str_lem' (texts joined with a space, earlier text
    first), 'no_sentences' (sum) and 'index' (minimum). All other columns
    come from the earlier of the two rows.

    Fix over the earlier version: merging with the previous paragraph now
    extends the row that was already emitted for it. Previously a second row
    containing the previous text was appended, duplicating that text.

    Returns a new DataFrame sorted by ['title', 'index'] with a fresh index.
    An empty input is returned as-is after sorting.
    """
    # A stable sort is required: the default algorithm may reorder the
    # paragraphs that share a title, which would break adjacency.
    df = df.sort_values(['title'], kind='stable').reset_index(drop=True)

    merged_rows = []
    absorbed = set()  # positions folded into the row emitted just before them
    row_slot = {}     # position -> index in merged_rows of the row holding it

    for i in range(len(df)):
        if i in absorbed:
            # This paragraph has already been merged, skip
            continue

        current = df.loc[i]
        if current['no_sentences'] > max_sentences:
            # Paragraph is long enough, no changes needed
            row_slot[i] = len(merged_rows)
            merged_rows.append(current.to_dict())
            continue

        # Identify neighbors with same title
        prev_idx = i - 1 if i > 0 else None
        next_idx = i + 1 if i < len(df) - 1 else None

        has_prev = (
            prev_idx is not None
            and prev_idx not in absorbed
            and df.loc[prev_idx, 'title'] == current['title']
        )
        has_next = (
            next_idx is not None
            and df.loc[next_idx, 'title'] == current['title']
        )

        # Choose neighbor to merge with (shorter of the two if there are two).
        # The previous neighbour is measured on its emitted (possibly already
        # merged) row, since that is the paragraph that would be extended.
        candidates = []
        if has_prev:
            candidates.append(('prev', merged_rows[row_slot[prev_idx]]['no_sentences']))
        if has_next:
            candidates.append(('next', df.loc[next_idx, 'no_sentences']))

        if not candidates:
            # No neighbors to merge with, keep as is
            row_slot[i] = len(merged_rows)
            merged_rows.append(current.to_dict())
            continue

        # Find neighbor with smallest no_sentences
        best_side = min(candidates, key=lambda x: x[1])[0]

        if best_side == 'prev':
            # Extend the row already emitted for the previous paragraph.
            target = merged_rows[row_slot[prev_idx]]
            target['doc_str_lem'] = target['doc_str_lem'] + " " + current['doc_str_lem']
            target['no_sentences'] = target['no_sentences'] + current['no_sentences']
            target['index'] = min(target['index'], current['index'])
            row_slot[i] = row_slot[prev_idx]
        else:
            neighbor = df.loc[next_idx]
            merged_paragraph = current.to_dict()
            merged_paragraph.update({
                'doc_str_lem'  : current['doc_str_lem'] + " " + neighbor['doc_str_lem'],
                'no_sentences' : current['no_sentences'] + neighbor['no_sentences'],
                'index'        : min(current['index'], neighbor['index'])
            })
            row_slot[i] = len(merged_rows)
            merged_rows.append(merged_paragraph)
            absorbed.add(next_idx)  # Skip merged neighbor

    if not merged_rows:
        # Nothing to merge; building a frame from [] would have no columns to sort by.
        return df

    merged_df = pd.DataFrame(merged_rows).sort_values(['title', 'index']).reset_index(drop=True)
    return merged_df

In [None]:
# Size and average sentence count before merging short paragraphs.
print(paragraph_df.shape)

mean_sentences = paragraph_df['no_sentences'].mean()
print(f"Average number of sentences per paragraph: {mean_sentences:.2f}")

In [None]:
# Merge paragraphs of at most two sentences into a neighbour, then report the
# same size/average figures as above for comparison.
merged_df = merge_short_paragraphs(paragraph_df, max_sentences=2)

print(merged_df.shape)

mean_sentences_merged = merged_df['no_sentences'].mean()
print(f"Average number of sentences per paragraph: {mean_sentences_merged:.2f}")

In [None]:
# Whitespace-tokenise each merged document; these token lists are passed to
# evaluate_topic_model as the coherence texts.
merged_df['filtered_lemmas'] = [doc.split() for doc in merged_df['doc_str_lem']]

In [None]:
# Sentence-embedding model shared by all BERTopic runs below. The id now uses
# the canonical casing ('...-v2'), matching the later cell that loads the same model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# One unit-length embedding per merged paragraph, in merged_df row order.
corpus_embeddings = model.encode(
    merged_df['doc_str_lem'].tolist(),
    show_progress_bar=True,
    normalize_embeddings=True,
)

In [None]:
from bertopic.representation import MaximalMarginalRelevance

In [None]:
def evaluate_topic_model(topic_model, docs, lemmas):
    """Return the 'c_v' coherence of a fitted topic model.

    The words of every topic (taken in ascending topic-id order, so the
    outlier topic -1 is included when present) are scored against the
    tokenised documents ``lemmas``. The gensim dictionary is built from
    ``lemmas`` and pruned with ``filter_extremes(no_below=5, no_above=0.7)``.
    ``docs`` is accepted but not used.
    """
    # NOTE(review): topic words that are pruned from, or never present in,
    # the dictionary may make CoherenceModel fail - confirm.
    topic_words = [
        [word for word, _ in topic_model.get_topic(topic_id)]
        for topic_id in sorted(topic_model.get_topics())
    ]

    dictionary = Dictionary(lemmas)
    dictionary.filter_extremes(no_below=5, no_above=0.7)

    scorer = CoherenceModel(
        topics=topic_words, texts=lemmas, dictionary=dictionary, coherence='c_v'
    )
    return scorer.get_coherence()

def run_hdbscan_grid(cluster_docs, cluster_lemmas, cluster_embeddings, param_grid,
                     embedding_model, umap_model, vectorizer_model, representation_model,
                     cluster_label=None):
    """Grid-search HDBSCAN settings for one cluster and keep the most coherent model.

    Parameters
    ----------
    cluster_docs : documents passed to ``BERTopic.fit_transform``.
    cluster_lemmas : tokenised documents used for the coherence score.
    cluster_embeddings : precomputed embeddings for ``cluster_docs``.
    param_grid : iterable of ``(min_cluster_size, min_samples)`` pairs.
    embedding_model, umap_model, vectorizer_model, representation_model :
        components handed to every BERTopic instance.
    cluster_label : optional label, used only in the progress-bar text.
        The function used to read a notebook global of this name and
        raised NameError when that global did not exist.

    Returns
    -------
    (best_model, best_topics, best_coherence, best_params). When no setting
    is accepted these are (None, None, -1, None).

    Settings for which at least half of the documents are outliers are
    skipped. A setting that raises is reported and skipped.
    """
    best_model = None
    best_coherence = -1
    best_params = None
    best_topics = None

    desc = "Grid search" if cluster_label is None else f"Grid search cluster {cluster_label}"

    for min_cluster_size, min_samples in tqdm(param_grid, desc=desc, leave=False):
        hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples, prediction_data=True)

        topic_model = BERTopic(
            embedding_model=embedding_model,
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            vectorizer_model=vectorizer_model,
            representation_model=representation_model,
            verbose=False
        )
        try:
            topics, _ = topic_model.fit_transform(cluster_docs, cluster_embeddings)
            outliers = sum(1 for t in topics if t == -1)
            if outliers >= len(cluster_docs) * 0.5:
                continue # Skip bad models with too many outliers
            coherence = evaluate_topic_model(topic_model, cluster_docs, cluster_lemmas)
            if coherence > best_coherence:
                best_model = topic_model
                best_coherence = coherence
                best_topics = topics
                best_params = {'min_cluster_size': min_cluster_size,
                               'min_samples' : min_samples
                              }
        except Exception as e:
            # Report the failure instead of dropping it silently; otherwise a
            # systematic error looks the same as "no good model found".
            tqdm.write(
                f"Skipping min_cluster_size={min_cluster_size}, "
                f"min_samples={min_samples}: {type(e).__name__}: {e}"
            )
            time.sleep(1) # In case of hugging face rate limit
            continue
    return best_model, best_topics, best_coherence, best_params

In [None]:
# Grid search: for every k-means cluster, try each HDBSCAN setting and keep the
# BERTopic model with the best coherence (see run_hdbscan_grid).

#Static Models
# These three component instances are shared by every BERTopic model built below.
umap_model = UMAP(random_state=42)
vectorizer_model = CountVectorizer(ngram_range=(1,2))
representation_model = MaximalMarginalRelevance(diversity=0.5)


# HDBSCAN parameter grid
hdbscan_params = list(product(range(5,16), [1,5,10,15])) #min_cluster_size x min_samples

# NOTE: `results` is rebound here to a list of per-cluster dicts (it previously
# held the joblib output, and a later cell rebinds it to a dict).
results = []

# `cluster_label` must keep this exact name: run_hdbscan_grid reads it as a
# global for its progress-bar description.
for cluster_label in tqdm(merged_df['cluster'].unique()):
    cluster_mask = merged_df['cluster'] == cluster_label
    
    #For BERTopic
    cluster_docs = merged_df.loc[cluster_mask, 'doc_str_lem'].tolist()
    
    #For gensim
    cluster_lemmas = merged_df.loc[cluster_mask, 'filtered_lemmas'].tolist()
    # Boolean-mask the embedding matrix with the same row selection.
    # assumes corpus_embeddings rows are in merged_df row order - TODO confirm
    cluster_embeddings = corpus_embeddings[cluster_mask]
    
    best_model, best_topics, best_coherence, best_params = run_hdbscan_grid(
        cluster_docs, cluster_lemmas, cluster_embeddings,
        param_grid=hdbscan_params,
        embedding_model=model,
        umap_model=umap_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model
    )
    
    # best_model / best_topics / best_params are None and best_coherence is -1
    # when run_hdbscan_grid accepted no setting for this cluster.
    results.append({
        "cluster"   : cluster_label,
        "model"     : best_model,
        "topics"    : best_topics,
        "coherence" : best_coherence,
        "params"    : best_params
    })

In [None]:
# Summarise the best model found for each cluster by the grid search.
for result in results:
    print(f"\nCluster {result['cluster']}")
    if result['model'] is None:
        # The grid search accepted no setting for this cluster, so topics and
        # params are None and the lines below would raise.
        print("No model was accepted by the grid search for this cluster.")
        continue
    print("Topics:", len(set(result['topics'])))
    print("Best Models:")
    print(f" - Best Coherence Scores: {result['coherence']:.4f}")
    print(f" - Topic Diversity: {calculate_topic_diversity(result['model']):.4f}")
    print(" - Best Parameters:")
    for param_name, param_value in result['params'].items():
        print(f"  - {param_name}: {param_value}")

# Keep a reference to the grid-search results obtained with
# MaximalMarginalRelevance(diversity=0.5). The variable name is left unchanged.
MRR_results = results

In [None]:
# Best Models From Original Search Grid
# Refit one BERTopic model per cluster with fixed HDBSCAN settings instead of
# repeating the grid search.

#Static Models
umap_model = UMAP(random_state=42)
vectorizer_model = CountVectorizer(ngram_range=(1,2))
representation_model = MaximalMarginalRelevance(diversity=0.5)


# HDBSCAN settings per cluster label; a label missing here raises KeyError below.
cluster_params = {
    0: {"min_cluster_size": 6, "min_samples": 5},
    1: {"min_cluster_size": 5, "min_samples": 5},
    2: {"min_cluster_size": 7, "min_samples": 1},
    3: {"min_cluster_size": 6, "min_samples": 5},
    4: {"min_cluster_size": 5, "min_samples": 5},
    5: {"min_cluster_size": 5, "min_samples": 5},
    6: {"min_cluster_size": 5, "min_samples": 5},
    7: {"min_cluster_size": 7, "min_samples": 1},
    8: {"min_cluster_size": 10, "min_samples": 15}
}

# NOTE: `results` becomes a dict keyed by cluster label here (the grid-search
# cell used a list), and its entries carry no "coherence" key.
results = {}

for cluster_label in tqdm(merged_df['cluster'].unique()):
    cluster_mask = merged_df['cluster'] == cluster_label
    
    cluster_docs = merged_df.loc[cluster_mask, 'doc_str_lem'].tolist()
    cluster_lemmas = merged_df.loc[cluster_mask, 'filtered_lemmas'].tolist()
    cluster_embeddings = corpus_embeddings[cluster_mask]
    
    params = cluster_params[cluster_label]
    
    hdbscan_model = HDBSCAN(
        min_cluster_size=params['min_cluster_size'],
        min_samples=params['min_samples'],
        prediction_data=True
    )
    
    # NOTE(review): `model` must still be the SentenceTransformer at this point;
    # later cells rebind `model` to a BERTopic instance - confirm run order.
    topic_model = BERTopic(
        embedding_model=model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,
        verbose=False
    )
    
    topics, probs = topic_model.fit_transform(cluster_docs, cluster_embeddings)
    
    #coherence = evaluate_topic_model(topic_model, cluster_docs, cluster_lemmas)
    
    # "topics" holds one topic id per document of this cluster, in the order
    # of cluster_docs.
    results[cluster_label] = {
        'cluster' : cluster_label,
        'model': topic_model,
        "topics": topics,
        #"coherence" : coherence,
        "params" : params
    }
    
    #topic_model.save(f"Models/bertopic_cluster_{cluster_label}_redo.model")

## Outlier Handling

In [None]:
def compute_per_cluster_centroids(df, embeddings):
    """Mean embedding per (cluster, topic_number) pair.

    Rows whose 'topic_number' is not a string, or ends with '_-1' (outliers),
    are ignored. ``embeddings`` is indexed with the DataFrame's index labels.

    Returns a dict mapping (cluster, topic_number) to the mean vector.
    """
    grouped = defaultdict(list)

    for idx, row in df.iterrows():
        topic_id = row['topic_number']
        if not isinstance(topic_id, str) or topic_id.endswith('_-1'):
            continue
        grouped[(row['cluster'], topic_id)].append(embeddings[idx])

    centroids = {}
    for key, vectors in grouped.items():
        centroids[key] = np.mean(vectors, axis=0)

    return centroids

def reassign_outliers_by_cluster(df, embeddings, centroids, similarity_threshold=0.7):
    """Move outlier rows to the most similar topic of their own cluster.

    Parameters
    ----------
    df : DataFrame with 'cluster', 'topic_number' and 'topic_label' columns.
        Outliers are the rows whose 'topic_number' ends with '_-1'.
    embeddings : indexable by the DataFrame's index labels, one vector per row.
    centroids : dict mapping (cluster, topic_number) to a centroid vector.
    similarity_threshold : minimum cosine similarity required to reassign.

    Returns
    -------
    A copy of ``df`` in which every outlier whose best same-cluster centroid
    reaches the threshold has that topic's 'topic_number' and the topic's
    most frequent 'topic_label'. The input frame is not modified.

    Labels are taken from the rows that were already assigned before any
    reassignment. Previously the lookup ran after the row's own topic_number
    had been rewritten, so the row's own "Outlier" label took part in the
    mode and could win a tie. Cosine similarity is computed with NumPy
    against all centroids of a cluster at once; zero-length vectors score 0.
    """
    df = df.copy()

    is_outlier = df['topic_number'].astype(str).str.endswith('_-1')

    # Most frequent label per topic, from non-outlier rows only.
    topic_labels = {}
    for topic, labels in df[~is_outlier].groupby('topic_number', sort=False)['topic_label']:
        label_mode = labels.mode()
        if not label_mode.empty:
            topic_labels[topic] = label_mode.iloc[0]

    # Stack the centroids of each cluster once so every outlier is scored
    # with a single matrix product.
    grouped = defaultdict(list)
    for (cl, topic_id), centroid in centroids.items():
        grouped[cl].append((topic_id, np.asarray(centroid, dtype=float).ravel()))
    cluster_index = {}
    for cl, items in grouped.items():
        matrix = np.vstack([vec for _, vec in items])
        cluster_index[cl] = ([tid for tid, _ in items], matrix, np.linalg.norm(matrix, axis=1))

    for idx, row in df[is_outlier].iterrows():
        # Only look at centroids from the same cluster
        entry = cluster_index.get(row['cluster'])
        if entry is None:
            continue
        topic_ids, matrix, norms = entry

        emb = np.asarray(embeddings[idx], dtype=float).ravel()
        dots = matrix @ emb
        denom = norms * np.linalg.norm(emb)
        sims = np.divide(dots, denom, out=np.zeros_like(dots), where=denom > 0)

        best = int(np.argmax(sims))  # first maximum wins ties
        if sims[best] >= similarity_threshold:
            best_topic = topic_ids[best]
            df.at[idx, 'topic_number'] = best_topic
            if best_topic in topic_labels:
                df.at[idx, 'topic_label'] = topic_labels[best_topic]

    return df

In [None]:
def compute_global_centroids(df, embeddings):
    """Mean embedding per topic_number, ignoring the cluster column.

    Rows whose 'topic_number' is not a string, or ends with '_-1' (outliers),
    are ignored. ``embeddings`` is indexed with the DataFrame's index labels.

    Returns a dict mapping topic_number to the mean vector.
    """
    grouped = defaultdict(list)

    for idx, row in df.iterrows():
        topic_id = row['topic_number']
        if not isinstance(topic_id, str) or topic_id.endswith('_-1'):
            continue
        grouped[topic_id].append(embeddings[idx])

    centroids = {}
    for topic, vectors in grouped.items():
        centroids[topic] = np.mean(vectors, axis=0)

    return centroids

def reassign_outliers_globally(df, embeddings, centroids, similarity_threshold=0.7):
    """Move outlier rows to the most similar topic across all clusters.

    Parameters
    ----------
    df : DataFrame with 'topic_number' and 'topic_label' columns. Outliers
        are the rows whose 'topic_number' ends with '_-1'.
    embeddings : indexable by the DataFrame's index labels, one vector per row.
    centroids : dict mapping topic_number to a centroid vector.
    similarity_threshold : minimum cosine similarity required to reassign.

    Returns
    -------
    A copy of ``df`` in which every outlier whose best centroid reaches the
    threshold has that topic's 'topic_number' and the label of the first
    already-assigned row of that topic. The input frame is not modified.
    An empty ``centroids`` dict returns the copy unchanged.

    Labels are captured from non-outlier rows before any reassignment.
    Previously the lookup ran after the row's own topic_number had been
    rewritten, so an outlier that preceded the topic's rows matched itself
    and kept the label "Outlier". Cosine similarity is computed with NumPy
    against all centroids at once; zero-length vectors score 0.
    """
    df = df.copy()
    if not centroids:
        # max() over an empty similarity dict used to raise ValueError here.
        return df

    is_outlier = df['topic_number'].astype(str).str.endswith('_-1')

    # Label of the first already-assigned row for each topic.
    topic_labels = (
        df[~is_outlier]
        .drop_duplicates('topic_number')
        .set_index('topic_number')['topic_label']
        .to_dict()
    )

    topic_ids = list(centroids)
    matrix = np.vstack([np.asarray(centroids[t], dtype=float).ravel() for t in topic_ids])
    norms = np.linalg.norm(matrix, axis=1)

    for idx in df.index[is_outlier]:
        emb = np.asarray(embeddings[idx], dtype=float).ravel()
        dots = matrix @ emb
        denom = norms * np.linalg.norm(emb)
        sims = np.divide(dots, denom, out=np.zeros_like(dots), where=denom > 0)

        best = int(np.argmax(sims))  # first maximum wins ties
        if sims[best] >= similarity_threshold:
            best_topic = topic_ids[best]
            df.at[idx, 'topic_number'] = best_topic
            if best_topic in topic_labels:
                df.at[idx, 'topic_label'] = topic_labels[best_topic]

    return df

In [None]:
# Update merged_df dataframe with new labels

merged_df['topic_number'] = None
merged_df['topic_label'] = None

for result in results.values():
    cluster = result['cluster']
    topic_ids = result['topics']
    # Use a dedicated name: rebinding the global `model` here replaced the
    # SentenceTransformer that the topic-model cells pass as embedding_model.
    cluster_topic_model = result['model']

    # Get all paragraph indexes for cluster
    cluster_indices = merged_df[merged_df['cluster'] == cluster].index

    # Topic ids are matched to rows by position, so the counts must agree.
    assert len(topic_ids) == len(cluster_indices), (
        f"cluster {cluster}: {len(topic_ids)} topic ids for {len(cluster_indices)} paragraphs"
    )

    # Assign topic # and label to each paragraph
    for i, idx in enumerate(cluster_indices):
        topic_id = topic_ids[i]
        if topic_id == -1:
            combined_id = f"{cluster}_-1"
            label = "Outlier"
        else:
            combined_id = f"{cluster}_{topic_id}"
            label = get_top_words(cluster_topic_model, topic_id)

        merged_df.at[idx, 'topic_number'] = combined_id
        merged_df.at[idx, "topic_label"] = label

# Sanity check: the author's run produced 1739 outliers at this point.
outliers_merged = len(merged_df[merged_df['topic_label']=="Outlier"])
print(f"Total Outliers: {outliers_merged}")

In [None]:
# Corpus-level summary: outlier share, mean coherence and number of topics.
outliers_merged = len(merged_df[merged_df['topic_label']=="Outlier"])
print(f"Total Outliers: {outliers_merged}")

pct_outliers_merged = outliers_merged/len(merged_df)
print(f"Percent of outlier paragraphs: {pct_outliers_merged*100:.2f}%")

# The fixed-parameter run stores no "coherence" entry, so indexing it directly
# raised KeyError. Average only the scores that are present.
coherence_scores = [r["coherence"] for r in results.values() if "coherence" in r]
if coherence_scores:
    avg_corpus = np.mean(coherence_scores)
    print(f"Average Coherence Across Clusters: {avg_corpus:.4f}")
else:
    avg_corpus = float("nan")
    print("Average Coherence Across Clusters: n/a (no coherence scores stored in results)")

global_topic_ids = set()

for r in results.values():
    # Dedicated name so the global SentenceTransformer `model` is not overwritten.
    cluster_topic_model = r['model']
    cluster_label = r['cluster']
    # NOTE(review): this count includes each cluster's outlier topic (-1) when
    # the model has one - confirm whether that is intended.
    for tid in cluster_topic_model.get_topics().keys():
        global_id = f"cluster{cluster_label}_topic{tid}"
        global_topic_ids.add(global_id)

total_topics = len(global_topic_ids)
print(f"Total distinct topics across clusters: {total_topics}")

In [None]:
# Per-cluster outlier reassignment with a 0.7 similarity threshold.
cluster_centroids = compute_per_cluster_centroids(merged_df, corpus_embeddings)
cluster_reassigned_outliers_df = reassign_outliers_by_cluster(
    merged_df,
    corpus_embeddings,
    cluster_centroids,
    similarity_threshold=0.7,
)

still_outlier = cluster_reassigned_outliers_df['topic_label'] == "Outlier"
outliers = len(cluster_reassigned_outliers_df[still_outlier])
print(f"Total Outliers: {outliers}")

pct_outliers = outliers/len(cluster_reassigned_outliers_df)
print(f"Percent of outlier paragraphs: {pct_outliers*100:.2f}%")

In [None]:
# Per-cluster outlier reassignment with a 0.5 similarity threshold.
# This overwrites the 0.7 result held in cluster_reassigned_outliers_df.
cluster_centroids = compute_per_cluster_centroids(merged_df, corpus_embeddings)

cluster_reassigned_outliers_df = reassign_outliers_by_cluster(
    merged_df,
    corpus_embeddings,
    cluster_centroids,
    similarity_threshold=0.5,
)

still_outlier = cluster_reassigned_outliers_df['topic_label'] == "Outlier"
outliers = len(cluster_reassigned_outliers_df[still_outlier])
print(f"Total Outliers: {outliers}")

pct_outliers = outliers/len(cluster_reassigned_outliers_df)
print(f"Percent of outlier paragraphs: {pct_outliers*100:.2f}%")

In [None]:
# Global (cross-cluster) outlier reassignment with a 0.7 similarity threshold.
global_centroids = compute_global_centroids(merged_df, corpus_embeddings)
global_reassigned_outliers_df = reassign_outliers_globally(
    merged_df,
    corpus_embeddings,
    global_centroids,
    similarity_threshold=0.7,
)

still_outlier = global_reassigned_outliers_df['topic_label'] == "Outlier"
outliers = len(global_reassigned_outliers_df[still_outlier])
print(f"Total Outliers: {outliers}")

pct_outliers = outliers/len(global_reassigned_outliers_df)
print(f"Percent of outlier paragraphs: {pct_outliers*100:.2f}%")

In [None]:
# Global (cross-cluster) outlier reassignment with a 0.5 similarity threshold.
# This overwrites the 0.7 result held in global_reassigned_outliers_df.
global_centroids = compute_global_centroids(merged_df, corpus_embeddings)

global_reassigned_outliers_df = reassign_outliers_globally(
    merged_df,
    corpus_embeddings,
    global_centroids,
    similarity_threshold=0.5,
)

still_outlier = global_reassigned_outliers_df['topic_label'] == "Outlier"
outliers = len(global_reassigned_outliers_df[still_outlier])
print(f"Total Outliers: {outliers}")

pct_outliers = outliers/len(global_reassigned_outliers_df)
print(f"Percent of outlier paragraphs: {pct_outliers*100:.2f}%")

In [None]:
# Loading original dataframe from .db for subtopic to doc assignment
con = sqlite3.connect("clustered_diseases.db")
try:
    doc_df = pd.read_sql_query("SELECT * from clustered_diseases", con)
finally:
    # Close the SQLite handle even when the query raises.
    con.close()

doc_df

In [None]:
# Remove any remaining outliers
# topic_number has the form "<cluster>_<topic>"; the part after the first "_"
# is "-1" for outliers.
global_topic_suffix = global_reassigned_outliers_df['topic_number'].str.split('_').str[1]
global_reassigned_removed_outliers_df = global_reassigned_outliers_df[global_topic_suffix != '-1']

In [None]:
# Paragraph count per (title, topic) from the globally reassigned frame.
# Note: `topic_counts` is rebound from the per-cluster frame two cells below.
topic_counts = (
    global_reassigned_removed_outliers_df
    .groupby(['title', 'topic_number'])
    .size()
    .reset_index(name='topic_count')
)

In [None]:
# Remove any remaining outliers - By Cluster DF
# topic_number has the form "<cluster>_<topic>"; the part after the first "_"
# is "-1" for outliers.
cluster_topic_suffix = cluster_reassigned_outliers_df['topic_number'].str.split('_').str[1]
cluster_reassigned_removed_outliers_df = cluster_reassigned_outliers_df[cluster_topic_suffix != '-1']

In [None]:
# Topic Counts by Cluster
# Paragraph count per (title, topic) from the per-cluster reassigned frame.
topic_counts = (
    cluster_reassigned_removed_outliers_df
    .groupby(['title', 'topic_number'])
    .size()
    .reset_index(name='topic_count')
)

In [None]:
# Collect, for every title, the list of topic numbers found in its paragraphs.
title_to_topics = (
    topic_counts
    .groupby('title')['topic_number']
    .apply(list)
    .reset_index(name='topic_list')
)

In [None]:
# Attach each document's topic list. Titles without any topic get NaN from the
# left merge. Dropping a previously merged 'topic_list' first keeps the cell
# re-runnable; otherwise a second run yields topic_list_x / topic_list_y.
doc_df = (
    doc_df
    .drop(columns=['topic_list'], errors='ignore')
    .merge(title_to_topics, on='title', how='left')
)

In [None]:
# Add the number of subtopics (length of topic_list) as a new column.
# Documents with no assigned topic carry NaN after the left merge; len(NaN)
# raises TypeError, so count those as 0.
doc_df['num_subtopics'] = doc_df['topic_list'].apply(
    lambda topics: len(topics) if isinstance(topics, list) else 0
)

doc_df.sort_values(by='num_subtopics', ascending=False)

In [None]:
import matplotlib.pyplot as plt

# Histogram with one integer-wide bin per subtopic count.
max_subtopics = doc_df['num_subtopics'].max()

plt.figure(figsize=(8,5))
plt.hist(
    doc_df['num_subtopics'],
    bins=range(max_subtopics + 2),
    align='left',
    edgecolor='black',
)
plt.title('Distribution of Number of Subtopics per Document')
plt.xlabel('Number of Subtopics')
plt.ylabel('Number of Documents')
plt.xticks(range(max_subtopics + 1))
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
#persist to db
import json

#Remove lists
# SQLite cannot store Python lists, so serialise topic_list to a JSON string.
# A missing list (NaN from the left merge) is written as "[]"; json.dumps(NaN)
# would emit the bare token NaN, which is not valid JSON.
doc_df_str = doc_df.drop(columns=['topic_list']).assign(
    topic_list_str=doc_df['topic_list'].apply(
        lambda topics: json.dumps(topics if isinstance(topics, list) else [])
    )
)
doc_df_str.head()

In [None]:
#Saving doc df with subtopic labels
# The connection is closed afterwards so the SQLite file handle is not left open.
sql = sqlite3.connect("docs_with_subtopics.db")
try:
    doc_df_str.to_sql("docs_with_subtopics", sql, if_exists="replace")
finally:
    sql.close()

In [None]:
#Saving topic count df
# The connection is closed afterwards so the SQLite file handle is not left open.
sql = sqlite3.connect("topic_counts.db")
try:
    topic_counts.to_sql("topic_counts", sql, if_exists="replace")
finally:
    sql.close()

## Creating Subtopic Embeddings

In [None]:
import torch

# Reload the sentence-embedding model under the global name `model`.
model = SentenceTransformer('all-MiniLM-L6-v2')

# "<cluster>_<topic>" -> space-joined top-10 words of that topic.
subtopic_representations = {}

for cluster_id, result in results.items():
    topic_model = result['model']
    topic_info = topic_model.get_topic_info()

    for topic_id in topic_info['Topic']:
        if topic_id != -1:  # the outlier topic has no representation
            words = topic_model.get_topic(topic_id)
            if words:
                top_terms = (word for word, _ in words[:10])
                subtopic_representations[f"{cluster_id}_{topic_id}"] = " ".join(top_terms)

In [None]:
# Embed every subtopic's word string and key the vectors by subtopic id.
# keys() and values() of the same dict iterate in the same order, so the
# zip pairs each id with its own embedding.
rep_texts = list(subtopic_representations.values())

rep_embeddings = model.encode(rep_texts, convert_to_tensor=True)

subtopic_embeddings = dict(zip(subtopic_representations.keys(), rep_embeddings))

In [None]:
# Tabular view of the subtopic ids and their word representations.
subtopic_df = pd.DataFrame(
    list(subtopic_representations.items()),
    columns=["subtopic_id", "representation"],
)

subtopic_df.head()

In [None]:
# Pickle the subtopic-id -> embedding dict to disk.
# NOTE(review): the values come from encode(..., convert_to_tensor=True), so
# unpickling presumably needs a compatible torch install - confirm.
# NOTE(review): the file name says "NoMRR" although the models in this notebook
# are built with MaximalMarginalRelevance - confirm the intended name.
with open("subtopic_embeddings_NoMRR.pkl", "wb") as f:
    pickle.dump(subtopic_embeddings, f)