In [None]:
#imports
import copy
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import ListedColormap

#Gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import HdpModel

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, PCA
from sklearn.manifold import TSNE, MDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

Functions that output and save the table, graph, and topic breakdowns for a given LDA model.

In [None]:
def topicBreakdown(df, lda, dictionary, filename):
    """Write and print a taxonomy label for each topic's top 10 terms.

    For every topic of ``lda`` the 10 highest-weighted terms are looked up in
    the taxonomy CSV by ``OTU_ID``; the value in the table's column at
    position 6 is used as the label (presumably the family rank - confirm
    against the CSV header), or ``"Not Found"`` when the term has no row.

    Parameters
    ----------
    df : unused; kept so the signature matches the other reporting helpers.
    lda : model exposing ``num_topics`` and ``get_topic_terms(topic, topn=...)``.
    dictionary : mapping from term id to term (indexed with ``dictionary[id]``).
    filename : prefix of the output file name inside ``results/``.

    Side effects
    ------------
    Creates ``results/`` if needed, writes
    ``results/{filename}_topic_breakdown.txt`` and prints the same text.
    """
    taxonomy = pd.read_csv('../metag_topic_modeling/data_sets/HMP_V13_taxonomy_fix.csv')

    # Build the lookup once instead of scanning the whole table per term.
    # setdefault keeps the first row of a repeated OTU_ID, i.e. the same row a
    # first-match filter on the table would return.
    label_by_otu = {}
    for otu_id, label in zip(taxonomy['OTU_ID'], taxonomy.iloc[:, 6]):
        label_by_otu.setdefault(otu_id, label)

    lines = []
    for i in range(lda.num_topics):
        # get top 10 words for this topic: list of (token_id, weight)
        top_terms = lda.get_topic_terms(i, topn=10)

        labels = []
        for term_id, _weight in top_terms:
            word = dictionary[term_id]
            if word in label_by_otu:
                labels.append(str(label_by_otu[word]))
            else:
                labels.append("Not Found")

        # Every label is followed by a single space, including the last one.
        lines.append(f"Topic {i}: " + "".join(label + " " for label in labels) + "\n")
    result = "".join(lines)

    output_filename = f"results/{filename}_topic_breakdown.txt"
    # Without this a missing results/ folder fails only after all the work above.
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)
    with open(output_filename, 'w') as f:
        f.write(result)

    print(f"Topic breakdown saved to {output_filename}")
    print(result)

In [None]:
def bubblePlot1(df, lda, dictionary, filename):
    """Draw and save a bubble plot of taxonomy labels per topic.

    For every topic of ``lda`` the 10 highest-weighted terms are mapped to a
    taxonomy label (the table's column at position 6, presumably the family
    rank - confirm against the CSV header; ``"Not Found"`` when the term has
    no row). Term weights are summed per (topic, label) pair and drive both
    the bubble size and colour.

    Parameters
    ----------
    df : unused; kept so the signature matches the other reporting helpers.
    lda : model exposing ``num_topics`` and ``get_topic_terms(topic, topn=...)``.
    dictionary : mapping from term id to term (indexed with ``dictionary[id]``).
    filename : prefix of the output file name inside ``results/``.

    Side effects
    ------------
    Creates ``results/`` if needed, saves
    ``results/{filename}_topic_bubble_plot.png``, shows the figure and then
    closes it.
    """
    taxonomy = pd.read_csv('../metag_topic_modeling/data_sets/HMP_V13_taxonomy_fix.csv')

    # Build the lookup once instead of scanning the whole table per term.
    # setdefault keeps the first row of a repeated OTU_ID, i.e. the same row a
    # first-match filter on the table would return.
    label_by_otu = {}
    for otu_id, label in zip(taxonomy['OTU_ID'], taxonomy.iloc[:, 6]):
        label_by_otu.setdefault(otu_id, label)

    topic_data = []
    for i in range(lda.num_topics):
        # get top 10 words for topic: list of (token_id, weight)
        for term_id, importance in lda.get_topic_terms(i, topn=10):
            family_name = label_by_otu.get(dictionary[term_id], "Not Found")
            topic_data.append({"Topic": f"Topic {i}", "Family": family_name, "Importance": importance})

    # convert to dataframe
    df_plot = pd.DataFrame(topic_data)

    # aggregate importance values for each (Topic, Family)
    df_agg = df_plot.groupby(["Topic", "Family"], as_index=False).sum()

    # bubble plot
    fig = plt.figure(figsize=(10, 6))
    sns.scatterplot(
        data=df_agg,
        x="Topic",
        y="Family",
        size="Importance",
        hue="Importance",
        sizes=(20, 1000),
        palette="viridis",
        edgecolor="black",
        alpha=0.7
    )

    plt.xticks(rotation=45)
    plt.xlabel("Topic")
    plt.ylabel("Taxonomic Family")
    plt.title("Bubble Plot of Taxonomic Families per Topic")
    plt.legend(title="Importance")
    plt.tight_layout()

    # Save and show
    output_filename = f"results/{filename}_topic_bubble_plot.png"
    # Without this a missing results/ folder makes savefig fail.
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)
    plt.savefig(output_filename, dpi=300)
    plt.show()
    # The function is called once per input table; release the figure so
    # repeated calls do not accumulate open figures.
    plt.close(fig)
    print(f"Bubble plot saved to {output_filename}")

Original graph

In [None]:
# Per-sample metadata; the HMP_BODY_SITE column is turned into integer codes
# so it can be used as the colour argument of a scatter plot.
body_sites = pd.read_csv('../metag_topic_modeling/data_sets/HMP_V13_participant_data.csv')

# Codes follow order of first appearance in the column.
unique_body_sites = body_sites['HMP_BODY_SITE'].unique()
body_site_mapping = dict(zip(unique_body_sites, range(len(unique_body_sites))))

body_site_ints = body_sites['HMP_BODY_SITE'].map(body_site_mapping)

In [None]:
def bubblePlot2(df, lda, dictionary, filename):
    """Tabulate and plot how many samples of each body site fall in each topic.

    Each row of ``df`` is treated as one document whose column names are the
    terms and whose cell values are term counts. Every document is assigned
    the topic with the highest probability under ``lda``; the assignments are
    counted per ``HMP_BODY_SITE`` and shown as a bubble plot.

    Parameters
    ----------
    df : count table, one row per sample and one column per term.
    lda : model exposing ``num_topics`` and
        ``get_document_topics(bow, minimum_probability=...)``.
    dictionary : object exposing ``doc2bow(list_of_terms)``.
    filename : prefix of the output file names inside ``results/``.

    Side effects
    ------------
    * Overwrites the ``Strongest_Topic`` column of the notebook-level
      ``body_sites`` frame, so ``df`` must have exactly one row per row of
      ``body_sites``, in the same order (pandas raises on a length mismatch).
    * Creates ``results/`` if needed and writes ``results/{filename}_table.csv``
      and ``results/{filename}_bubble_plot.png``; prints the table, shows the
      figure and then closes it.
    """
    # convert df to gensim corpus
    corpus = []
    for row in df.itertuples(index=False):
        doc = []
        for word, count in zip(df.columns, row):
            doc.extend([word] * int(count))
        corpus.append(dictionary.doc2bow(doc))

    # find topic distributions for each document
    # get_document_topics returns sparse (topic_id, probability) pairs and can
    # leave out very low-probability topics, so each probability is written to
    # the column of its own topic id; topics that are not returned stay at 0.
    # This keeps every row the same length and keeps argmax a real topic id.
    topic_distributions = np.zeros((len(corpus), lda.num_topics))
    for doc_idx, doc_bow in enumerate(corpus):
        for topic_id, prob in lda.get_document_topics(doc_bow, minimum_probability=0):
            topic_distributions[doc_idx, topic_id] = prob

    # assign strongest topic
    strongest_topic_indices = topic_distributions.argmax(axis=1)
    body_sites['Strongest_Topic'] = strongest_topic_indices

    # find topic counts by site
    topic_counts_by_site = body_sites.groupby(['HMP_BODY_SITE', 'Strongest_Topic']).size().reset_index(name="Count")
    print(topic_counts_by_site)

    # Without this a missing results/ folder makes the writes below fail.
    os.makedirs("results", exist_ok=True)

    table_filename = f"results/{filename}_table.csv"
    topic_counts_by_site.to_csv(table_filename, index=False)
    print(f"Table saved to {table_filename}")

    # bubble plot
    fig = plt.figure(figsize=(12, 7))
    sns.scatterplot(
        data=topic_counts_by_site,
        x="Strongest_Topic",
        y="HMP_BODY_SITE",
        size="Count",
        hue="Count",
        sizes=(20, 1000),
        palette="viridis",
        edgecolor="black",
        alpha=0.7
    )

    plt.xticks(rotation=45)
    plt.xlabel("Strongest Topic")
    plt.ylabel("Body Site")
    plt.title("Bubble Plot of Topic Assignments by Body Site")
    plt.legend(title="Sample Count", loc='upper right')
    plt.tight_layout()

    output_filename = f"results/{filename}_bubble_plot.png"
    plt.savefig(output_filename, dpi=300)
    plt.show()
    # The function is called once per input table; release the figure so
    # repeated calls do not accumulate open figures.
    plt.close(fig)
    print(f"Bubble plot saved to {output_filename}")

In [None]:
# Build the 2-D embedding (`result`) used by the scatter plots in this notebook.
df = pd.read_csv("../metag_topic_modeling/data_sets/HMP_V13_OTU_counts.csv")
df = df.drop(columns = ['PSN'])  # PSN is removed before scaling; the remaining columns are treated as features

# Standardize the data
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df)

# Initial dimensionality reduction
# random_state is fixed so the PCA step is repeatable when a randomized
# solver is selected; t-SNE below was already seeded.
pca = PCA(n_components=50, random_state=0)
reduced_df = pca.fit_transform(scaled_df)

# Dimensionality reduction for visualization
tsne = TSNE(n_components=2, init='pca', random_state=0)
result = tsne.fit_transform(reduced_df)

In [None]:
# Plot with body sites
# Red = gut, blue = oral, green = airways, yellow = skin, purple = urogenital
custom_colors = ['red', 'blue', 'green', 'yellow', 'purple']
# colours currently left out of the palette: 'orange', 'pink', 'brown', 'olive', 'cyan'
cmap = ListedColormap(custom_colors)

# Reuse figure 1 and wipe it so re-running the cell starts from a blank canvas.
fig = plt.figure(num=1, figsize=(8, 8))
plt.clf()
scatter = plt.scatter(
    x=result[:, 0],
    y=result[:, 1],
    c=body_site_ints,
    cmap=cmap,
    s=15,
)
plt.savefig("body_site_plot.svg")
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np

def outputTableandGraph(df, lda, dictionary, result, filename):
    """Save a body-site x topic count table and a topic-coloured scatter plot.

    Each row of ``df`` is treated as one document whose column names are the
    terms and whose cell values are term counts. Every document is assigned
    the topic with the highest probability under ``lda``. The assignments are
    cross-tabulated against ``HMP_BODY_SITE`` and used to colour the points of
    the 2-D embedding ``result``.

    Parameters
    ----------
    df : count table, one row per sample and one column per term.
    lda : model exposing ``num_topics`` and
        ``get_document_topics(bow, minimum_probability=...)``.
    dictionary : object exposing ``doc2bow(list_of_terms)``.
    result : 2-D array indexed as ``result[:, 0]`` / ``result[:, 1]``; it needs
        one row per sample, in the same order as ``df``.
    filename : prefix of the output file names inside ``results/``.

    Side effects
    ------------
    * Overwrites the ``Strongest_Topic`` column of the notebook-level
      ``body_sites`` frame, so ``df`` must have exactly one row per row of
      ``body_sites``, in the same order (pandas raises on a length mismatch).
    * Creates ``results/`` if needed and writes ``results/{filename}_table.csv``
      and ``results/{filename}_comp_plot_.svg``; prints the table and shows
      the figure.
    """
    # convert df to gensim corpus
    corpus = []
    for row in df.itertuples(index=False):
        doc = []
        for word, count in zip(df.columns, row):
            doc.extend([word] * int(count))
        corpus.append(dictionary.doc2bow(doc))

    # compute topic distributions for each document
    # get_document_topics returns sparse (topic_id, probability) pairs and can
    # leave out very low-probability topics, so each probability is written to
    # the column of its own topic id; topics that are not returned stay at 0.
    # This keeps every row the same length and keeps argmax a real topic id.
    topic_distributions = np.zeros((len(corpus), lda.num_topics))
    for doc_idx, doc_bow in enumerate(corpus):
        for topic_id, prob in lda.get_document_topics(doc_bow, minimum_probability=0):
            topic_distributions[doc_idx, topic_id] = prob

    # assign strongest topic
    strongest_topic_indices = topic_distributions.argmax(axis=1)
    body_sites['Strongest_Topic'] = strongest_topic_indices

    # aggregate by body site
    topic_counts_by_site = body_sites.groupby(['HMP_BODY_SITE', 'Strongest_Topic']).size().unstack(fill_value=0)
    print(topic_counts_by_site)

    # Without this a missing results/ folder makes the writes below fail.
    os.makedirs("results", exist_ok=True)

    # save table
    table_filename = f"results/{filename}_table.csv"
    topic_counts_by_site.to_csv(table_filename)
    print(f"Table saved to {table_filename}")

    # map topics to integers for plotting (codes follow order of first appearance)
    LDA_mapping = {topic: idx for idx, topic in enumerate(body_sites['Strongest_Topic'].unique())}
    LDA_ints = body_sites['Strongest_Topic'].map(LDA_mapping)

    # plot
    custom_colors = ['red', 'blue', 'green', 'yellow', 'purple', 'orange', 'pink', 'brown', 'olive', 'cyan']
    cmap = ListedColormap(custom_colors)

    # Figure 1 is reused and cleared on every call.
    fig = plt.figure(1, figsize=(8, 8))
    plt.clf()
    scatter = plt.scatter(result[:, 0], result[:, 1], c=LDA_ints, cmap=cmap, s=15)
    plot_filename = f"results/{filename}_comp_plot_.svg"
    plt.savefig(plot_filename)
    print(f"Plot saved to {plot_filename}")
    plt.show()

Function to find optimal component number for each taxonomic level by perplexity, returning a graph and a table at that level.

Combined function

In [None]:
def completeAnalysis(file_name):
    """Fit an HDP topic model on one preprocessed table and emit all reports.

    Reads ``data_sets/matrix/preprocessed_table_{file_name}.csv``, drops its
    first column, treats every remaining row as a document (column name =
    term, cell value = term count), fits a gensim ``HdpModel`` and then runs
    ``topicBreakdown``, ``bubblePlot1``, ``bubblePlot2`` and
    ``outputTableandGraph`` with ``file_name`` as the output prefix.

    The reporting helpers call LdaModel-style methods (``num_topics``,
    ``get_topic_terms``, ``get_document_topics``), so they receive the LDA
    model derived from the HDP fit via ``suggested_lda_model()`` rather than
    the ``HdpModel`` itself.

    Parameters
    ----------
    file_name : suffix identifying which preprocessed table to load; also used
        as the prefix of every output file.

    Returns
    -------
    tuple ``(hdp, df, dictionary, corpus)``: the fitted ``HdpModel``, the count
    table, the gensim ``Dictionary`` and the bag-of-words corpus.

    Notes
    -----
    Relies on the notebook-level ``result`` (the 2-D t-SNE embedding) for the
    scatter plot, and the helpers rely on the notebook-level ``body_sites``.
    NOTE(review): both are built from the HMP_V13 files; this assumes the
    preprocessed table has the same samples in the same order - confirm.
    """
    df = pd.read_csv(f"data_sets/matrix/preprocessed_table_{file_name}.csv")
    df = df.drop(df.columns[0], axis=1)  # the first column is dropped unconditionally

    # Expand every row into a list of repeated terms so gensim can build the
    # vocabulary and the bag-of-words corpus from it.
    texts = []
    for row in df.itertuples(index=False):
        doc = []
        for word, count in zip(df.columns, row):
            doc.extend([word] * int(count))
        texts.append(doc)

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    hdp = HdpModel(corpus=corpus, id2word=dictionary) # run HDP to find topic numbers

    # NOTE(review): show_topics is called with its default topic limit, so this
    # number cannot exceed that limit - confirm it is the count you want to report.
    topic_count = len(hdp.show_topics(formatted=False)) # output topic #
    print(f"HDP inferred ~{topic_count} topics")

    # LDA model equivalent to the HDP fit; it provides the API the helpers use.
    lda = hdp.suggested_lda_model()

    topicBreakdown(df, lda, dictionary, file_name) # output plots
    bubblePlot1(df, lda, dictionary, file_name)
    bubblePlot2(df, lda, dictionary, file_name)
    # `result` was missing from this call, which made it fail on arity.
    outputTableandGraph(df, lda, dictionary, result, file_name)

    return hdp, df, dictionary, corpus

In [None]:
# Run the full pipeline for the tables "00", "01", "02", "10", "11", "12".
for i in range(2):
    for j in range(3):
        completeAnalysis(str(i) + str(j))