In [26]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import copy

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, PCA
from sklearn.manifold import TSNE, MDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [27]:
# Load the participant metadata once; the full table (body_sites) and the
# gut-only subset (gut_sites) are both derived from this single read.
body_sites = pd.read_csv('../hurwitzlab/data_sets/HMP_V13_participant_data.csv')

# Take an explicit copy of the gut rows: gut_sites later receives a new column
# (in outputTableandGraph), and writing to a filtered slice triggers pandas'
# SettingWithCopyWarning.
gut_sites = body_sites[body_sites['HMP_BODY_SITE'] == 'Gastrointestinal Tract'].copy()

# Integer code per gut subsite, numbered in order of first appearance.
gut_site_mapping = {site: idx for idx, site in enumerate(gut_sites['HMP_BODY_SUBSITE'].unique())}

# Per-sample subsite code, used to colour the t-SNE scatter plot.
gut_site_ints = gut_sites['HMP_BODY_SUBSITE'].map(gut_site_mapping)

In [None]:
# Sanity check: list the distinct HMP_BODY_SUBSITE labels among the gut samples
# (the same values gut_site_mapping assigns integer codes to).
print(gut_sites['HMP_BODY_SUBSITE'].unique())

In [None]:
# Load the OTU count table; the PSN column is dropped before numeric processing.
df = pd.read_csv("../hurwitzlab/data_sets/HMP_V13_OTU_counts.csv")
df = df.drop(columns = ['PSN'])

# Keep only gut samples. The mask is aligned to df by row index, so this relies
# on both CSVs listing the samples in the same order; rows of df with no
# matching metadata row are excluded.
is_gut = (body_sites['HMP_BODY_SITE'] == 'Gastrointestinal Tract').reindex(df.index, fill_value=False)
df = df[is_gut]

# Standardize the data
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df)

# Initial dimensionality reduction.
# PCA cannot extract more components than min(n_samples, n_features), so cap
# the requested 50. random_state pins the randomized SVD solver so the
# embedding is reproducible across runs.
pca = PCA(n_components=min(50, *scaled_df.shape), random_state=0)
reduced_df = pca.fit_transform(scaled_df)

# Dimensionality reduction for visualization.
# `result` (one 2-D point per gut sample) is read by the plotting code below.
tsne = TSNE(n_components=2, init='pca', random_state=0)
result = tsne.fit_transform(reduced_df)

In [None]:
custom_colors = ['red', 'blue', 'green', 'yellow', 'purple', 'orange', 'pink', 'brown', 'olive', 'cyan']
cmap = ListedColormap(custom_colors)  # not used for the scatter below; kept so the name stays defined

# t-SNE embedding of the gut samples, coloured by gut subsite.
# Subsite code i is drawn with custom_colors[i] (wrapping around if there are
# more subsites than colours) and named in the legend. Passing the integer
# codes through the colormap would rescale them to [0, 1] first, so code i
# would not reliably land on the i-th colour.
fig = plt.figure(1, figsize=(8, 8))
plt.clf()
ax = fig.add_subplot(111)
for site, idx in gut_site_mapping.items():
    # Boolean row mask; `result` rows are in the same order as gut_site_ints.
    in_site = (gut_site_ints == idx).to_numpy()
    scatter = ax.scatter(
        result[in_site, 0],
        result[in_site, 1],
        color=custom_colors[idx % len(custom_colors)],
        s=15,
        label=str(site),
    )
ax.set_title("t-SNE of gut samples by body subsite")
ax.set_xlabel("t-SNE 1")
ax.set_ylabel("t-SNE 2")
ax.legend(title="HMP_BODY_SUBSITE")
plt.savefig("gut_site_plot.svg")
plt.show()

In [30]:
def outputTableandGraph(df, lda, tax_level, filter):
    """Tabulate and plot the strongest LDA topic of every gut sample.

    Relies on two notebook-level variables: ``gut_sites`` (gut sample metadata)
    and ``result`` (the 2-D t-SNE coordinates). Both must have one row per row
    of ``df``, in the same order.

    Parameters
    ----------
    df : DataFrame
        Count table whose ``.values`` are passed to ``lda.transform``.
    lda : fitted model exposing ``transform``
        Returns one topic distribution per row of ``df``.
    tax_level : str
        Prefix of the saved SVG file name and of the plot title.
    filter : any
        Converted with ``str`` and appended to the SVG file name.

    Side effects
    ------------
    Writes a ``Strongest_Topic`` column on ``gut_sites``, prints the
    subsite-by-topic count table, saves
    ``<tax_level>_gut_comp_plot_<filter>.svg`` and shows the figure.
    """
    frequency_table = df.values
    topic_distributions = lda.transform(frequency_table)

    # Index of the highest-weight topic for each sample.
    strongest_topic_indices = topic_distributions.argmax(axis=1)

    # Positional assignment: row k of df corresponds to row k of gut_sites.
    gut_sites['Strongest_Topic'] = strongest_topic_indices

    topic_counts_by_site = gut_sites.groupby(['HMP_BODY_SUBSITE', 'Strongest_Topic']).size().unstack(fill_value=0)

    print(topic_counts_by_site)

    custom_colors = ['red', 'blue', 'green', 'yellow', 'purple', 'orange', 'pink', 'brown', 'olive', 'cyan']

    fig = plt.figure(1, figsize=(8, 8))
    plt.clf()
    ax = fig.add_subplot(111)

    # Colour by the topic index itself, so topic i is always custom_colors[i]
    # and the legend labels match the column headers of the printed table.
    # Feeding integer codes through a colormap would rescale them to [0, 1]
    # and make the colour of a topic depend on how many topics are present.
    for topic in np.unique(strongest_topic_indices):
        in_topic = strongest_topic_indices == topic
        ax.scatter(
            result[in_topic, 0],
            result[in_topic, 1],
            color=custom_colors[topic % len(custom_colors)],
            s=15,
            label="Topic " + str(topic),
        )

    ax.set_title(str(tax_level) + ": strongest LDA topic per gut sample")
    ax.set_xlabel("t-SNE 1")
    ax.set_ylabel("t-SNE 2")
    ax.legend(title="Strongest topic")
    plt.savefig(tax_level + "_gut_comp_plot_" + str(filter) + ".svg")
    plt.show()

In [46]:
def _fit_lda(frequency_table, n_components):
    """Fit an LDA model with n_components topics; return (model, perplexity on the same data)."""
    model = LatentDirichletAllocation(n_components=n_components, random_state=0)
    model.fit(frequency_table)
    return model, model.perplexity(frequency_table)


def findGutComponentNum(tax_level, file_name, filter, threshold=10, max_components=10):
    """Pick the number of LDA topics for the gut samples of one count table.

    Models are fitted with 2, 3, ... topics. The search stops at the first
    topic count whose perplexity is not lower than the best so far, or after
    ``max_components``. Perplexity is measured on the same data the model was
    fitted on.

    Relies on the notebook-level ``body_sites`` metadata frame, whose rows must
    be in the same order as the rows of the count table.

    Parameters
    ----------
    tax_level : str
        Label used only in the final summary line that is printed.
    file_name : str
        CSV base name (without ``.csv``) under ``../hurwitzlab/data_sets/``.
    filter : int
        When equal to 1, keep only columns that are non-zero in more than
        ``threshold`` gut samples; any other value keeps every column.
    threshold : int, default 10
        Minimum-prevalence cut-off used when ``filter == 1``.
    max_components : int, default 10
        Largest number of topics to try.

    Returns
    -------
    (bestLDA, df)
        The fitted model with the lowest perplexity found, and the gut-only
        (optionally column-filtered) count table it was fitted on.
    """
    df = pd.read_csv('../hurwitzlab/data_sets/' + file_name + '.csv')
    df = df.drop(columns = ['PSN'])

    # Keep only gut samples; the mask is aligned to df by row index and rows
    # with no matching metadata row are excluded.
    is_gut = (body_sites['HMP_BODY_SITE'] == 'Gastrointestinal Tract').reindex(df.index, fill_value=False)
    df = df[is_gut]

    if filter == 1:
        df = df.loc[:, (df != 0).sum(axis=0) > threshold]

    frequency_table = df.values

    # Baseline: 2 topics.
    bestComponentNum = 2
    bestLDA, lowestPerplexity = _fit_lda(frequency_table, bestComponentNum)

    print(bestComponentNum, ',', lowestPerplexity, '\n')

    for componentNum in range(3, max_components + 1):
        # Each candidate is a freshly built estimator, so it can be kept
        # directly without copying.
        LDA, perplexity = _fit_lda(frequency_table, componentNum)

        print(componentNum, ', ', perplexity, '\n')

        if perplexity < lowestPerplexity:
            bestLDA = LDA
            lowestPerplexity = perplexity
            bestComponentNum = componentNum
        else:
            # Stop as soon as perplexity stops improving.
            break

    print(tax_level, '- Best Component number:', bestComponentNum, ', Perplexity:', lowestPerplexity, '\n')

    return bestLDA, df

In [50]:
def topicBreakdown(df, lda, input, output):
    """Print the ten highest-weighted column names of every LDA topic.

    One line is produced per row of ``lda.components_``, in the form
    ``Topic <i>: <name> <name> ... `` with names ordered by descending weight
    (ties keep column order). The lines are printed as a single block.

    ``input`` and ``output`` are accepted but not used.
    """
    column_names = df.columns
    lines = []

    for topic_idx, weights in enumerate(lda.components_):
        # Rank (name, weight) pairs by weight, largest first, and keep the top 10.
        ranked = sorted(zip(column_names, weights), key=lambda pair: pair[1], reverse=True)
        top_names = [str(name) for name, _ in ranked[:10]]

        # Every name is followed by a single space, including the last one.
        lines.append("Topic " + str(topic_idx) + ": " + "".join(name + ' ' for name in top_names) + "\n")

    print("".join(lines))

In [51]:
def completeGutAnalysis(tax_level, file_name, filter, inputLevel, outputLevel):
    """Run the full gut LDA pipeline for one count table.

    Steps: select the number of topics (findGutComponentNum), print the top
    column names per topic (topicBreakdown), then print the subsite-by-topic
    table and save/show the t-SNE plot (outputTableandGraph).

    Parameters
    ----------
    tax_level : str
        Taxonomic level label; used in the printed summary and as the prefix
        of the saved plot's file name.
    file_name : str
        CSV base name of the count table to load.
    filter : int
        Forwarded to findGutComponentNum (1 enables column filtering) and
        appended to the saved plot's file name.
    inputLevel, outputLevel
        Forwarded to topicBreakdown.

    Returns
    -------
    (df, lda)
        The gut-only count table and the selected fitted LDA model.
    """
    lda, df = findGutComponentNum(tax_level, file_name, filter)
    topicBreakdown(df, lda, inputLevel, outputLevel)
    # outputTableandGraph expects the taxonomic level here (it prefixes the
    # SVG file name with it), not the CSV file name.
    outputTableandGraph(df, lda, tax_level, filter)
    return df, lda

In [None]:
# Phylum level, every column kept (filter=0).
Phylum_df, Phylum_lda = completeGutAnalysis(
    tax_level='Phylum',
    file_name='HMP_V13_Phylum_counts',
    filter=0,
    inputLevel='',
    outputLevel='',
)

In [None]:
# Phylum level, low-prevalence columns removed (filter=1).
# Rebinds Phylum_df / Phylum_lda to the filtered run.
Phylum_df, Phylum_lda = completeGutAnalysis(
    tax_level='Phylum',
    file_name='HMP_V13_Phylum_counts',
    filter=1,
    inputLevel='',
    outputLevel='',
)

In [None]:
# Family level, every column kept (filter=0).
family_df, family_lda = completeGutAnalysis(
    tax_level='family',
    file_name='HMP_V13_family_counts',
    filter=0,
    inputLevel='',
    outputLevel='',
)

In [None]:
# Family level, low-prevalence columns removed (filter=1).
# Rebinds family_df / family_lda to the filtered run.
family_df, family_lda = completeGutAnalysis(
    tax_level='family',
    file_name='HMP_V13_family_counts',
    filter=1,
    inputLevel='',
    outputLevel='',
)

In [None]:
# Per-sample topic distributions for the family-level model.
# Use the table returned alongside family_lda: `frequency_table` only exists as
# a local inside the analysis functions, so referencing it here raises
# NameError on a fresh kernel.
topic_distributions = family_lda.transform(family_df.values)

In [None]:
# OTU level, every column kept (filter=0).
OTU_df, OTU_lda = completeGutAnalysis(
    tax_level='OTU',
    file_name='HMP_V13_OTU_counts',
    filter=0,
    inputLevel='',
    outputLevel='',
)

In [None]:
# OTU level, low-prevalence columns removed (filter=1).
# Rebinds OTU_df / OTU_lda to the filtered run.
OTU_df, OTU_lda = completeGutAnalysis(
    tax_level='OTU',
    file_name='HMP_V13_OTU_counts',
    filter=1,
    inputLevel='',
    outputLevel='',
)

TODO: Output the topic distribution of each sample.
TODO: Fix the strongest-topic output.