In [16]:
from scipy.stats import spearmanr
from statsmodels.stats.multitest import fdrcorrection
import pandas as pd
import numpy as np
import geopy.distance
import os
import matplotlib.pyplot as plt
import seaborn as sns
from skbio import TreeNode
from skbio.diversity import beta_diversity
from scipy.spatial.distance import pdist, squareform
from skbio.tree import TreeNode
from io import StringIO

In [2]:
# Relative-abundance tables, one CSV per ecotype; the first CSV column is used as the row index
abun_relabun, rare_relabun, gen_relabun, spe_relabun = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "input/abun_relabun.csv",
        "input/rare_relabun.csv",
        "input/gen_relabun.csv",
        "input/spe_relabun.csv",
    )
]

# Phylogenetic tree: slurp the Newick text from disk first ...
with open("input/new_rOTU_tree.nwk", 'r') as file:
    newick_string = file.read()

# ... then parse it into a TreeNode and re-root the result at its midpoint
tree = TreeNode.read(StringIO(newick_string))
rooted_tree = tree.root_at_midpoint()

# Shannon-Wiener diversity for each ecotype

In [120]:
def shannon_diversity(relabun):
    """Return the Shannon entropy (natural logarithm) of one abundance vector.

    Parameters
    ----------
    relabun : pandas.Series
        Abundance values for a single sample.

    Returns
    -------
    float
        ``-sum(p * ln(p))`` taken over the entries that remain after zeros
        and missing values have been discarded.
    """
    # Zeros are masked to NaN and dropped so that log(0) is never evaluated
    present = relabun.mask(relabun == 0).dropna()

    return -(present * np.log(present)).sum()

# Shannon-Wiener diversity of every sample (one value per column), ecotype by ecotype
abun_shannon, rare_shannon, gen_shannon, spe_shannon = (
    ecotype_table.apply(shannon_diversity)  # column-wise is apply's default axis
    for ecotype_table in (abun_relabun, rare_relabun, gen_relabun, spe_relabun)
)

In [121]:
# One column per ecotype, labelled at concatenation time, then written to disk
column_names = ['Abundant taxa', 'Rare taxa', 'Generalists', 'Specialists']
df_shannon = pd.concat(
    [abun_shannon, rare_shannon, gen_shannon, spe_shannon],
    axis=1,
    keys=column_names,
)

df_shannon.to_csv("output/ecotypes_shannon.csv")

# Weighted UniFrac distance for each ecotype

In [64]:
def weighted_unifrac_distance(relative_abundance, rooted_tree, scale=1000):
    """Compute the pairwise weighted UniFrac distance between samples.

    Parameters
    ----------
    relative_abundance : pandas.DataFrame
        Abundance table with OTU names as the index and one column per
        sample. The caller's table is left untouched.
    rooted_tree : tree object
        Passed to ``beta_diversity`` as ``tree``; the names of its ``tips()``
        decide which OTUs are kept.
    scale : number, default 1000
        Factor every abundance is multiplied by before the distance is
        computed. The original note on this step says that without it the
        values are too small and all distances come out as zero.

    Returns
    -------
    pandas.DataFrame
        Square distance matrix whose rows and columns are the sample ids.
    """
    # multiply() returns a new frame, so relabelling its index below cannot
    # leak back into the DataFrame owned by the caller.
    scaled = relative_abundance.multiply(scale)

    # Underscores in OTU names become spaces (presumably to match the tip
    # names of the parsed tree -- TODO confirm against the Newick file).
    scaled.index = scaled.index.str.replace('_', ' ', regex=False)

    # Keep only the OTUs that also occur as tips of the phylogenetic tree
    tip_names = {tip.name for tip in rooted_tree.tips()}
    shared = scaled[scaled.index.isin(tip_names)]

    # beta_diversity expects samples on rows, hence the transpose
    unifrac_distance_matrix = beta_diversity(
        "weighted_unifrac",
        shared.values.T,
        ids=shared.columns,
        taxa=shared.index,
        tree=rooted_tree,
    )
    distance_df = pd.DataFrame(
        data=unifrac_distance_matrix.data,
        columns=unifrac_distance_matrix.ids,
        index=unifrac_distance_matrix.ids,
    )

    return distance_df

In [65]:
# Each ecotype table is turned into a weighted UniFrac matrix and written out
# straight away, in the order abundant, rare, generalist, specialist.
unifrac_jobs = [
    (abun_relabun, 'output/weighted_unifrac_abun.csv'),
    (rare_relabun, 'output/weighted_unifrac_rare.csv'),
    (gen_relabun, 'output/weighted_unifrac_gen.csv'),
    (spe_relabun, 'output/weighted_unifrac_spe.csv'),
]

unifrac_tables = []
for ecotype_relabun, unifrac_csv in unifrac_jobs:
    unifrac_table = weighted_unifrac_distance(ecotype_relabun, rooted_tree)
    unifrac_table.to_csv(unifrac_csv)
    unifrac_tables.append(unifrac_table)

distance_df_abun, distance_df_rare, distance_df_gen, distance_df_spe = unifrac_tables

# Bray-Curtis distance for each ecotype

In [35]:
def bray_curtis_distance(relative_abundance):
    """Build the square Bray-Curtis distance table between the columns of a frame.

    Parameters
    ----------
    relative_abundance : pandas.DataFrame
        Abundance table; every column is treated as one observation.

    Returns
    -------
    pandas.DataFrame
        Symmetric matrix whose index and columns are both the input's
        column labels.
    """
    sample_ids = relative_abundance.columns

    # pdist compares rows, so the table is transposed to put the columns on rows;
    # the result is the condensed (upper-triangle) vector of pairwise distances
    condensed = pdist(relative_abundance.T, metric='braycurtis')

    # Expand the condensed vector into the full square matrix and label it
    return pd.DataFrame(squareform(condensed), index=sample_ids, columns=sample_ids)

In [37]:
# Bray-Curtis matrices for the abundant, rare, generalist and specialist tables, in that order
bc_distance_df_abun, bc_distance_df_rare, bc_distance_df_gen, bc_distance_df_spe = map(
    bray_curtis_distance,
    (abun_relabun, rare_relabun, gen_relabun, spe_relabun),
)