In [2]:
import pandas as pd
import numpy as np
import qiime2
from qiime2 import Artifact
from qiime2.plugins import feature_table, metadata
from biom import load_table

In [9]:
# # Load the CSV file
# kegg_df = {
#     "Name": ["tryptophan 2,3-dioxygenase", "indoleamine 2,3-dioxygenase", "arylformamidase", "kynureninase", "CCBL", "kynurenine/2-aminoadipate aminotransferase", "kynurenine aminotransferase", "kynurenine 3-monooxygenase", "3-hydroxyanthranilate 3,4-dioxygenase", "tryptophan 5-monooxygenase", "phenylalanine-4-hydroxylase", "aromatic-L-amino-acid/L-tryptophan decarboxylase", "monoamine oxidase"],
#     "Other_Name": ["TDO2, kynA", "IDO", "kynB", "kynU", "", "AADAT, KAT2", "BNA3", "KMO", "HAAO", "TPH1_2", "PAH", "DDC, TDC, AAAD", "MAO"],
#     "KO_ortholog": ["K00453", "K00463", "K07130", "K01556", "K00816", "K00825", "K14264", "K00486", "K00452", "K00502", "K00500", "K01593", "K00274"],
#     "EC": ["1.13.11.11", "1.13.11.52", "3.5.1.9", "3.7.1.3", "2.6.1.7", "2.6.1.7", "2.6.1.7", "1.14.13.9", "1.13.11.6", "1.14.16.4", "1.14.16.1", "4.1.1.28", "1.4.3.4"],
#     "Tryptophan_Pathway": ["kynurinine", "kynurinine", "kynurinine", "kynurinine", "kynurinine", "kynurinine", "kynurinine", "kynurinine", "kynurinine", "serotonin", "serotonin", "serotonin", "serotonin"]
# }

# kegg_df = pd.DataFrame(kegg_df)

# Load the CSV file
kegg_df = pd.read_csv('data/kegg/tryptophan_kegg.csv')

# Remove any rows with NaN values in the 'K0/ortholog' column
kegg_df = kegg_df.dropna(subset=['K0/ortholog'])

kegg_df.to_csv('data/kegg/tryptophan_kegg.tsv', sep='\t', index=False)



# Set the index name to a valid QIIME 2 identifier
kegg_df.set_index('K0/ortholog', inplace=True)
# kegg_df.index.name = 'feature id'

# Ensure all index values are strings
kegg_df.index = kegg_df.index.astype(str)
kegg_df.head()

Unnamed: 0_level_0,Name,Other Name,EC,Tryptophan Pathway
K0/ortholog,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
K00453,"tryptophan 2,3-dioxygenase","TDO2, kynA",1.13.11.11,kynurinine
K00463,"indoleamine 2,3-dioxygenase",IDO,1.13.11.52,kynurinine
K07130,arylformamidase,kynB,3.5.1.9,kynurinine
K01556,kynureninase,kynU,3.7.1.3,kynurinine
K00816,CCBL,,2.6.1.7,kynurinine


In [10]:
# # Create a QIIME 2 Metadata object from the DataFrame
# kegg_metadata = qiime2.Metadata(kegg_df)

# Load the BIOM file
biom_table = load_table('/home/lpatel/projects/2024-03-04_pierce-autism/out/195334_ko.zebra.biom')

In [23]:
# Convert BIOM table to pandas DataFrame
ko_feature_table = biom_table.to_dataframe().transpose()

# Ensure the index is string-based
ko_feature_table.index = ko_feature_table.index.astype(str)

# Get the list of KO identifiers from our KEGG DataFrame
ko_list = kegg_df.index.tolist()

# Filter the KO feature table to include only the KOs in our list
filtered_ko_table = ko_feature_table.loc[ko_feature_table.index.isin(ko_list)]

# Ensure the filtered table has a string-based index and integer values
filtered_ko_table = filtered_ko_table.astype(int)
filtered_ko_table.index = filtered_ko_table.index.astype(str)

# Print information about the filtered table
print("Filtered table info:")
print(filtered_ko_table.info())
print("\nIndex dtype:", filtered_ko_table.index.dtype)

# Convert the filtered table to a QIIME 2 Artifact
filtered_ko_artifact = Artifact.import_data("FeatureTable[Frequency]", filtered_ko_table)

# Stratify by metabolic processes
kynurinine_kos = kegg_df[kegg_df['Tryptophan Pathway'] == 'kynurinine'].index.tolist()
serotonin_kos = kegg_df[kegg_df['Tryptophan Pathway'] == 'serotonin'].index.tolist()

# Filter the feature table for kynurinine pathway
kynurinine_table = feature_table.filter_features(table=filtered_ko_artifact, metadata=kegg_metadata.filter(
    'feature id IN ({})'.format(','.join(kynurinine_kos)))).filtered_table

# Filter the feature table for serotonin pathway
serotonin_table = feature_table.filter_features(table=filtered_ko_artifact, metadata=kegg_metadata.filter(
    'feature id IN ({})'.format(','.join(serotonin_kos)))).filtered_table

# Display summary of the stratified tables
print("\nKynurinine Pathway Table Summary:")
print(kynurinine_table.view(pd.DataFrame).sum())

print("\nSerotonin Pathway Table Summary:")
print(serotonin_table.view(pd.DataFrame).sum())


  filtered_ko_table = filtered_ko_table.astype(int)


Filtered table info:
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Columns: 6731 entries, K00001 to K25634
dtypes: Sparse[int64, 0](6731)
memory usage: 0.0+ bytes
None

Index dtype: object


TypeError: Please provide a DataFrame with a string-based Index