In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Optional step: keep only the features annotated with one CANOPUS superclass
# and save that subset as its own tab-separated table.

filter_by_superclass = 'Benzenoids'  # superclass to retain

canopus = pd.read_csv('Malpigh_Filter_EtOH80_NEG_CANOPUS.txt', sep='\t')

# Boolean mask marking the rows whose superclass equals the selection.
in_superclass = canopus['canopus_superclass'].eq(filter_by_superclass)
filtered = canopus[in_superclass]

# index=True writes the original row labels as the first column of the output.
filtered.to_csv("Malpigh_Filter_EtOH80_NEG_CANOPUS_benzenoids.tsv", sep='\t', index=True)

In [3]:
# Sum the feature-table values of all features that share the same CANOPUS
# classification path (superclass -> class -> subclass -> level 5 -> most
# specific class) and save the result as a hierarchical table.

canopus = pd.read_csv('Malpigh_Filter_EtOH80_POS_CANOPUS.txt', sep='\t')
ft = pd.read_csv('Malpigh_Filtered_EtOH80_FeatureTable_POS_samplenames_updated.csv', sep=';')
# Rename the feature-table key so it matches the key column of the CANOPUS
# table (presumably 'sample_name' holds the feature id -- verify against the file).
ft = ft.rename(columns={'sample_name': 'cluster.index'})

# pd.merge defaults to an inner join: only keys present in both tables are kept.
comb = pd.merge(canopus, ft, on='cluster.index')

taxons = ['canopus_superclass', 'canopus_class', 'canopus_subclass',
          'canopus_level5', 'canopus_most_specific_class']

# numeric_only=True restricts the aggregation to numeric columns. Without it,
# pandas >= 2.0 also tries to "sum" the remaining text columns (string
# concatenation, or a TypeError on mixed values); older pandas dropped them
# silently, which is the behaviour kept here.
# NOTE(review): groupby drops rows with a missing value in any of the taxon
# columns by default -- confirm that is intended.
hierarchical_df = comb.groupby(taxons).sum(numeric_only=True)  # sum or whatever is most appropriate for your data

# The taxon levels form the (multi-)index, so index=True writes them as columns.
hierarchical_df.to_csv("hierarchical_CANOPUS_POS.tsv", sep='\t', index=True)

In [4]:
# Create a binary (one-hot) matrix that maps every chemical subclass to the
# other levels of its CANOPUS classification path.

df = pd.read_csv('hierarchical_CANOPUS_POS.tsv', sep='\t')
taxons = ['canopus_superclass', 'canopus_class', 'canopus_subclass',
          'canopus_level5', 'canopus_most_specific_class']

index_level = 'canopus_subclass'   # select classification hierarchy used as row index
# Every level except the one used as the index gets one-hot encoded.
encoded_levels = [level for level in taxons if level != index_level]

# Work on an explicit copy so no assignment is made on a slice of `df`
# (this avoids pandas' SettingWithCopyWarning) and number the columns
# 0..n-1 so the dummy columns keep the '<position>_<class name>' naming.
levels = df[encoded_levels].copy()
levels.columns = range(len(encoded_levels))

en = pd.get_dummies(levels)
en.index = df[index_level]
# A subclass can appear in several classification paths; keep only its first row.
en = en[~en.index.duplicated(keep='first')]

en.to_csv("classlist_subclass_POS.tsv", sep='\t', index=True)

In [2]:
# Count matrix of chemical subclasses -- step 1: load the two input tables.

canopus_path = 'Malpigh_Filter_EtOH80_POS_CANOPUS.txt'
feature_table_path = 'Malpigh_Filtered_EtOH80_FeatureTable_POS_samplenames_updated.csv'

# The CANOPUS export is tab-separated, the feature table semicolon-separated.
canopus = pd.read_csv(canopus_path, sep='\t')
ft = pd.read_csv(feature_table_path, sep=';')

In [3]:
# Introduce a minimum-intensity cut-off: every value below the threshold is set to 0.

MIN_INTENSITY = 1000

# Apply the threshold to the numeric columns only and never to the id column.
# Comparing the whole frame would also zero ids below the threshold and raises
# a TypeError as soon as one column holds text.
intensity_cols = ft.select_dtypes(include='number').columns.drop('sample_name', errors='ignore')
# Missing values compare as False and are therefore left untouched.
ft[intensity_cols] = ft[intensity_cols].mask(ft[intensity_cols] < MIN_INTENSITY, 0)

In [9]:
# Count, for every column of the merged table, how many features of each
# chemical subclass have a non-zero value, and save the resulting count matrix.

# Rename the feature-table key so it matches the key column of the CANOPUS table.
ft = ft.rename(columns={'sample_name': 'cluster.index'})

# pd.merge defaults to an inner join: only keys present in both tables are kept.
comb = pd.merge(canopus, ft, on="cluster.index")

supercl = comb.canopus_subclass.unique()   # select classification hierarchy

supercl_df = []
for subclass in supercl:
    sel = comb.loc[comb['canopus_subclass'] == subclass]   # select classification hierarchy
    # astype(bool) turns every non-zero entry into True, so the column sums are
    # the number of features with a non-zero value in that column.
    # NOTE(review): NaN also converts to True and is therefore counted -- confirm
    # the feature table contains no missing intensities.
    supercl_df.append(sel.astype(bool).sum(axis=0).to_dict())

df = pd.DataFrame(supercl_df)
df.insert(loc=0, column='id', value=list(supercl))
# Use the `columns=` keyword: the positional axis argument (`df.drop(label, 1)`)
# was removed in pandas 2.0 and raises a TypeError there.
df = df.drop(columns='cluster.index')
# Drop every remaining column whose name contains 'canopus'. The resulting frame
# has one row per subclass; the numeric values are the number of features
# counted for that subclass in each remaining column.
df = df[df.columns.drop(list(df.filter(regex='canopus')))]

df.to_csv("featuretable_updated_subclass_POS_cutoff1000.tsv", sep='\t', index=False)

In [10]:
########################

In [11]:
# Optional: aggregate the count matrix by genus, i.e. sum all columns that
# share the same name prefix.

df = pd.read_csv('featuretable_updated_subclass_POS_cutoff1000.tsv', sep='\t')
# Keep only the text before the first underscore of every column name
# (presumably the genus -- verify against the sample naming scheme).
# 'id' contains no underscore and is left unchanged.
df.columns = [col_name.split('_')[0] for col_name in df.columns]
# groupby(..., axis=1) is deprecated since pandas 2.1 and removed in 3.0.
# Equivalent: move 'id' into the index, transpose so the column labels become
# row labels, sum the rows that share a label, and transpose back.
df_agregado = df.set_index('id').T.groupby(level=0).sum().T
df_agregado.to_csv('featuretable_updated_subclass_POS_cutoff1000_genera.tsv', sep='\t')