## Retrieve ClassyFire consensus chemical classes per mass spectral molecular family

**Author**: Madeleine Ernst (mernst@ucsd.edu)

load libraries

In [1]:
import csv
import os
import subprocess

import pandas as pd

from classyfire import *  # retrieved from https://github.com/DorresteinLaboratory/NAP_ProteoSAFe/tree/master/formatdb
from classyfire_consensus import *

Load the results of the <i>in silico</i> structure predictions obtained through Network Annotation Propagation (NAP), downloaded from https://proteomics2.ucsd.edu/ProteoSAFe/status.jsp?task=af425ada55d54adca9c7b28a823af54c (Download Summary Report, unzip the folder, and load the file within the folder named 'final_out').

In [2]:
# Read the tab-separated NAP node attribute table from the working directory.
nap1 = pd.read_csv(
    "node_attributes_table.tsv",
    sep='\t',
)

In [3]:
# Preview the first rows of the NAP table to confirm it loaded with the expected columns.
nap1.head()

Unnamed: 0,cluster.index,parent.mass,number.of.spectra,RTMean,sum.precursor.intensity.,LibraryID,SpectrumID,Smiles,INCHI,ProteoSAFeClusterLink,MetFragScore,MetFragSMILES,MetFragID,FusionScore,FusionSMILES,FusionID,ConsensusScore,ConsensusSMILES,ConsensusID
0,1,81.52,54,599.74356,1316240,,,,,http://gnps.ucsd.edu//ProteoSAFe/result.jsp?ta...,,,,,,,,,
1,4,81.52,8,599.32475,165116,,,,,http://gnps.ucsd.edu//ProteoSAFe/result.jsp?ta...,,,,,,,,,
2,5,81.52,9,599.43322,221988,,,,,http://gnps.ucsd.edu//ProteoSAFe/result.jsp?ta...,,,,,,,,,
3,7,87.967,247,600.07314,5365030,,,,,http://gnps.ucsd.edu//ProteoSAFe/result.jsp?ta...,,,,,,,,,
4,8,88.018,23,599.94643,562220,,,,,http://gnps.ucsd.edu//ProteoSAFe/result.jsp?ta...,,,,,,,,,


### Convert all unique SMILES to InChIKeys

retrieve all unique SMILES structures

In [4]:
# unique_smiles() is called with a list of tables; only one NAP table is used here.
matches = [nap1]

In [5]:
# unique_smiles() comes from one of the star-imported classyfire modules.
# Its result is indexed by key: 'df' is used right away, and 'dic' is passed to
# molfam_classes() further down, so `out` has to stay available until then.
out = unique_smiles(matches)
# Table of unique SMILES; this is what gets written to SMILES_Amazon.csv.
df = out['df']

write unique SMILES structures to file and convert to InChIKeys using MolConverter <br> https://chemaxon.com/marvin-archive/3.3.3/marvin/doc/user/molconvert.html

In [6]:
# Write the unique SMILES without any quoting so the structures reach molconvert verbatim.
# With QUOTE_NONE the csv writer needs an escape character for special characters; '&' is used.
df.to_csv(
    "SMILES_Amazon.csv",
    escapechar='&',
    quoting=csv.QUOTE_NONE,
)

In [11]:
# Convert the SMILES written to SMILES_Amazon.csv into InChIKeys with ChemAxon's molconvert.
# File names are relative to the notebook's working directory, matching the cell that writes
# SMILES_Amazon.csv and the cell that reads InchiKeys_Amazon.txt.
MOLCONVERT = '/Applications/MarvinSuite/bin/molconvert'  # adjust to the local Marvin installation

# An argument list (no shell) passes the '{csv:strucSMILES}' import option through literally.
# check=True raises CalledProcessError if molconvert exits with a non-zero status.
molconvert_run = subprocess.run(
    [
        MOLCONVERT,
        'inchikey:SAbs',
        'SMILES_Amazon.csv{csv:strucSMILES}',
        '-o',
        'InchiKeys_Amazon.txt',
    ],
    check=True,
)
# Display the exit status (0 on success), as the original cell did.
molconvert_run.returncode

0

### Submit unique InChIKeys to ClassyFire

In [6]:
# Reload the unique SMILES table written earlier; the first CSV column is the row index.
# (A comma is read_csv's default separator.)
df = pd.read_csv("SMILES_Amazon.csv", index_col=0)

In [7]:
# molconvert output: no header row, values prefixed with 'InChIKey='.
inchikey_table = pd.read_csv("InchiKeys_Amazon.txt", sep='\t', header=None)

# Flatten the table row by row into one list and strip the 'InChIKey=' prefix.
ikeys = [
    cell.replace('InChIKey=', '')
    for row in inchikey_table.values.tolist()
    for cell in row
]

In [8]:
# Preview the reloaded SMILES table before the InChIKeys are attached.
df.head()

Unnamed: 0,SMILES
0,CCCCC[C@@H]1Cc2cc(c(cc2c2c1c1c(cc2OC)O[C@H]([C...
1,C[C@@H]1CN([C@@H](C)CO)C(=O)c2ccccc2c2ccccc2CO...
2,C[C@@]12CC[C@@H]3[C@]([C@H]1CC[C@H]([C@H]2CC(=...
3,c1cn([C@H]2C[C@@H]([C@@H](CO)O2)OP(=O)(O)OC[C@...
4,[H][C@@](COC(=O)CCCCCCC\C=C/CCCCCCCC)(COP(O)(O...


In [9]:
# Attach the InChIKeys positionally: ikeys must contain exactly one entry per row of df,
# in the same order as the SMILES were written out.
df = df.assign(inchikey=ikeys)

In [10]:
# Confirm that every SMILES now has an InChIKey next to it.
df.head()

Unnamed: 0,SMILES,inchikey
0,CCCCC[C@@H]1Cc2cc(c(cc2c2c1c1c(cc2OC)O[C@H]([C...,UMMCDLJLAGZVAD-QBHLMYNQSA-N
1,C[C@@H]1CN([C@@H](C)CO)C(=O)c2ccccc2c2ccccc2CO...,QXIWLMKZHSHPSY-WENCSYSZSA-N
2,C[C@@]12CC[C@@H]3[C@]([C@H]1CC[C@H]([C@H]2CC(=...,CKEZEKFAEZIJLA-UQMLBVKCSA-N
3,c1cn([C@H]2C[C@@H]([C@@H](CO)O2)OP(=O)(O)OC[C@...,OBCJQWSXSLYWHI-WKSZEZMPSA-N
4,[H][C@@](COC(=O)CCCCCCC\C=C/CCCCCCCC)(COP(O)(O...,PFNZHCWBORLUPP-RDHALYEDSA-N


submit all unique InChIKeys to ClassyFire

In [None]:
# Submit each distinct InChIKey once. Sorting the de-duplicated keys gives the same
# submission order on every run; iterating a bare set of strings can differ between
# kernel sessions because string hashing is randomised per process.
classy = query_inchikey(sorted(set(ikeys)))

In [20]:
# Preview the classification table produced by the ClassyFire query.
classy.head()

Unnamed: 0,class,direct_parent,inchikey,kingdom,molecular_framework,subclass,superclass
0,Steroids and steroid derivatives,Stigmastanes and derivatives,KOMFLQBTVSCQMC-DEDJOHNFSA-N,Organic compounds,Aliphatic homopolycyclic compounds,Stigmastanes and derivatives,Lipids and lipid-like molecules
1,Prenol lipids,Aromatic monoterpenoids,ZNBRGKRSUQTAJM-LSQMVHIFSA-O,Organic compounds,Aromatic heteromonocyclic compounds,Monoterpenoids,Lipids and lipid-like molecules
2,Prenol lipids,"Eudesmanolides, secoeudesmanolides, and deriva...",AYWGUJVWIMXXNN-AVCOQLNBSA-O,Organic compounds,Aromatic heteropolycyclic compounds,Terpene lactones,Lipids and lipid-like molecules
3,Prenol lipids,Aromatic monoterpenoids,DULRAFVTZYWWOW-JXGGLGGMSA-O,Organic compounds,Aromatic heteropolycyclic compounds,Monoterpenoids,Lipids and lipid-like molecules
4,Carboxylic acids and derivatives,Hexacarboxylic acids and derivatives,KVNXXXFZZIWSSE-VRVGQERDSA-N,Organic compounds,Aliphatic acyclic compounds,Hexacarboxylic acids and derivatives,Organic acids and derivatives


In [23]:
# Save the ClassyFire classifications as a tab-separated file (row index included).
classy.to_csv(
    "Classy_Amazon.csv",
    sep='\t',
)

### Retrieve most abundant chemical classes per mass spectral molecular family

In [11]:
# Reload the saved ClassyFire classifications; the first column holds the saved row index.
classy = pd.read_csv(
    "Classy_Amazon.csv",
    index_col=0,
    sep='\t',
)

In [12]:
# Preview the classifications reloaded from Classy_Amazon.csv.
classy.head()

Unnamed: 0,class,direct_parent,inchikey,kingdom,molecular_framework,subclass,superclass
0,Steroids and steroid derivatives,Stigmastanes and derivatives,KOMFLQBTVSCQMC-DEDJOHNFSA-N,Organic compounds,Aliphatic homopolycyclic compounds,Stigmastanes and derivatives,Lipids and lipid-like molecules
1,Prenol lipids,Aromatic monoterpenoids,ZNBRGKRSUQTAJM-LSQMVHIFSA-O,Organic compounds,Aromatic heteromonocyclic compounds,Monoterpenoids,Lipids and lipid-like molecules
2,Prenol lipids,"Eudesmanolides, secoeudesmanolides, and deriva...",AYWGUJVWIMXXNN-AVCOQLNBSA-O,Organic compounds,Aromatic heteropolycyclic compounds,Terpene lactones,Lipids and lipid-like molecules
3,Prenol lipids,Aromatic monoterpenoids,DULRAFVTZYWWOW-JXGGLGGMSA-O,Organic compounds,Aromatic heteropolycyclic compounds,Monoterpenoids,Lipids and lipid-like molecules
4,Carboxylic acids and derivatives,Hexacarboxylic acids and derivatives,KVNXXXFZZIWSSE-VRVGQERDSA-N,Organic compounds,Aliphatic acyclic compounds,Hexacarboxylic acids and derivatives,Organic acids and derivatives


In [13]:
# Left join on the InChIKey: every structure is kept, and structures without a
# ClassyFire result get missing values in the classification columns.
df = df.merge(classy, how='left', on="inchikey")

In [14]:
# Check that each structure now carries the ClassyFire classification columns.
df.head()

Unnamed: 0,SMILES,inchikey,class,direct_parent,kingdom,molecular_framework,subclass,superclass
0,CCCCC[C@@H]1Cc2cc(c(cc2c2c1c1c(cc2OC)O[C@H]([C...,UMMCDLJLAGZVAD-QBHLMYNQSA-N,Flavonoids,7-O-methylated flavonoids,Organic compounds,Aromatic heteropolycyclic compounds,O-methylated flavonoids,Phenylpropanoids and polyketides
1,C[C@@H]1CN([C@@H](C)CO)C(=O)c2ccccc2c2ccccc2CO...,QXIWLMKZHSHPSY-WENCSYSZSA-N,Benzene and substituted derivatives,Benzamides,Organic compounds,Aromatic heteropolycyclic compounds,Benzoic acids and derivatives,Benzenoids
2,C[C@@]12CC[C@@H]3[C@]([C@H]1CC[C@H]([C@H]2CC(=...,CKEZEKFAEZIJLA-UQMLBVKCSA-N,Steroids and steroid derivatives,16-oxosteroids,Organic compounds,Aliphatic heteropolycyclic compounds,Oxosteroids,Lipids and lipid-like molecules
3,c1cn([C@H]2C[C@@H]([C@@H](CO)O2)OP(=O)(O)OC[C@...,OBCJQWSXSLYWHI-WKSZEZMPSA-N,(3'->5')-dinucleotides and analogues,(3'->5')-dinucleotides,Organic compounds,Aromatic heteropolycyclic compounds,(3'->5')-dinucleotides,"Nucleosides, nucleotides, and analogues"
4,[H][C@@](COC(=O)CCCCCCC\C=C/CCCCCCCC)(COP(O)(O...,PFNZHCWBORLUPP-RDHALYEDSA-N,Glycerophospholipids,"1,2-diacylglycerol-3-phosphates",Organic compounds,Aliphatic acyclic compounds,Glycerophosphates,Lipids and lipid-like molecules


In [15]:
# Rename the 'class' column to 'CF_class' before the table is passed to molfam_classes().
df = df.rename({'class': 'CF_class'}, axis='columns')

load mass spectral molecular network data downloaded from https://gnps.ucsd.edu/ProteoSAFe/status.jsp?task=549cfafcdaef4a7496768f45bb90771c (Download Cytoscape Data, unzip folder, load file within folder named 'clusterinfosummarygroup_attributes_withIDs_withcomponentID')

In [16]:
# Read the tab-separated molecular network cluster table exported from GNPS.
net = pd.read_csv(
    "83507be33e0d4aa984b7d43b0a1d127b.tsv",
    sep='\t',
)

In [17]:
# Preview the network table; it provides the 'cluster index' and 'componentindex' columns.
net.head()

Unnamed: 0,cluster index,number of spectra,parent mass,precursor charge,precursor mass,sum(precursor intensity),2012,2013,Checherta,Iquitos,...,DefaultGroups,RTMean,RTStdErr,ProteoSAFeClusterLink,UniqueFileSourcesCount,EvenOdd,LibraryID,NumberOrganismIDs,AllOrganisms,componentindex
0,684980,4,867.6,0,867.6,199760.0,0,4,0,0,...,"G1,",452.58625,1.862014,http://gnps.ucsd.edu//ProteoSAFe/result.jsp?ta...,4,0,,0,,-1
1,524499,64,437.344,0,437.344,2105100.0,64,0,30,7,...,"G1,",483.722594,1.254104,http://gnps.ucsd.edu//ProteoSAFe/result.jsp?ta...,52,0,,0,,1
2,681603,6,837.589,0,837.589,271948.0,0,6,0,0,...,"G1,",467.4925,1.127573,http://gnps.ucsd.edu//ProteoSAFe/result.jsp?ta...,6,0,,0,,-1
3,393578,3,330.265,0,330.265,79260.0,3,0,3,0,...,"G1,",241.924,0.982704,http://gnps.ucsd.edu//ProteoSAFe/result.jsp?ta...,2,1,,0,,-1
4,684238,3,865.621,0,865.621,178028.0,0,3,0,0,...,"G1,",526.001667,1.147844,http://gnps.ucsd.edu//ProteoSAFe/result.jsp?ta...,3,0,,0,,-1


In [18]:
# molfam_classes() comes from one of the star-imported classyfire modules.
# Inputs: the network table, the classified structure table, and the 'dic' entry returned
# by unique_smiles() earlier in the notebook.
# NOTE(review): `out` must still hold the unique_smiles() result when this cell runs.
final = molfam_classes(net,df,out['dic'])

In [19]:
# Preview the per-node table: one row per cluster index with CF_* labels and scores.
final.head()

Unnamed: 0,cluster index,componentindex,CF_NrNodes,CF_kingdom,CF_kingdom_score,CF_superclass,CF_superclass_score,CF_class,CF_class_score,CF_subclass,CF_subclass_score,CF_Dparent,CF_Dparent_score,CF_MFramework,CF_MFramework_score
0,524499,1,6,Organic compounds,1,Lipids and lipid-like molecules,0.916667,Prenol lipids,0.494444,Triterpenoids,0.272222,Triterpenoids,0.272222,Aliphatic homopolycyclic compounds,0.672222
1,524547,1,6,Organic compounds,1,Lipids and lipid-like molecules,0.916667,Prenol lipids,0.494444,Triterpenoids,0.272222,Triterpenoids,0.272222,Aliphatic homopolycyclic compounds,0.672222
2,524551,1,6,Organic compounds,1,Lipids and lipid-like molecules,0.916667,Prenol lipids,0.494444,Triterpenoids,0.272222,Triterpenoids,0.272222,Aliphatic homopolycyclic compounds,0.672222
3,524511,1,6,Organic compounds,1,Lipids and lipid-like molecules,0.916667,Prenol lipids,0.494444,Triterpenoids,0.272222,Triterpenoids,0.272222,Aliphatic homopolycyclic compounds,0.672222
4,524516,1,6,Organic compounds,1,Lipids and lipid-like molecules,0.916667,Prenol lipids,0.494444,Triterpenoids,0.272222,Triterpenoids,0.272222,Aliphatic homopolycyclic compounds,0.672222


write per node chemical class output table

In [20]:
# Export the per-node chemical class table, tab-separated and without the row index.
final.to_csv(
    "ClassyFire_InputforCytoscape_Amazon.csv",
    index=False,
    sep='\t',
)

### Create chemical class feature table

load per node chemical class output table

In [25]:
# Reload the per-node chemical class table under the name it is written with earlier in this
# notebook (ClassyFire_InputforCytoscape_Amazon.csv), so a fresh Restart & Run All works
# without a separately prepared copy of the file.
cf = pd.read_csv('ClassyFire_InputforCytoscape_Amazon.csv', sep='\t')

load mass spectral molecular bucket table, downloaded from https://gnps.ucsd.edu/ProteoSAFe/status.jsp?task=549cfafcdaef4a7496768f45bb90771c

In [26]:
# Read the tab-separated GNPS bucket table; its '#OTU ID' column holds the node identifiers.
ft = pd.read_csv(
    'METABOLOMICS-SNETS-549cfafc-download_cluster_buckettable-main.tsv',
    sep='\t',
)

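Calculate, for each chemical superclass, the number of nodes per bucket-table column (a node is counted for a column when its value in that column is non-zero).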

In [27]:
# Give both tables the same key column so they can be joined on the node identifier.
cf = cf.rename(columns={'cluster index': 'cluster.index'})
ft = ft.rename(columns={'#OTU ID': 'cluster.index'})

# Inner join: keep only nodes present in both tables. 'componentindex' is not needed here.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a positional argument.
comb = pd.merge(cf, ft, on="cluster.index").drop(columns=['componentindex'])

superclasses = comb['CF_superclass'].unique()

# One record per superclass: for every column, the number of member nodes whose value is
# truthy (non-zero for the numeric bucket-table columns).
# Local names are chosen so that the `out` result of unique_smiles() is not overwritten.
superclass_counts = []
for superclass in superclasses:
    members = comb.loc[comb['CF_superclass'] == superclass]
    superclass_counts.append(members.astype(bool).sum(axis=0).to_dict())

df = pd.DataFrame(superclass_counts)
df.insert(loc=0, column='id', value=list(superclasses))
df = df.drop(columns='cluster.index')
# Drop every column whose name contains 'CF' (the ClassyFire annotation columns).
# Note that a bucket-table column with 'CF' in its name would be dropped as well.
df = df[df.columns.drop(list(df.filter(regex='CF')))]
# Remove the rows for unclassified nodes: the 'no matches' label and missing superclasses.
df = df[df['id'] != 'no matches']
df = df[df['id'].notnull()]

In [28]:
# Export the superclass feature table, tab-separated and without the row index.
df.to_csv(
    "featuretable_superclass.tsv",
    index=False,
    sep='\t',
)