# Data import and preparation

## Packages import

In [1]:
## packages import
import pandas as pd
import numpy as np


## CANOPUS predictions

In [2]:
# Import the CANOPUS compound summary, keeping only the NPC pathway/class
# predictions (with their probabilities) and the feature ID used as merge key.
canopus = pd.read_csv(
    'Data/cap_manu/input/acquirex_canopus_compound_summary.tsv',
    usecols=['NPC#pathway', 'NPC#pathway Probability', 'NPC#class', 'NPC#class Probability', 'featureId'],
    sep='\t',
)

# Create the boolean `Alkaloid_NPC` and `Capsaicins and Capsaicinoids` flag columns.
# `regex=False` matches the label text literally; `na=False` makes a missing
# prediction count as "not a member" so the flags stay strictly True/False
# instead of propagating NaN into the merged tables.
canopus['Alkaloid_NPC'] = canopus['NPC#pathway'].str.contains('Alkaloids', regex=False, na=False)
canopus['Capsaicins and Capsaicinoids'] = canopus['NPC#class'].str.contains(
    'Capsaicins and Capsaicinoids', regex=False, na=False
)

canopus

Unnamed: 0,NPC#pathway,NPC#pathway Probability,NPC#class,NPC#class Probability,featureId,Alkaloid_NPC,Capsaicins and Capsaicinoids
0,Amino acids and Peptides,0.682577,Aminoacids,0.707989,1,False,False
1,Alkaloids,0.890238,Pyridine alkaloids,0.481792,3,True,False
2,Amino acids and Peptides,0.652401,Aminoacids,0.312094,6,False,False
3,Alkaloids,0.542376,Polyamines,0.163301,10,True,False
4,Alkaloids,0.593366,Fatty alcohols,0.660198,11,True,False
...,...,...,...,...,...,...,...
1295,Polyketides,0.813784,Open-chain polyketides,0.103341,1525,False,False
1296,Fatty acids,0.980093,Triacylglycerols,0.287804,1526,False,False
1297,Terpenoids,0.477020,Cholane steroids,0.659215,1527,False,False
1298,Fatty acids,0.972856,Monoacylglycerols,0.684749,1529,False,False


## CSI:FingerID predictions

In [3]:
# Import the CSI:FingerID compound identifications: scores, molecular formula,
# compound name, SMILES, PubChem IDs and the feature ID used as merge key.
csi_fingerid = pd.read_csv(
    'Data/cap_manu/input/acquirex_compound_identifications.tsv',
    usecols=['ConfidenceScore', 'CSI:FingerIDScore', 'ZodiacScore', 'SiriusScore',
             'molecularFormula', 'name', 'smiles', 'pubchemids', 'featureId'],
    sep='\t',
)

# Give the SIRIUS-derived columns explicit names. 'Molecular Formula' is the key
# the dereplication merge joins on. The PubChem column name is written without a
# trailing space so it can be addressed as 'pubchemids_SIRIUS' in later steps.
csi_fingerid = csi_fingerid.rename(columns={
    'smiles': 'SIRIUS_smiles',
    'molecularFormula': 'Molecular Formula',
    'name': 'compound_name_SIRIUS',
    'pubchemids': 'pubchemids_SIRIUS',
})


In [4]:
# Display the CSI:FingerID table after renaming, as a visual check.
csi_fingerid

Unnamed: 0,ConfidenceScore,CSI:FingerIDScore,ZodiacScore,SiriusScore,Molecular Formula,compound_name_SIRIUS,SIRIUS_smiles,pubchemids_SIRIUS,featureId
0,0.997812,-4.321280,1.000000,39.550136,C15H19NO2,Tropacaine,CN1C2CCC1CC(C2)OC(=O)C3=CC=CC=C3,10834;637578;6919033;6942461;11859117;44300682,157
1,0.996049,-5.019328,0.973869,116.583244,C27H46O9,"[2-hydroxy-3-[(2R,3R,4S,5R,6R)-3,4,5-trihydrox...",CCC=CCC=CCC=CCCCCCCCC(=O)OCC(COC1C(C(C(C(O1)CO...,11497202;44715454;51692835;51692836;51692837;5...,739
2,0.995644,-6.168974,0.973367,134.963580,C27H46O9,"[2-hydroxy-3-[(2R,3R,4S,5R,6R)-3,4,5-trihydrox...",CCC=CCC=CCC=CCCCCCCCC(=O)OCC(COC1C(C(C(C(O1)CO...,11497202;44715454;51692835;51692836;51692837;5...,711
3,0.993020,-9.186601,1.000000,63.025613,C18H27NO3,Civamide,CC(C)C=CCCCCC(=O)NCC1=CC(=C(C=C1)O)OC,2548;1548942;1548943;44632689;57369257;6904797...,469
4,0.992688,-9.933572,1.000000,50.382919,C18H27NO3,Civamide,CC(C)C=CCCCCC(=O)NCC1=CC(=C(C=C1)O)OC,2548;1548942;1548943;44632689;57369257;6904797...,424
...,...,...,...,...,...,...,...,...,...
1075,,-594.769794,0.091457,55.966091,C33H57N11O9,Selanc,CC(C(C(=O)NC(CCCCN)C(=O)N1CCCC1C(=O)NC(CCCN=C(...,14332099;11765600,1303
1076,,-310.350017,0.997487,30.043589,C37H60O14,,CC1C(C(C(C(O1)OC2CC3C(=CC2(C)C=C)CCC4C3(CCC(C4...,,1102
1077,,-411.666580,0.199497,46.777493,C35H60O14,,CCCCCC(C1C(CC(CC(CC(CC(CC(C(C(C(C=CC=CC=CC=CC(...,101561103,1206
1078,,-269.092171,0.078392,50.242643,C35H67NO13,,CC(CCCCCCCCCO)CCCCCCCCOCC(COC1C(C(C(CO1)O)O)O)...,57382001,1280


## GNPS node table

Export this table from the Cytoscape file.

In [5]:
# Load the GNPS node table, restricted to the annotation, network-membership
# and precursor-intensity columns needed downstream.
gnps_columns = [
    'Analog:Compound_Name', 'Analog:Smiles', 'cluster index', 'componentindex',
    'Compound_Name', 'MZErrorPPM', 'shared name', 'Smiles', 'sum(precursor intensity)',
]
node_table = pd.read_csv('Data/cap_manu/input/Acquire_X_Final_GNPS_Table.csv', usecols=gnps_columns)

# 'shared name' becomes the 'featureId' merge key; the GNPS annotation columns
# get a GNPS_ prefix.
node_table = node_table.rename(columns={
    'shared name': 'featureId',
    'Smiles': 'GNPS_Smiles',
    'Compound_Name': 'GNPS_Compound_Name',
})

### Create log-transformed intensity columns
Add `Log2_intensity` and `Log10_intensity` columns (the log2- and log10-transformed `sum(precursor intensity)`) to the `node_table`.

In [6]:
# Add log2- and log10-transformed copies of the summed precursor intensity.
precursor_intensity = node_table['sum(precursor intensity)']
node_table['Log2_intensity'] = np.log2(precursor_intensity)
node_table['Log10_intensity'] = np.log10(precursor_intensity)

### Create `network_size` column
Count the number of nodes per `componentindex` and add the result as a `network_size` column to the `node_table`.

In [7]:
# network_size = number of nodes sharing the same `componentindex`.
# NOTE(review): rows with componentindex == -1 are counted together as one group
# (size 550 in the displayed output). If -1 marks unconnected singleton nodes in
# the GNPS export, that count is not a real network size - confirm and, if so,
# handle -1 separately.
component_groups = node_table.groupby('componentindex')
node_table['network_size'] = component_groups['componentindex'].transform('count')


### Merge SIRIUS tables and dereplication tables

In [8]:
# Load the capsaicinoid dereplication database.
drep_df = pd.read_csv(
    filepath_or_buffer='Data/Dereplication/Capsaicinoid_dereplication/Output/Capsaicinoids_database_final.csv'
)

In [9]:
# Attach the CSI:FingerID identifications to the CANOPUS predictions. Left join:
# every CANOPUS row is kept, and features without an identification get NaN in
# the CSI:FingerID columns.
sirius_merge = pd.merge(canopus, csi_fingerid, on='featureId', how='left')

# Merge the SIRIUS results with the dereplication table on the molecular formula.
# A left merge repeats a feature row once per database entry sharing its formula,
# so `derep_sirius` can have more rows (and a different index) than `sirius_merge`.
derep_sirius = pd.merge(sirius_merge, drep_df, on='Molecular Formula', how='left')

# Flag rows whose molecular formula occurs in the dereplication database.
# The flag is computed from `derep_sirius` itself so it is aligned row-for-row
# with the merged table. Computing it from `sirius_merge` would align on the
# pre-merge index and mislabel rows (or leave NaN) whenever the merge above
# duplicated rows.
derep_sirius['Data_Base_Match'] = derep_sirius['Molecular Formula'].isin(drep_df['Molecular Formula'])

derep_sirius

Unnamed: 0,NPC#pathway,NPC#pathway Probability,NPC#class,NPC#class Probability,featureId,Alkaloid_NPC,Capsaicins and Capsaicinoids,ConfidenceScore,CSI:FingerIDScore,ZodiacScore,...,compound_name_SIRIUS,SIRIUS_smiles,pubchemids_SIRIUS,Chemical Name,InChI Key,Isolated from Natural Source,References,Database_SMILES,Database_SMILES_First_Entry,Data_Base_Match
0,Amino acids and Peptides,0.682577,Aminoacids,0.707989,1,False,False,0.092395,-98.397856,0.965327,...,"2,2-Dinitropropanol",CC(CO)([N+](=O)[O-])[N+](=O)[O-],70194,,,,,,,False
1,Alkaloids,0.890238,Pyridine alkaloids,0.481792,3,True,False,,,,...,,,,,,,,,,False
2,Amino acids and Peptides,0.652401,Aminoacids,0.312094,6,False,False,,-202.400760,1.000000,...,"2-amino-2-oxo-1,5-dihydro-1,5,2",C1=CP(=O)(NC(=O)N1)N,328852;53245689;124364218,,,,,,,False
3,Alkaloids,0.542376,Polyamines,0.163301,10,True,False,0.878629,-9.840103,1.000000,...,Diolamine,C(CO)NCCO,8113;3736299;12212439;10374424;16121034;161214...,,,,,,,False
4,Alkaloids,0.593366,Fatty alcohols,0.660198,11,True,False,0.136142,-47.217135,1.000000,...,3-[bis(2-hydroxyethyl)amino]propan-1-ol,C(CN(CCO)CCO)CO,3034149,,,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1295,Polyketides,0.813784,Open-chain polyketides,0.103341,1525,False,False,0.279357,-228.213546,0.996482,...,"[(3S,5R,8R,9R,10R,12R,13R,14R,17S)-12-acetylox...",CC(=O)OC1CCC2(C(C1(C)C)CCC3(C2CC(C4C3(CCC4C5(C...,11456204;21601586,,,,,,,False
1296,Fatty acids,0.980093,Triacylglycerols,0.287804,1526,False,False,,,,...,,,,,,,,,,False
1297,Terpenoids,0.477020,Cholane steroids,0.659215,1527,False,False,,,,...,,,,,,,,,,False
1298,Fatty acids,0.972856,Monoacylglycerols,0.684749,1529,False,False,0.305980,-31.091744,1.000000,...,Dodecyl hydrogen sebacate,CCCCCCCCCCCCOC(=O)CCCCCCCCC(=O)O,14420741,,,,,,,False


In [10]:
# Save the merged SIRIUS + dereplication table without the row index.
derep_sirius.to_csv(
    path_or_buf='Data/cap_manu/output/derep_sirius_acquirex.csv',
    index=False,
)

## MZmine feature table
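MZmine feature table (exported for FBMN) is imported and reduced to the feature metadata columns plus the sample peak-area columns. Optionally, a minimum area threshold can be applied so that entries below `intensity_threshold` are set to 0; no such threshold is applied in the cell below.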

In [15]:
# Load the MZmine quantification table.
feat_table = pd.read_csv('Data/cap_manu/input/Acquire_X_Final_GNPS_quant.csv')

# Keep the feature metadata columns plus every column whose name contains "20230111".
metadata_cols = ['row ID', 'row m/z', 'row retention time', 'correlation group ID',
                 'best ion', 'partners', 'neutral M mass']
sample_cols = list(filter(lambda name: "20230111" in name, feat_table.columns))
col_names = metadata_cols + sample_cols

# Select the columns, shorten the key column names, then strip the '20230111_'
# prefix from every column name.
feat_table = (
    feat_table[col_names]
    .rename(columns={
        'row ID': 'featureId',
        'row m/z': 'mz',
        'row retention time': 'RT',
        'correlation group ID': 'corrGroup_ID',
    })
    .rename(columns=lambda name: name.replace('20230111_', ''))
)

feat_table

Unnamed: 0,featureId,mz,RT,corrGroup_ID,best ion,partners,neutral M mass,04_Lemon_Drop.mzML Peak area,03_Chilhuacle.mzML Peak area,05_Lemon_Drop.mzML Peak area,...,04_Fatalii White.mzML Peak area,22_Chilhuacle.mzML Peak area,06_Lemon_Drop.mzML Peak area,03_Carolina_Reaper.mzML Peak area,04_Chocolate_Habanero.mzML Peak area,05_Fatalii White.mzML Peak area,05_Carolina_Reaper.mzML Peak area,05_Chocolate_Habanero.mzML Peak area,03_Fatalii White.mzML Peak area,03_Chocolate_Habanero.mzML Peak area
0,1,278.897236,0.879991,,,,,0.00000,0.000,0.00000,...,0.0000,0.0000,21163.45000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.000000e+00
1,2,104.106892,0.992013,,,,,17599.52300,0.000,14519.45900,...,0.0000,6091.2905,22390.36500,0.0000,0.0000,5531.8360,0.0000,0.0000,0.0000,0.000000e+00
2,3,322.870702,1.010316,21.0,,,,10754.38200,0.000,11095.93800,...,0.0000,0.0000,11397.10800,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.000000e+00
3,4,177.948304,1.013060,21.0,,,,5444.72200,0.000,5770.36430,...,0.0000,0.0000,6195.12300,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.000000e+00
4,5,211.931176,1.013060,21.0,,,,6610.74700,0.000,7051.51860,...,0.0000,0.0000,6706.76200,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1367,1368,736.541628,17.174862,135.0,[M+NH4]+,2544;2543,718.50764,0.00000,0.000,0.00000,...,0.0000,0.0000,0.00000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.049634e+06
1368,1369,765.576341,17.176315,261.0,,,,894.50214,4713.049,749.09174,...,4741.3594,0.0000,983.82825,2783.0586,0.0000,0.0000,2869.0063,5230.1997,0.0000,7.573346e+03
1369,1370,708.510338,17.183588,135.0,[M+NH4]+,2543;2545,690.47651,0.00000,0.000,0.00000,...,0.0000,0.0000,0.00000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,2.416984e+06
1370,1371,611.477510,17.254670,,,,,0.00000,0.000,0.00000,...,1907.2585,0.0000,0.00000,4009.6470,6444.3633,1905.7039,3101.1006,6060.7060,2111.4026,5.906178e+03


### Merge MZmine feature table with GNPS node table and predictions from CSI:FingerID and CANOPUS

In [19]:
# Left-join the SIRIUS/dereplication results and then the GNPS node table onto
# the MZmine feature table by 'featureId', so every MZmine feature row is kept.
derep_sirius_feat = feat_table.merge(derep_sirius, on='featureId', how='left')
final_table = derep_sirius_feat.merge(node_table, on='featureId', how='left')
final_table

Unnamed: 0,featureId,mz,RT,corrGroup_ID,best ion,partners,neutral M mass,04_Lemon_Drop.mzML Peak area,03_Chilhuacle.mzML Peak area,05_Lemon_Drop.mzML Peak area,...,Analog:Smiles,cluster index,componentindex,GNPS_Compound_Name,MZErrorPPM,GNPS_Smiles,sum(precursor intensity),Log2_intensity,Log10_intensity,network_size
0,1,278.897236,0.879991,,,,,0.00000,0.000,0.00000,...,,1,-1,,,,4.027883e+04,15.297734,4.605077,550
1,2,104.106892,0.992013,,,,,17599.52300,0.000,14519.45900,...,,2,-1,,,,8.522713e+04,16.379025,4.930578,550
2,3,322.870702,1.010316,21.0,,,,10754.38200,0.000,11095.93800,...,,3,-1,,,,3.324743e+04,15.020955,4.521758,550
3,4,177.948304,1.013060,21.0,,,,5444.72200,0.000,5770.36430,...,,4,-1,,,,1.741021e+04,14.087646,4.240804,550
4,5,211.931176,1.013060,21.0,,,,6610.74700,0.000,7051.51860,...,,5,8,,,,2.036903e+04,14.314089,4.308970,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1367,1368,736.541628,17.174862,135.0,[M+NH4]+,2544;2543,718.50764,0.00000,0.000,0.00000,...,,1368,2,,,,1.049634e+06,20.001454,6.021038,14
1368,1369,765.576341,17.176315,261.0,,,,894.50214,4713.049,749.09174,...,,1369,-1,,,,5.077808e+04,15.631918,4.705676,550
1369,1370,708.510338,17.183588,135.0,[M+NH4]+,2543;2545,690.47651,0.00000,0.000,0.00000,...,,1370,126,,,,2.416984e+06,21.204777,6.383274,2
1370,1371,611.477510,17.254670,,,,,0.00000,0.000,0.00000,...,,1371,-1,,,,3.484072e+04,15.088487,4.542087,550


In [20]:
# Write the fully merged table to disk without the row index.
# Optional export of the intermediate table (currently disabled):
# derep_sirius_feat.to_csv('Data/cap_manu/output/derep_sirius_feat_acquirex.csv', index=False)
final_table.to_csv(
    path_or_buf='Data/cap_manu/output/final_table_acquirex.csv',
    index=False,
)