In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors, rdMolDescriptors, QED
from collections import Counter

In [5]:
df = pd.read_csv("mtbs_tropical_annotations.tsv", sep="\t")
print(df.columns)
df.info

Index(['feature_id', 'component_id', 'libname', 'structure_inchikey',
       'structure_smiles', 'structure_molecular_formula',
       'structure_taxonomy_npclassifier_01pathway',
       'structure_taxonomy_npclassifier_02superclass',
       'structure_taxonomy_npclassifier_03class'],
      dtype='object')


<bound method DataFrame.info of        feature_id  component_id    libname           structure_inchikey  \
0           64000          2173  MS1_match  SRBFZHDQGSBBOR-HWQSCIPKSA-N   
1           64000          2173  MS1_match  PYMYPHUHKUWMLA-WDCZJNDASA-N   
2           63994          2138  MS1_match  GLDOVTGHNKAZLK-UHFFFAOYSA-N   
3           63993            47       ISDB  FZFFGBOPCQADGY-UHFFFAOYSA-N   
4           63991           664  MS1_match  KFXIUXCXSKTCNK-KLGAAMDDSA-N   
...           ...           ...        ...                          ...   
44977           6           152  MS1_match  HSPZTYQXYAPLOD-FICKGQQISA-N   
44978           6           152  MS1_match  JMOFRLILFOJJEZ-UHFFFAOYSA-N   
44979           6           152  MS1_match  IQFCUFCOJGKNJF-UHFFFAOYSA-N   
44980           6           152  MS1_match  DVILVCNQCLDLLF-HDSZUGLSSA-N   
44981           5          2285  MS1_match  IPCSVZSSVZVIGE-UHFFFAOYSA-N   

                                        structure_smiles  \
0      

In [7]:
df_clean = df.dropna(subset=['structure_smiles'])
df_clean = df_clean[df_clean['structure_smiles'] != 'nan']
df_clean = df_clean.drop_duplicates(subset=['structure_smiles'])

In [19]:
descr_list = ['MolWt', 'MolLogP', 'TPSA', 'NumHDonors', 'NumHAcceptors']
df_calc = pd.DataFrame()

def calc_descriptors(smiles, descr):
    mol = Chem.MolFromSmiles(smiles)
    return getattr(Descriptors, descr)(mol)

for descr in descr_list:
    df_calc[descr] = df_clean['structure_smiles'].apply(lambda s: calc_descriptors(s, descr))
df_calc['CalcFractionCSP3'] = df_clean['structure_smiles'].apply(lambda s: rdMolDescriptors.CalcFractionCSP3(Chem.MolFromSmiles(s)))
df_calc['qed'] = df_clean['structure_smiles'].apply(lambda s: QED.qed(Chem.MolFromSmiles(s)))
df_analysis = pd.concat([df_clean, df_calc], axis=1)
df_analysis.head()


Unnamed: 0,feature_id,component_id,libname,structure_inchikey,structure_smiles,structure_molecular_formula,structure_taxonomy_npclassifier_01pathway,structure_taxonomy_npclassifier_02superclass,structure_taxonomy_npclassifier_03class,MolWt,MolLogP,TPSA,NumHDonors,NumHAcceptors,CalcFractionCSP3,qed
0,64000,2173,MS1_match,SRBFZHDQGSBBOR-HWQSCIPKSA-N,OC1OC[C@H](O)[C@H](O)[C@H]1O,C5H10O5,Carbohydrates,Saccharides,Monosaccharides,150.13,-2.5823,90.15,4,5,1.0,0.30425
1,64000,2173,MS1_match,PYMYPHUHKUWMLA-WDCZJNDASA-N,O=C[C@@H](O)[C@H](O)[C@H](O)CO,C5H10O5,Carbohydrates,Saccharides,Monosaccharides,150.13,-2.7397,97.99,4,5,0.8,0.325838
2,63994,2138,MS1_match,GLDOVTGHNKAZLK-UHFFFAOYSA-N,CCCCCCCCCCCCCCCCCCO,C18H38O,Fatty acids,Fatty acyls,Fatty alcohols,270.501,6.2402,20.23,1,1,1.0,0.329061
3,63993,47,ISDB,FZFFGBOPCQADGY-UHFFFAOYSA-N,CC(C)=CCCC(C)=CCC1(CC=C(C)C)C(=O)C(O)=Cc2oc3cc...,C28H32O6,Shikimates and Phenylpropanoids,Xanthones,Plant xanthones,464.558,6.3627,107.97,3,6,0.357143,0.411398
4,63991,664,MS1_match,KFXIUXCXSKTCNK-KLGAAMDDSA-N,C=C1C2=Nc3ccccc3[C@@]23CCN2C/C(=C/C)[C@H]1C[C@...,C19H20N2,Terpenoids,Diterpenoids,Valparane diterpenoids,276.383,3.6208,15.6,0,2,0.421053,0.660796


In [None]:
# only need to focus on top 5, otherwise there will be too many colors and less meaningful information
top5 = df_analysis['structure_taxonomy_npclassifier_01pathway'].value_counts().nlargest(5).index
df_analysis['category'] = df_analysis['structure_taxonomy_npclassifier_01pathway'].apply(lambda s: s if s in top5 else 'other')
df_analysis.head()

Unnamed: 0,feature_id,component_id,libname,structure_inchikey,structure_smiles,structure_molecular_formula,structure_taxonomy_npclassifier_01pathway,structure_taxonomy_npclassifier_02superclass,structure_taxonomy_npclassifier_03class,MolWt,MolLogP,TPSA,NumHDonors,NumHAcceptors,CalcFractionCSP3,qed,category
0,64000,2173,MS1_match,SRBFZHDQGSBBOR-HWQSCIPKSA-N,OC1OC[C@H](O)[C@H](O)[C@H]1O,C5H10O5,Carbohydrates,Saccharides,Monosaccharides,150.13,-2.5823,90.15,4,5,1.0,0.30425,other
1,64000,2173,MS1_match,PYMYPHUHKUWMLA-WDCZJNDASA-N,O=C[C@@H](O)[C@H](O)[C@H](O)CO,C5H10O5,Carbohydrates,Saccharides,Monosaccharides,150.13,-2.7397,97.99,4,5,0.8,0.325838,other
2,63994,2138,MS1_match,GLDOVTGHNKAZLK-UHFFFAOYSA-N,CCCCCCCCCCCCCCCCCCO,C18H38O,Fatty acids,Fatty acyls,Fatty alcohols,270.501,6.2402,20.23,1,1,1.0,0.329061,Fatty acids
3,63993,47,ISDB,FZFFGBOPCQADGY-UHFFFAOYSA-N,CC(C)=CCCC(C)=CCC1(CC=C(C)C)C(=O)C(O)=Cc2oc3cc...,C28H32O6,Shikimates and Phenylpropanoids,Xanthones,Plant xanthones,464.558,6.3627,107.97,3,6,0.357143,0.411398,Shikimates and Phenylpropanoids
4,63991,664,MS1_match,KFXIUXCXSKTCNK-KLGAAMDDSA-N,C=C1C2=Nc3ccccc3[C@@]23CCN2C/C(=C/C)[C@H]1C[C@...,C19H20N2,Terpenoids,Diterpenoids,Valparane diterpenoids,276.383,3.6208,15.6,0,2,0.421053,0.660796,Terpenoids
