# Demo: collecting statistics of the functional groups (or molecular features) used by SIMPOL 

## Option 1. Running the substructure_search.py script

In [1]:
run ../scripts/aprl_ssp/substructure_search.py -d -g SIMPOLgroups_sane.csv -i ../data/validation/apinenemech.csv  -o ../data/output/apinenemech_SIMPOLgroups_1.csv

## Option 2. Importing searchgroups from util and running it in the notebook

Load modules

In [2]:
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd

sys.path.append('../scripts/')
from aprl_ssp.util import searchgroups

Define list of SIMPOL groups from groupfile

In [3]:
# Read the SIMPOL group definitions and index them by substructure name.
groups = (
    pd.read_csv('../scripts/aprl_ssp/SMARTSpatterns/SIMPOLgroups_sane.csv')
    .set_index('substructure')
)

Load a list of SMILES strings; each SMILES string is associated with a compound label.

In [4]:
# Read the validation compounds; the output below shows the columns 'compound' and 'SMILES'.
data = pd.read_csv('../data/validation/apinenemech.csv')
# Bare expression: display the column names as the cell output.
data.columns

Index(['compound', 'SMILES'], dtype='object')

Calculate how many times each functional group/molecular feature used by SIMPOL appears in each molecule

In [5]:
# Path of the CSV file that will receive the per-compound group counts.
output_file = '../data/output/apinenemech_SIMPOLgroups_2.csv'
# Groups whose statistics should be exported to output_file; if None, everything is exported.
export = None
# Rearrange the input to fit the program: index the compounds by their label.
inp = data.set_index('compound')
###_* --- Apply search function
# NOTE(review): count_groups is not imported anywhere in this notebook. Presumably it
# enters the namespace when Option 1 executes substructure_search.py via `run`
# (IPython's %run loads the script's names into the interactive namespace).
# TODO: confirm, and import it explicitly so Option 2 can run without Option 1.
search = searchgroups(groups.pattern, export) 
output = count_groups(inp, search)

###_* --- Export to output
# Write the counts to output_file, labelling the index column 'compound'.
output.to_csv(output_file, index_label='compound')

Look at output

In [6]:
# Bare expression so the notebook renders the count table as the cell output.
output

substructure,"amine, primary","amine, secondary","amine, tertiary",alkane CH,alkene CH,aromatic CH,carbonyl,hydroxyl (alkyl),carboxylic acid,"ester, all",...,carbonylperoxyacid,peroxy nitrate,carbon number,"ether, aromatic",ether (alicyclic),"amine, aromatic",nitroester,C=C-C=O in non-aromatic ring,C=C (non-aromatic),nC-OHside-a
compound,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C7PAN3,0,0,0,7,0,0,3,0,0,0,...,0,0,7,0,0,0,0,0,0,0
C4PAN6,0,0,0,4,0,0,1,1,0,0,...,0,0,4,0,0,0,0,0,0,0
NC72O,0,0,0,6,0,0,3,0,0,0,...,0,0,7,0,0,0,0,0,0,0
HCC7CO,0,0,0,8,1,0,1,1,0,0,...,0,0,7,0,0,0,0,0,1,0
CH3NO3,0,0,0,3,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
APINCO2,0,0,0,15,1,0,0,1,0,0,...,0,0,10,0,0,0,0,0,1,0
CH3COCH2O2,0,0,0,5,0,0,1,0,0,0,...,0,0,3,0,0,0,0,0,0,0
C511OOH,0,0,0,6,0,0,2,0,0,0,...,0,0,5,0,0,0,0,0,0,0
CH3CO3H,0,0,0,3,0,0,0,0,0,0,...,1,0,2,0,0,0,0,0,0,0


The output dataframe: each row corresponds to a compound, and each column corresponds to a group/feature used in SIMPOL.
The numbers in the frame give the number of times each group/feature appears in that compound.