### The Data
This book will manipulate the molecular_structures.csv file to create even better features for the different 1JHX models


In [1]:
import pandas as pd

pd.set_option('display.max_columns', None)

input_folder = './input'

structures = pd.read_csv(f'{input_folder}/structures.csv')
molecular_structures = pd.read_csv(f'{input_folder}/molecule_structures.csv')


In [2]:
structures.loc[structures['molecule_name'] == 'dsgdb9nsd_000011']

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
46,dsgdb9nsd_000011,0,C,-0.002945,1.509914,0.008673
47,dsgdb9nsd_000011,1,C,0.026083,0.003276,-0.037459
48,dsgdb9nsd_000011,2,O,0.942288,-0.65507,-0.456826
49,dsgdb9nsd_000011,3,H,0.922788,1.926342,-0.391466
50,dsgdb9nsd_000011,4,H,-0.862015,1.878525,-0.564795
51,dsgdb9nsd_000011,5,H,-0.150506,1.843934,1.042891
52,dsgdb9nsd_000011,6,H,-0.89443,-0.486434,0.357749


In [3]:
f'Structures shape: {structures.shape}'

'Structures shape: (2358657, 6)'

In [4]:
molecular_structures.loc[molecular_structures['molecule_name'] == 'dsgdb9nsd_000050']

Unnamed: 0,molecule_name,atom_0,atom_1,atom_1_1_level_NB,atom_1_2_level_NB,atom_0_CH3,atom_1_CH3,atom_0_CH2,atom_1_CH2,atom_0_CH1,atom_1_CH1,atom_0_CH0,atom_1_CH0,atom_0_NH2,atom_1_NH2,atom_0_NH1,atom_1_NH1,atom_0_NH0,atom_1_NH0,atom_0_OH1,atom_1_OH1,atom_0_OH0,atom_1_OH0,is_in_ring,has_aromatic_bond,is_primary_amide,is_secondary_amide,is_tertiary_amide,is_ester,is_carbonyl,bond_order,bond_length
246,dsgdb9nsd_000050,1,0,"{1, 4}","{1: {2}, 4: {3}}",0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1.372028
247,dsgdb9nsd_000050,4,0,"{1, 4}","{1: {2}, 4: {3}}",0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1.372027
248,dsgdb9nsd_000050,5,0,"{1, 4}","{1: {2}, 4: {3}}",0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.004448
249,dsgdb9nsd_000050,2,1,"{0, 2}","{2: {3}, 0: {4}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,2,1.375401
250,dsgdb9nsd_000050,6,1,"{0, 2}","{2: {3}, 0: {4}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.07811
251,dsgdb9nsd_000050,7,2,"{1, 3}","{3: {4}, 1: {0}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.079169
252,dsgdb9nsd_000050,3,2,"{1, 3}","{3: {4}, 1: {0}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1.423545
253,dsgdb9nsd_000050,8,3,"{2, 4}","{2: {1}, 4: {0}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.079169
254,dsgdb9nsd_000050,4,3,"{2, 4}","{2: {1}, 4: {0}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,2,1.375401
255,dsgdb9nsd_000050,9,4,"{0, 3}","{3: {2}, 0: {1}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.078109


In [5]:
f'Molecular_structures shape: {molecular_structures.shape}'

'Molecular_structures shape: (1586325, 32)'

In [6]:
def map_atom_info(df, atom_idx):
    df = pd.merge(df, structures, how = 'left',
                  left_on  = ['molecule_name', f'atom_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    
    df = df.drop('atom_index', axis=1)

    df = df.drop('x', axis=1)
    df = df.drop('y', axis=1)
    df = df.drop('z', axis=1)

    df = df.rename(columns={'atom': f'atom_symbol_{atom_idx}'})
    
    return df

In [7]:
molecular_structures = map_atom_info(molecular_structures, 0)
molecular_structures = map_atom_info(molecular_structures, 1)

In [9]:
list_order = ['molecule_name', 'atom_0',  'atom_symbol_0', 'atom_1', 'atom_symbol_1', 'atom_1_1_level_NB',
       'atom_1_2_level_NB', 'atom_0_CH3', 'atom_1_CH3', 'atom_0_CH2',
       'atom_1_CH2', 'atom_0_CH1', 'atom_1_CH1', 'atom_0_CH0', 'atom_1_CH0',
       'atom_0_NH2', 'atom_1_NH2', 'atom_0_NH1', 'atom_1_NH1', 'atom_0_NH0',
       'atom_1_NH0', 'atom_0_OH1', 'atom_1_OH1', 'atom_0_OH0', 'atom_1_OH0',
       'is_in_ring', 'has_aromatic_bond', 'is_primary_amide',
       'is_secondary_amide', 'is_tertiary_amide', 'is_ester', 'is_carbonyl',
       'bond_order', 'bond_length']

molecular_structures = molecular_structures[list_order]

#### Creating new features

Find neighbour atoms and functional groups

In [10]:
molecular_structures.head()

Unnamed: 0,molecule_name,atom_0,atom_symbol_0,atom_1,atom_symbol_1,atom_1_1_level_NB,atom_1_2_level_NB,atom_0_CH3,atom_1_CH3,atom_0_CH2,atom_1_CH2,atom_0_CH1,atom_1_CH1,atom_0_CH0,atom_1_CH0,atom_0_NH2,atom_1_NH2,atom_0_NH1,atom_1_NH1,atom_0_NH0,atom_1_NH0,atom_0_OH1,atom_1_OH1,atom_0_OH0,atom_1_OH0,is_in_ring,has_aromatic_bond,is_primary_amide,is_secondary_amide,is_tertiary_amide,is_ester,is_carbonyl,bond_order,bond_length
0,dsgdb9nsd_000001,3,H,0,C,set(),{},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091946
1,dsgdb9nsd_000001,2,H,0,C,set(),{},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091952
2,dsgdb9nsd_000001,1,H,0,C,set(),{},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091953
3,dsgdb9nsd_000001,4,H,0,C,set(),{},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091948
4,dsgdb9nsd_000002,3,H,0,N,set(),{},0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.017208


In [44]:
df_length = molecular_structures.shape[0]

molecular_structures['NB_left_atom'] = ['']*df_length
molecular_structures['NB_left_group'] = ['']*df_length

molecular_structures['NB_center_atom'] = ['']*df_length
molecular_structures['NB_center_group'] = ['']*df_length

molecular_structures['NB_right_atom'] = ['']*df_length
molecular_structures['NB_right_group'] = ['']*df_length

molecular_structures['NB_all_groups'] = ['']*df_length

molecular_structures['NB_in_ring'] = [0]*df_length
molecular_structures['NB_is_aromatic'] = [0]*df_length

molecular_structures['NB_is_primary_amide'] = [0]*df_length
molecular_structures['NB_is_secondary_amide'] = [0]*df_length
molecular_structures['NB_is_tertiary_amide'] = [0]*df_length
molecular_structures['NB_is_ester'] = [0]*df_length
molecular_structures['NB_is_carbonyl'] = [0]*df_length
molecular_structures['NB_is_hydroxyl'] = [0]*df_length

molecular_structures.head()

Unnamed: 0,molecule_name,atom_0,atom_symbol_0,atom_1,atom_symbol_1,atom_1_1_level_NB,atom_1_2_level_NB,atom_0_CH3,atom_1_CH3,atom_0_CH2,atom_1_CH2,atom_0_CH1,atom_1_CH1,atom_0_CH0,atom_1_CH0,atom_0_NH2,atom_1_NH2,atom_0_NH1,atom_1_NH1,atom_0_NH0,atom_1_NH0,atom_0_OH1,atom_1_OH1,atom_0_OH0,atom_1_OH0,is_in_ring,has_aromatic_bond,is_primary_amide,is_secondary_amide,is_tertiary_amide,is_ester,is_carbonyl,bond_order,bond_length,NB_left_atom,NB_left_group,NB_center_atom,NB_center_group,NB_right_atom,NB_right_group,NB_all_groups,NB_in_ring,NB_is_aromatic,NB_is_primary_amide,NB_is_secondary_amide,NB_is_tertiary_amide,NB_is_ester,NB_is_carbonyl,NB_is_hydroxyl
0,dsgdb9nsd_000001,3,H,0,C,set(),{},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091946,,,,,,,,0,0,0,0,0,0,0,0
1,dsgdb9nsd_000001,2,H,0,C,set(),{},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091952,,,,,,,,0,0,0,0,0,0,0,0
2,dsgdb9nsd_000001,1,H,0,C,set(),{},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091953,,,,,,,,0,0,0,0,0,0,0,0
3,dsgdb9nsd_000001,4,H,0,C,set(),{},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091948,,,,,,,,0,0,0,0,0,0,0,0
4,dsgdb9nsd_000002,3,H,0,N,set(),{},0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.017208,,,,,,,,0,0,0,0,0,0,0,0


In [45]:
#molecular_structures.loc[molecular_structures['molecule_name'] == 'dsgdb9nsd_000050']
selected_molecules = molecular_structures.iloc[0:1600]

In [None]:
# For testing below functionality

molecular_structures['NB_left_atom'] = ['']*df_length
molecular_structures['NB_left_group'] = ['']*df_length

molecular_structures['NB_center_atom'] = ['']*df_length
molecular_structures['NB_center_group'] = ['']*df_length

molecular_structures['NB_right_atom'] = ['']*df_length
molecular_structures['NB_right_group'] = ['']*df_length

molecular_structures['NB_all_groups'] = ['']*df_length


In [51]:
from collections import defaultdict

molecular_structures_grouped = selected_molecules.groupby('molecule_name')


def mark_functional_groups(update_map, nb_row):
    #print(f'~~~~~~~~~~~~ update_map: {update_map}')
    if nb_row['atom_1_CH3'] == 1:
        update_map['main_group'] = 'CH3'
    elif nb_row['atom_1_CH2'] == 1:
        update_map['main_group'] = 'CH2'
    elif nb_row['atom_1_CH1'] == 1:
        update_map['main_group'] = 'CH1'
    elif nb_row['atom_1_CH0'] == 1:
        update_map['main_group'] = 'CH0'
        
    elif nb_row['atom_1_NH2'] == 1:
        update_map['main_group'] = 'NH2'
    elif nb_row['atom_1_NH1'] == 1:
        update_map['main_group'] = 'NH1'
    elif nb_row['atom_1_NH0'] == 1:
        update_map['main_group'] = 'NH0'
        
    elif nb_row['atom_1_OH1'] == 1:
        update_map['main_group'] = 'OH1'
        update_map['is_hydroxyl'] = 1        
    elif nb_row['atom_1_OH0'] == 1:
        update_map['main_group'] = 'OH0'
    
    if nb_row['is_in_ring'] == 1:
        update_map['in_ring'] = 1 

    if nb_row['has_aromatic_bond'] == 1:
        update_map['is_aromatic'] = 1 
    
    if nb_row['is_primary_amide'] == 1:
        update_map['is_primary_amide'] = 1 
    if nb_row['is_secondary_amide'] == 1:
        update_map['is_secondary_amide'] = 1 
    if nb_row['is_tertiary_amide'] == 1:
        update_map['is_tertiary_amide'] = 1 

    if nb_row['is_ester'] == 1:
        update_map['is_ester'] = 1 

    if nb_row['is_carbonyl'] == 1:
        update_map['is_carbonyl'] = 1 
    
    return update_map


def update_row(where, index, nb_indexes_list):
    # where have values -1: left; 0: center; 1: right
    #print(f'*** nb_indexes_list: {nb_indexes_list}')

    if nb_indexes_list == 0:
        return;
    
    update_map = {'atom': '', 'main_group': '', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0,
                 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
    
    for nb_index in nb_indexes_list:
        nb_row = molecular_structures.iloc[nb_index]
        update_map['atom'] = nb_row['atom_symbol_1']
        update_map = mark_functional_groups(update_map, nb_row)
    
    print(update_map)
    
    if where == -1:
        #print(f'LEFT: atom={update_map["atom"]}, main_group={update_map["main_group"]}')
        #print(f'Index: {index}')
        molecular_structures.at[index, 'NB_left_atom'] = update_map['atom']
        molecular_structures.at[index, 'NB_left_group'] = update_map['main_group']
    elif where == 0:
        #print(f'CENTER: atom={update_map["atom"]}, main_group={update_map["main_group"]}')
        molecular_structures.at[index, 'NB_center_atom'] = update_map['atom']
        molecular_structures.at[index, 'NB_center_group'] = update_map['main_group']
    elif where == 1:
        #print(f'RIGHT: atom={update_map["atom"]}, main_group={update_map["main_group"]}')
        molecular_structures.at[index, 'NB_right_atom'] = update_map['atom']
        molecular_structures.at[index, 'NB_right_group'] = update_map['main_group']
    
    if molecular_structures.at[index, 'NB_in_ring'] == 0:
        molecular_structures.at[index, 'NB_in_ring'] = update_map['in_ring']
    if molecular_structures.at[index, 'NB_is_aromatic'] == 0:                                           
        molecular_structures.at[index, 'NB_is_aromatic'] = update_map['is_aromatic']
    if molecular_structures.at[index, 'NB_is_primary_amide'] == 0:
        molecular_structures.at[index, 'NB_is_primary_amide'] = update_map['is_primary_amide']
    if molecular_structures.at[index, 'NB_is_secondary_amide'] == 0:
        molecular_structures.at[index, 'NB_is_secondary_amide'] = update_map['is_secondary_amide']
    if molecular_structures.at[index, 'NB_is_tertiary_amide'] == 0:                                                    
        molecular_structures.at[index, 'NB_is_tertiary_amide'] = update_map['is_tertiary_amide']
    if molecular_structures.at[index, 'NB_is_ester'] == 0:
        molecular_structures.at[index, 'NB_is_ester'] = update_map['is_ester']
    if molecular_structures.at[index, 'NB_is_carbonyl'] == 0:                                                    
        molecular_structures.at[index, 'NB_is_carbonyl'] = update_map['is_carbonyl']
    if molecular_structures.at[index, 'NB_is_hydroxyl'] == 0:
        molecular_structures.at[index, 'NB_is_hydroxyl'] = update_map['is_hydroxyl']
              
    
exclude_char = [' ', ',']
    
def turn_str_into_int_list(string):
    print(string)
    if not string:
        return []
    if 's' in string:
        return []
    
    int_list = []
    for c in string[1:-1]:
        if c not in exclude_char:
                int_list.append(int(c))
    return int_list


for name, group in molecular_structures_grouped:  # PER MOLECULE

    group_dict = defaultdict(int)
    
    # FIND INDEXES FOR ATOMS IN MOLECULE_STRUCTURES
    for index, row in group.iterrows(): 
        if row.atom_1 in group_dict:
            group_dict[row.atom_1].append(index)
        else:
            group_dict[row.atom_1] = [index]

    print(group_dict)    
    
    for index, row in group.iterrows(): # PER ROW IN MOLECULE
        print(f'Index: {index}', row.atom_symbol_0) 
        # Only work of H-X bonds, that is, decorate those with extra features
        if row.atom_symbol_0 == 'H':            
            parent_atom_index = row.atom_1
            #print(f'Atom 0 is Hydrogen it is connected to {parent_atom_index} : {row.atom_symbol_1}')

            no_of_NBs = turn_str_into_int_list(row.atom_1_1_level_NB)
            no_of_NBs_length = len(no_of_NBs)
            #print(f'Neighbour atoms indexes: {no_of_NBs}, {no_of_NBs_length}')

            if no_of_NBs_length == 1:
                key_list = group_dict[no_of_NBs[0]]
                #print(f'Atom has 1 neighbour: {key_list}')
                if no_of_NBs[0] < parent_atom_index:
                    # update left based column
                    update_row(-1, index, key_list)
                else:
                    # update right based column
                    update_row(1, index, key_list)
            elif no_of_NBs_length == 2:
                key_list_0 = group_dict[no_of_NBs[0]]
                key_list_1 = group_dict[no_of_NBs[1]]
                #print(f'Atom has 2 neighbours, 2a: {key_list_0}')
                #print(f'Atom has 2 neighbours, 2b: {key_list_1}')
                if  no_of_NBs[0] < parent_atom_index:
                    # update left based column
                    update_row(-1, index, key_list_0)
                    if no_of_NBs[1] <= parent_atom_index:
                        update_row(0, index, key_list_1)
                    else:
                        update_row(1, index, key_list_1)                                
                else:
                    # update cemter and right based columns
                    update_row(0, index, key_list_0)
                    update_row(1, index, key_list_1)                                
            elif no_of_NBs_length == 3:
                key_list_0 = group_dict[no_of_NBs[0]]
                key_list_1 = group_dict[no_of_NBs[1]]
                key_list_2 = group_dict[no_of_NBs[2]]
                #print(f'Atom has 3 neighbours, 3a: {key_list_0}')
                #print(f'Atom has 3 neighbour, 3b: {key_list_1}')
                #print(f'Atom has 3 neighbour, 3c: {key_list_2}')
                update_row(-1, index, key_list_0)
                update_row(0, index, key_list_1)                                
                update_row(1, index, key_list_2)
 

defaultdict(<class 'int'>, {0: [0, 1, 2, 3]})
Index: 0 H
set()
Index: 1 H
set()
Index: 2 H
set()
Index: 3 H
set()
defaultdict(<class 'int'>, {0: [4, 5, 6]})
Index: 4 H
set()
Index: 5 H
set()
Index: 6 H
set()
defaultdict(<class 'int'>, {0: [7, 8]})
Index: 7 H
set()
Index: 8 H
set()
defaultdict(<class 'int'>, {0: [9, 10]})
Index: 9 N
Index: 10 H
{1}
defaultdict(<class 'int'>, {0: [11, 12, 13, 14], 1: [15, 16, 17]})
Index: 11 H
{1}
{'atom': 'C', 'main_group': 'CH3', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 12 C
Index: 13 H
{1}
{'atom': 'C', 'main_group': 'CH3', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 14 H
{1}
{'atom': 'C', 'main_group': 'CH3', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_

{'atom': 'C', 'main_group': 'CH1', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 96 H
{1}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 97 H
{1}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 98 H
{1}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 99 H
{1}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_

{'atom': 'C', 'main_group': 'CH2', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'C', 'main_group': 'CH3', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 188 C
Index: 189 H
{2}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 190 H
{2}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 191 H
{2}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_

Index: 262 N
Index: 263 C
Index: 264 H
{0, 3}
{'atom': 'N', 'main_group': 'NH1', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'N', 'main_group': 'NH0', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
defaultdict(<class 'int'>, {0: [265, 266], 1: [267, 268], 2: [269, 270], 3: [271, 272], 4: [273]})
Index: 265 C
Index: 266 C
Index: 267 C
Index: 268 H
{0, 2}
{'atom': 'O', 'main_group': 'OH0', 'in_ring': 1, 'is_aromatic': 1, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 1, 'is_aromatic': 1, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 269 H
{1, 3}
{'atom': 'C', 'm

{'atom': 'C', 'main_group': 'CH1', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 350 C
Index: 351 H
{1}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 352 C
Index: 353 O
Index: 354 H
{0, 2, 3}
{'atom': 'C', 'main_group': 'CH3', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'O', 'main_group': 'OH1', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 1}
{'atom': 'C', 'main_group': 'CH0', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_car

Index: 432 H
{3}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
defaultdict(<class 'int'>, {0: [433, 434, 435, 436], 1: [437, 438, 439], 2: [440, 441, 442], 3: [443, 444, 445], 4: [446, 447, 448]})
Index: 433 H
{1}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 434 H
{1}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 435 C
Index: 436 H
{1}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 437 H
{0, 2}
{'ato

{'atom': 'C', 'main_group': 'CH2', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 500 C
Index: 501 H
{1, 3}
{'atom': 'C', 'main_group': 'CH0', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 502 H
{1, 3}
{'atom': 'C', 'main_group': 'CH0', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 

{'atom': 'C', 'main_group': 'CH1', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
defaultdict(<class 'int'>, {0: [570, 571], 1: [572, 573, 574], 2: [575, 576, 577], 3: [578], 4: [579, 580]})
Index: 570 H
{1}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 571 C
Index: 572 H
{0, 2, 4}
{'atom': 'O', 'main_group': 'OH1', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 1}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 1, 'is_aromatic': 0, 'is_primary

Index: 638 C
Index: 639 H
{0, 2, 4}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 640 C
Index: 641 C
Index: 642 H
{1, 3}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_

{'atom': 'C', 'main_group': 'CH0', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 717 C
Index: 718 C
Index: 719 H
{2, 4}
{'atom': 'C', 'main_group': 'CH0', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'O', 'main_group': 'OH1', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 1}
Index: 720 O
Index: 721 H
{2, 4}
{'atom': 'C', 'main_group': 'CH0', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'O', 'main_group': 'OH1', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_car

{'atom': 'C', 'main_group': 'CH2', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 800 H
{0, 2}
{'atom': 'C', 'main_group': 'CH3', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'N', 'main_group': 'NH1', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 1, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 801 N
Index: 802 H
{0, 2}
{'atom': 'C', 'main_group': 'CH3', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'N', 'main_group': 'NH1', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 1, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 

{'atom': 'C', 'main_group': 'CH2', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 873 C
Index: 874 O
Index: 875 H
{1, 3}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'O', 'main_group': 'OH0', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 876 H
{1, 3}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'O', 'main_group': 'OH0', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'i

{'atom': 'C', 'main_group': 'CH2', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 952 C
Index: 953 C
Index: 954 C
Index: 955 H
{2, 4}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 956 H
{2, 4}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_car

defaultdict(<class 'int'>, {0: [1020, 1021, 1022, 1023], 1: [1024], 2: [1025, 1026, 1027], 3: [1028, 1029, 1030], 4: [1031, 1032]})
Index: 1020 H
{1}
{'atom': 'O', 'main_group': 'OH0', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 1021 O
Index: 1022 H
{1}
{'atom': 'O', 'main_group': 'OH0', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 1023 H
{1}
{'atom': 'O', 'main_group': 'OH0', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 1024 C
Index: 1025 C
Index: 1026 H
{1, 3, 4}
{'atom': 'O', 'main_group': 'OH0', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hy

{'atom': 'C', 'main_group': 'CH1', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 1096 C
Index: 1097 C
Index: 1098 H
{1, 3}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 1099 C
Index: 1100 H
{1, 3}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'i

Index: 1186 H
{1}
{'atom': 'C', 'main_group': 'CH0', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 1187 C
Index: 1188 H
{1}
{'atom': 'C', 'main_group': 'CH0', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 1189 N
Index: 1190 C
Index: 1191 O
Index: 1192 H
{1, 3}
{'atom': 'C', 'main_group': 'CH0', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'O', 'main_group': 'OH1', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 1}
Index: 1193 H
{1, 3}
{'atom': 'C', 'main_group': 'CH0', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_am

Index: 1290 C
Index: 1291 H
{3, 5}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 1, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 1292 O
defaultdict(<class 'int'>, {0: [1293, 1294], 1: [1295, 1296], 2: [1297, 1298], 3: [1299, 1300], 4: [1301, 1302], 5: [1303]})
Index: 1293 H
{1}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 1, 'is_aromatic': 1, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 1294 C
Index: 1295 C
Index: 1296 N
Index: 1297 H
{1, 3}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 1, 'is_aromatic': 1, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 1, 'is_aromatic': 1, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is

Index: 1379 H
{2, 4}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 1, 'is_aromatic': 1, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 1, 'is_aromatic': 1, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 1380 C
Index: 1381 H
{3, 5}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 1, 'is_aromatic': 1, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 1382 C
Index: 1383 H
{0, 4}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 1, 'is_aromatic': 1, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester'

{'atom': 'C', 'main_group': 'CH2', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 1458 H
{1}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 1459 C
Index: 1460 H
{1}
{'atom': 'C', 'main_group': 'CH2', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 1461 H
{0, 2}
{'atom': 'C', 'main_group': 'CH3', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
{'atom': 'C', 'main_group': 'CH0', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 

{3}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 1541 H
{3}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 1542 H
{3}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
Index: 1543 H
{3}
{'atom': 'C', 'main_group': 'CH1', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0, 'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0, 'is_hydroxyl': 0}
defaultdict(<class 'int'>, {0: [1544, 1545, 1546, 1547], 1: [1548, 1549], 3: [1550, 1551, 1552], 4: [1553]})
Index: 1544 H
{1}
{'atom': 'C', 'main_group': 'CH0', 'in_ring': 0, 'is_ar

In [52]:
molecular_structures.loc[molecular_structures['molecule_name'] == 'dsgdb9nsd_000011']

Unnamed: 0,molecule_name,atom_0,atom_symbol_0,atom_1,atom_symbol_1,atom_1_1_level_NB,atom_1_2_level_NB,atom_0_CH3,atom_1_CH3,atom_0_CH2,atom_1_CH2,atom_0_CH1,atom_1_CH1,atom_0_CH0,atom_1_CH0,atom_0_NH2,atom_1_NH2,atom_0_NH1,atom_1_NH1,atom_0_NH0,atom_1_NH0,atom_0_OH1,atom_1_OH1,atom_0_OH0,atom_1_OH0,is_in_ring,has_aromatic_bond,is_primary_amide,is_secondary_amide,is_tertiary_amide,is_ester,is_carbonyl,bond_order,bond_length,NB_left_atom,NB_left_group,NB_center_atom,NB_center_group,NB_right_atom,NB_right_group,NB_all_groups,NB_in_ring,NB_is_aromatic,NB_is_primary_amide,NB_is_secondary_amide,NB_is_tertiary_amide,NB_is_ester,NB_is_carbonyl,NB_is_hydroxyl
34,dsgdb9nsd_000011,4,H,0,C,{1},{1: {2}},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.096696,,,,,C,CH1,,0,0,0,0,0,0,1,0
35,dsgdb9nsd_000011,3,H,0,C,{1},{1: {2}},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091103,,,,,C,CH1,,0,0,0,0,0,0,1,0
36,dsgdb9nsd_000011,1,C,0,C,{1},{1: {2}},0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.507624,,,,,,,,0,0,0,0,0,0,0,0
37,dsgdb9nsd_000011,5,H,0,C,{1},{1: {2}},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.096791,,,,,C,CH1,,0,0,0,0,0,0,1,0
38,dsgdb9nsd_000011,2,O,1,C,"{0, 2}","{2: set(), 0: set()}",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,2,1.203628,,,,,,,,0,0,0,0,0,0,0,0
39,dsgdb9nsd_000011,6,H,1,C,"{0, 2}","{2: set(), 0: set()}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.115056,C,CH3,,,,,,0,0,0,0,0,0,0,0


In [53]:
def calculate(row):
    NB_left_group = row['NB_left_group']
    NB_center_group = row['NB_center_group']
    NB_right_group = row['NB_right_group']
    
    entries = []
    if NB_left_group:
        entries.append(NB_left_group)
    if NB_center_group:
        entries.append(NB_center_group)
    if NB_right_group:
        entries.append(NB_right_group)
    
    entries.sort()
    
    str_representation = "-".join(entries)
    return str_representation

In [54]:
%%time

molecular_structures['NB_all_groups'] = molecular_structures.apply(lambda row : calculate(row), axis = 1)

Wall time: 1min 14s


In [57]:
# Let's test it is correct

df_test = molecular_structures.loc[molecular_structures['molecule_name'] == 'dsgdb9nsd_000010']

selected_columns = ['atom_0', 'atom_symbol_0', 'atom_1', 'atom_symbol_1', 'atom_1_1_level_NB', 'NB_left_group', 
                    'NB_center_group', 'NB_right_group', 'NB_all_groups', 'NB_in_ring', 'NB_is_aromatic', 
                    'NB_is_primary_amide', 'NB_is_secondary_amide', 'NB_is_tertiary_amide', 'NB_is_ester', 
                    'NB_is_carbonyl','NB_is_hydroxyl']

df_test[selected_columns]

Unnamed: 0,atom_0,atom_symbol_0,atom_1,atom_symbol_1,atom_1_1_level_NB,NB_left_group,NB_center_group,NB_right_group,NB_all_groups,NB_in_ring,NB_is_aromatic,NB_is_primary_amide,NB_is_secondary_amide,NB_is_tertiary_amide,NB_is_ester,NB_is_carbonyl,NB_is_hydroxyl
29,4,H,0,C,{1},,,CH0,CH0,0,0,0,0,0,0,0,0
30,1,C,0,C,{1},,,,,0,0,0,0,0,0,0,0
31,3,H,0,C,{1},,,CH0,CH0,0,0,0,0,0,0,0,0
32,5,H,0,C,{1},,,CH0,CH0,0,0,0,0,0,0,0,0
33,2,N,1,C,"{0, 2}",,,,,0,0,0,0,0,0,0,0
