### The Data
This notebook takes the bond-level table loaded from `molecule_structures.csv` and adds neighbour-based features to it, to give the different 1JHX models better inputs.


In [1]:
import pandas as pd

# The tables inspected below are wide, so never truncate the column list.
pd.set_option('display.max_columns', None)

# Folder that holds every CSV this notebook reads.
input_folder = './input'

# `structures` has one row per atom; `molecular_structures` has one row per bond.
structures = pd.read_csv(input_folder + '/structures.csv')
molecular_structures = pd.read_csv(input_folder + '/molecule_structures.csv')


In [2]:
# Atom rows of one small example molecule.
structures[structures['molecule_name'].eq('dsgdb9nsd_000011')]

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
46,dsgdb9nsd_000011,0,C,-0.002945,1.509914,0.008673
47,dsgdb9nsd_000011,1,C,0.026083,0.003276,-0.037459
48,dsgdb9nsd_000011,2,O,0.942288,-0.65507,-0.456826
49,dsgdb9nsd_000011,3,H,0.922788,1.926342,-0.391466
50,dsgdb9nsd_000011,4,H,-0.862015,1.878525,-0.564795
51,dsgdb9nsd_000011,5,H,-0.150506,1.843934,1.042891
52,dsgdb9nsd_000011,6,H,-0.89443,-0.486434,0.357749


In [3]:
'Structures shape: {}'.format(structures.shape)

'Structures shape: (2358657, 6)'

In [4]:
# Bond rows of one example molecule.
molecular_structures[molecular_structures['molecule_name'].eq('dsgdb9nsd_000050')]

Unnamed: 0,molecule_name,atom_0,atom_1,atom_1_1_level_NB,atom_1_2_level_NB,atom_0_CH3,atom_1_CH3,atom_0_CH2,atom_1_CH2,atom_0_CH1,atom_1_CH1,atom_0_CH0,atom_1_CH0,atom_0_NH2,atom_1_NH2,atom_0_NH1,atom_1_NH1,atom_0_NH0,atom_1_NH0,atom_0_OH1,atom_1_OH1,atom_0_OH0,atom_1_OH0,is_in_ring,has_aromatic_bond,is_primary_amide,is_secondary_amide,is_tertiary_amide,is_ester,is_carbonyl,bond_order,bond_length
246,dsgdb9nsd_000050,1,0,"{1, 4}","{1: {2}, 4: {3}}",0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1.372028
247,dsgdb9nsd_000050,4,0,"{1, 4}","{1: {2}, 4: {3}}",0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1.372027
248,dsgdb9nsd_000050,5,0,"{1, 4}","{1: {2}, 4: {3}}",0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.004448
249,dsgdb9nsd_000050,2,1,"{0, 2}","{2: {3}, 0: {4}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,2,1.375401
250,dsgdb9nsd_000050,6,1,"{0, 2}","{2: {3}, 0: {4}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.07811
251,dsgdb9nsd_000050,7,2,"{1, 3}","{3: {4}, 1: {0}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.079169
252,dsgdb9nsd_000050,3,2,"{1, 3}","{3: {4}, 1: {0}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1.423545
253,dsgdb9nsd_000050,8,3,"{2, 4}","{2: {1}, 4: {0}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.079169
254,dsgdb9nsd_000050,4,3,"{2, 4}","{2: {1}, 4: {0}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,2,1.375401
255,dsgdb9nsd_000050,9,4,"{0, 3}","{3: {2}, 0: {1}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.078109


In [5]:
'Molecular_structures shape: {}'.format(molecular_structures.shape)

'Molecular_structures shape: (1586325, 32)'

In [6]:
def map_atom_info(df, atom_idx):
    """Add the element symbol of atom `atom_idx` to every row of `df`.

    `df` is left-joined with the module-level `structures` frame on
    (`molecule_name`, `atom_{atom_idx}`) == (`molecule_name`, `atom_index`).
    The joined `atom_index`, `x`, `y` and `z` columns are discarded and the
    joined `atom` column is renamed to `atom_symbol_{atom_idx}`.

    Parameters
    ----------
    df : DataFrame with `molecule_name` and `atom_{atom_idx}` columns.
    atom_idx : suffix selecting which atom column to resolve (the notebook uses 0 and 1).

    Returns
    -------
    A new DataFrame; the frame passed in is not modified.
    """
    merged = df.merge(
        structures,
        how='left',
        left_on=['molecule_name', f'atom_{atom_idx}'],
        right_on=['molecule_name', 'atom_index'],
    )

    # Only the element symbol is wanted from `structures`.
    unwanted_columns = ['atom_index', 'x', 'y', 'z']
    trimmed = merged.drop(columns=unwanted_columns)

    return trimmed.rename(columns={'atom': f'atom_symbol_{atom_idx}'})

In [7]:
# Resolve the element symbol for both ends of every bond (atom_0, then atom_1).
for atom_idx in (0, 1):
    molecular_structures = map_atom_info(molecular_structures, atom_idx)

In [8]:
# Column order used from here on: each atom symbol sits next to its atom index.
list_order = [
    'molecule_name',
    'atom_0', 'atom_symbol_0',
    'atom_1', 'atom_symbol_1',
    'atom_1_1_level_NB', 'atom_1_2_level_NB',
    'atom_0_CH3', 'atom_1_CH3',
    'atom_0_CH2', 'atom_1_CH2',
    'atom_0_CH1', 'atom_1_CH1',
    'atom_0_CH0', 'atom_1_CH0',
    'atom_0_NH2', 'atom_1_NH2',
    'atom_0_NH1', 'atom_1_NH1',
    'atom_0_NH0', 'atom_1_NH0',
    'atom_0_OH1', 'atom_1_OH1',
    'atom_0_OH0', 'atom_1_OH0',
    'is_in_ring', 'has_aromatic_bond',
    'is_primary_amide', 'is_secondary_amide', 'is_tertiary_amide',
    'is_ester', 'is_carbonyl',
    'bond_order', 'bond_length',
]

molecular_structures = molecular_structures.loc[:, list_order]

#### Creating new features

Find neighbour atoms and functional groups

In [9]:
# First rows after adding the atom symbols and reordering the columns.
molecular_structures.head(n=5)

Unnamed: 0,molecule_name,atom_0,atom_symbol_0,atom_1,atom_symbol_1,atom_1_1_level_NB,atom_1_2_level_NB,atom_0_CH3,atom_1_CH3,atom_0_CH2,atom_1_CH2,atom_0_CH1,atom_1_CH1,atom_0_CH0,atom_1_CH0,atom_0_NH2,atom_1_NH2,atom_0_NH1,atom_1_NH1,atom_0_NH0,atom_1_NH0,atom_0_OH1,atom_1_OH1,atom_0_OH0,atom_1_OH0,is_in_ring,has_aromatic_bond,is_primary_amide,is_secondary_amide,is_tertiary_amide,is_ester,is_carbonyl,bond_order,bond_length
0,dsgdb9nsd_000001,3,H,0,C,set(),{},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091946
1,dsgdb9nsd_000001,2,H,0,C,set(),{},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091952
2,dsgdb9nsd_000001,1,H,0,C,set(),{},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091953
3,dsgdb9nsd_000001,4,H,0,C,set(),{},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091948
4,dsgdb9nsd_000002,3,H,0,N,set(),{},0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.017208


In [10]:
df_length = molecular_structures.shape[0]

# Text features start out as empty strings; they are filled per H-X bond later.
nb_text_columns = [
    'NB_left_atom', 'NB_left_group',
    'NB_center_atom', 'NB_center_group',
    'NB_right_atom', 'NB_right_group',
    'NB_all_groups',
]

# 0/1 flags start out as 0.
nb_flag_columns = [
    'NB_in_ring', 'NB_is_aromatic',
    'NB_is_primary_amide', 'NB_is_secondary_amide', 'NB_is_tertiary_amide',
    'NB_is_ester', 'NB_is_carbonyl', 'NB_is_hydroxyl',
]

for nb_column in nb_text_columns:
    molecular_structures[nb_column] = [''] * df_length

for nb_column in nb_flag_columns:
    molecular_structures[nb_column] = [0] * df_length

molecular_structures.head()

Unnamed: 0,molecule_name,atom_0,atom_symbol_0,atom_1,atom_symbol_1,atom_1_1_level_NB,atom_1_2_level_NB,atom_0_CH3,atom_1_CH3,atom_0_CH2,atom_1_CH2,atom_0_CH1,atom_1_CH1,atom_0_CH0,atom_1_CH0,atom_0_NH2,atom_1_NH2,atom_0_NH1,atom_1_NH1,atom_0_NH0,atom_1_NH0,atom_0_OH1,atom_1_OH1,atom_0_OH0,atom_1_OH0,is_in_ring,has_aromatic_bond,is_primary_amide,is_secondary_amide,is_tertiary_amide,is_ester,is_carbonyl,bond_order,bond_length,NB_left_atom,NB_left_group,NB_center_atom,NB_center_group,NB_right_atom,NB_right_group,NB_all_groups,NB_in_ring,NB_is_aromatic,NB_is_primary_amide,NB_is_secondary_amide,NB_is_tertiary_amide,NB_is_ester,NB_is_carbonyl,NB_is_hydroxyl
0,dsgdb9nsd_000001,3,H,0,C,set(),{},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091946,,,,,,,,0,0,0,0,0,0,0,0
1,dsgdb9nsd_000001,2,H,0,C,set(),{},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091952,,,,,,,,0,0,0,0,0,0,0,0
2,dsgdb9nsd_000001,1,H,0,C,set(),{},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091953,,,,,,,,0,0,0,0,0,0,0,0
3,dsgdb9nsd_000001,4,H,0,C,set(),{},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091948,,,,,,,,0,0,0,0,0,0,0,0
4,dsgdb9nsd_000002,3,H,0,N,set(),{},0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.017208,,,,,,,,0,0,0,0,0,0,0,0


In [11]:
from collections import defaultdict

# One group of bond rows per molecule; the feature loop below walks these groups.
molecular_structures_grouped = molecular_structures.groupby(by='molecule_name')


def mark_functional_groups(update_map, nb_row, use_atom_1):
    """Copy the functional-group information of one bond row into `update_map`.

    Parameters
    ----------
    update_map : dict
        Accumulator with the keys 'main_group', 'in_ring', 'is_aromatic',
        'is_primary_amide', 'is_secondary_amide', 'is_tertiary_amide',
        'is_ester', 'is_carbonyl' and 'is_hydroxyl'. It is modified in place.
    nb_row : mapping (a DataFrame row or a dict)
        Must provide the `atom_{0|1}_<group>` indicator columns and the
        bond-level flag columns read below.
    use_atom_1 : bool
        True reads the `atom_1_*` group indicators, False reads `atom_0_*`.

    Returns
    -------
    dict
        The same `update_map` object that was passed in.

    Notes
    -----
    * The first indicator equal to 1, in the order CH3, CH2, CH1, CH0, NH2,
      NH1, NH0, OH1, OH0, decides 'main_group'. If none is set, 'main_group'
      is left as it was.
    * 'OH1' additionally sets 'is_hydroxyl' to 1.
    * Flags are only ever raised to 1, never reset to 0, so the map can
      accumulate information over several rows.
    """
    prefix = 'atom_1' if use_atom_1 else 'atom_0'

    # Priority order matters: the first matching indicator wins.
    group_priority = ('CH3', 'CH2', 'CH1', 'CH0', 'NH2', 'NH1', 'NH0', 'OH1', 'OH0')
    for group_name in group_priority:
        if nb_row[f'{prefix}_{group_name}'] == 1:
            update_map['main_group'] = group_name
            if group_name == 'OH1':
                update_map['is_hydroxyl'] = 1
            break

    # Bond-level flags are not per-atom, so they are read the same way for both ends.
    flag_sources = (
        ('is_in_ring', 'in_ring'),
        ('has_aromatic_bond', 'is_aromatic'),
        ('is_primary_amide', 'is_primary_amide'),
        ('is_secondary_amide', 'is_secondary_amide'),
        ('is_tertiary_amide', 'is_tertiary_amide'),
        ('is_ester', 'is_ester'),
        ('is_carbonyl', 'is_carbonyl'),
    )
    for source_column, map_key in flag_sources:
        if nb_row[source_column] == 1:
            update_map[map_key] = 1

    return update_map


def update_row(where, index, use_atom_1, nb_indexes_list, special_bond):
    """Write the features of one neighbour atom into row `index` of `molecular_structures`.

    Parameters
    ----------
    where : int
        Target slot: -1 fills the NB_left_* columns, 0 the NB_center_* columns
        and 1 the NB_right_* columns. Any other value leaves the slot columns alone.
    index : row label of the row being decorated.
    use_atom_1 : bool
        True reads the neighbour's symbol and group from the `atom_1` side of
        its bond rows, False from the `atom_0` side.
    nb_indexes_list : list of row labels, or a falsy value
        Bond rows describing the neighbour. A falsy value (the `0` produced by
        a `defaultdict(int)` miss, or an empty list) means "no rows found".
    special_bond : bool
        When True the group label is written with a leading '*'.

    Returns
    -------
    None. The module-level `molecular_structures` frame is modified in place.

    Notes
    -----
    * If several rows are given, the symbol of the last row wins, and group
      information accumulates through `mark_functional_groups`.
    * The NB_* flag columns are only written while they are still 0, so a flag
      raised by an earlier neighbour is never cleared.
    """
    # where have values -1: left; 0: center; 1: right

    # Nothing known about this neighbour: do not touch the row at all, rather
    # than overwriting the slot with blank values.
    if not nb_indexes_list:
        return

    update_map = {'atom': '', 'main_group': '', 'in_ring': 0, 'is_aromatic': 0, 'is_primary_amide': 0,
                  'is_secondary_amide': 0, 'is_tertiary_amide': 0, 'is_ester': 0, 'is_carbonyl': 0,
                  'is_hydroxyl': 0}

    symbol_column = 'atom_symbol_1' if use_atom_1 else 'atom_symbol_0'
    for nb_index in nb_indexes_list:
        # The entries are row labels (they come from iterrows), so use the
        # label-based accessor, consistent with the `.at` writes below.
        nb_row = molecular_structures.loc[nb_index]
        update_map['atom'] = nb_row[symbol_column]
        update_map = mark_functional_groups(update_map, nb_row, use_atom_1)

    slot = {-1: 'left', 0: 'center', 1: 'right'}.get(where)
    if slot is not None:
        group_label = update_map['main_group']
        if special_bond:
            group_label = '*' + group_label
        molecular_structures.at[index, f'NB_{slot}_atom'] = update_map['atom']
        molecular_structures.at[index, f'NB_{slot}_group'] = group_label

    # Flags are sticky: only fill a flag that has not been raised yet.
    for flag in ('in_ring', 'is_aromatic', 'is_primary_amide', 'is_secondary_amide',
                 'is_tertiary_amide', 'is_ester', 'is_carbonyl', 'is_hydroxyl'):
        flag_column = f'NB_{flag}'
        if molecular_structures.at[index, flag_column] == 0:
            molecular_structures.at[index, flag_column] = update_map[flag]
              
    
# Separator characters of the textual set representation. Kept at module
# level in case other cells refer to it; the parser below no longer needs it.
exclude_char = [' ', ',']


def turn_str_into_int_list(string):
    """Parse the text form of a set of atom indices, e.g. "{1, 4}", into a list of ints.

    Parameters
    ----------
    string : str
        Text such as "{0, 2}" or "set()". The first and last characters are
        treated as the enclosing braces and discarded.

    Returns
    -------
    list of int
        The indices in the order they appear in the text. An empty list is
        returned for a falsy input, for "{}" and for any text containing the
        letter 's' (which covers the "set()" form of an empty set).

    Notes
    -----
    The text is split on commas, so indices with more than one digit are kept
    whole ("{10, 12}" gives [10, 12]) instead of being read digit by digit.
    """
    if not string or 's' in string:
        return []

    # int() ignores the blank that follows each comma.
    return [int(token) for token in string[1:-1].split(',') if token.strip()]


def _find_neighbour_rows(neighbour_atom, rows_by_atom_1, rows_by_atom_0):
    """Return (row_labels, use_atom_1) for the bond rows describing `neighbour_atom`.

    Rows that list the atom as `atom_1` are preferred; rows that list it as
    `atom_0` are the fallback. `row_labels` is an empty list when the atom is
    in neither mapping. The mappings are not modified by the lookup.
    """
    row_labels = rows_by_atom_1.get(neighbour_atom)
    if row_labels:
        return row_labels, True
    return rows_by_atom_0.get(neighbour_atom, []), False


counter = 0
for name, group in molecular_structures_grouped:  # PER MOLECULE

    counter += 1
    if counter % 5000 == 0:
        print(f'Molecules processed: {counter}')

    # Row labels of this molecule's bonds, keyed by the atom index found in
    # the atom_0 column and in the atom_1 column respectively.
    group_dict_0 = defaultdict(list)
    group_dict_1 = defaultdict(list)
    for index, row in group.iterrows():
        group_dict_0[row.atom_0].append(index)
        group_dict_1[row.atom_1].append(index)

    for index, row in group.iterrows():  # PER ROW IN MOLECULE
        # Only H-X bonds are decorated with the extra features.
        if row.atom_symbol_0 != 'H':
            continue

        parent_atom_index = row.atom_1

        # First-level neighbours of the atom the hydrogen is attached to.
        no_of_NBs = turn_str_into_int_list(row.atom_1_1_level_NB)
        no_of_NBs_length = len(no_of_NBs)

        if no_of_NBs_length == 1:
            key_list, use_atom_1 = _find_neighbour_rows(no_of_NBs[0], group_dict_1, group_dict_0)

            # A lower atom index goes to the left slot, otherwise to the right slot.
            where = -1 if no_of_NBs[0] < parent_atom_index else 1
            update_row(where, index, use_atom_1, key_list, False)

            # Terminal N / O (e.g. =O, #N): when the single bond row of the
            # neighbour has N or O as atom_0 with a higher index than atom_1,
            # also record it in the center slot, marked with '*'.
            if len(key_list) == 1:
                # key_list holds row labels, hence the label-based lookup.
                nb_row = molecular_structures.loc[key_list[0]]
                if nb_row['atom_symbol_0'] in ['N', 'O']:
                    if int(nb_row['atom_0']) > int(nb_row['atom_1']):
                        update_row(0, index, False, key_list, True)

        elif no_of_NBs_length == 2:
            key_list_0, use_atom_1_0 = _find_neighbour_rows(no_of_NBs[0], group_dict_1, group_dict_0)
            key_list_1, use_atom_1_1 = _find_neighbour_rows(no_of_NBs[1], group_dict_1, group_dict_0)

            if no_of_NBs[0] < parent_atom_index:
                # First neighbour goes left; the second goes to the center or
                # the right slot depending on its index relative to the parent.
                update_row(-1, index, use_atom_1_0, key_list_0, False)
                if no_of_NBs[1] <= parent_atom_index:
                    update_row(0, index, use_atom_1_1, key_list_1, False)
                else:
                    update_row(1, index, use_atom_1_1, key_list_1, False)
            else:
                # Update the center and right slots.
                update_row(0, index, use_atom_1_0, key_list_0, False)
                update_row(1, index, use_atom_1_1, key_list_1, False)

        elif no_of_NBs_length == 3:
            # Three neighbours fill left, center and right in listed order.
            for where, neighbour_atom in zip((-1, 0, 1), no_of_NBs):
                key_list, use_atom_1 = _find_neighbour_rows(neighbour_atom, group_dict_1, group_dict_0)
                update_row(where, index, use_atom_1, key_list, False)

        # Rows with no neighbours, or with more than three, are left unchanged.
 

Molecules processed: 5000
Molecules processed: 10000
Molecules processed: 15000
Molecules processed: 20000
Molecules processed: 25000
Molecules processed: 30000
Molecules processed: 35000
Molecules processed: 40000
Molecules processed: 45000
Molecules processed: 50000
Molecules processed: 55000
Molecules processed: 60000
Molecules processed: 65000
Molecules processed: 70000
Molecules processed: 75000
Molecules processed: 80000
Molecules processed: 85000


In [12]:
molecular_structures.loc[molecular_structures['molecule_name'] == 'dsgdb9nsd_050000']

Unnamed: 0,molecule_name,atom_0,atom_symbol_0,atom_1,atom_symbol_1,atom_1_1_level_NB,atom_1_2_level_NB,atom_0_CH3,atom_1_CH3,atom_0_CH2,atom_1_CH2,atom_0_CH1,atom_1_CH1,atom_0_CH0,atom_1_CH0,atom_0_NH2,atom_1_NH2,atom_0_NH1,atom_1_NH1,atom_0_NH0,atom_1_NH0,atom_0_OH1,atom_1_OH1,atom_0_OH0,atom_1_OH0,is_in_ring,has_aromatic_bond,is_primary_amide,is_secondary_amide,is_tertiary_amide,is_ester,is_carbonyl,bond_order,bond_length,NB_left_atom,NB_left_group,NB_center_atom,NB_center_group,NB_right_atom,NB_right_group,NB_all_groups,NB_in_ring,NB_is_aromatic,NB_is_primary_amide,NB_is_secondary_amide,NB_is_tertiary_amide,NB_is_ester,NB_is_carbonyl,NB_is_hydroxyl
547174,dsgdb9nsd_050000,1,C,0,O,{1},{1: {2}},0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,2,1.204581,,,,,,,,0,0,0,0,0,0,0,0
547175,dsgdb9nsd_050000,2,C,1,C,"{0, 2}","{0: set(), 2: {3, 4}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.494827,,,,,,,,0,0,0,0,0,0,0,0
547176,dsgdb9nsd_050000,9,H,1,C,"{0, 2}","{0: set(), 2: {3, 4}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.112993,O,OH0,,,C,CH1,,1,0,0,0,0,0,1,0
547177,dsgdb9nsd_050000,4,C,2,C,"{1, 3, 4}","{4: {8, 3, 5}, 3: {4}, 1: {0}}",0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1.481095,,,,,,,,0,0,0,0,0,0,0,0
547178,dsgdb9nsd_050000,3,O,2,C,"{1, 3, 4}","{4: {8, 3, 5}, 3: {4}, 1: {0}}",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,1.438324,,,,,,,,0,0,0,0,0,0,0,0
547179,dsgdb9nsd_050000,10,H,2,C,"{1, 3, 4}","{4: {8, 3, 5}, 3: {4}, 1: {0}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.089645,C,CH1,O,OH0,C,CH0,,1,0,0,0,0,0,0,0
547180,dsgdb9nsd_050000,4,C,3,O,"{2, 4}","{4: {8, 2, 5}, 2: {1, 4}}",0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1.399189,,,,,,,,0,0,0,0,0,0,0,0
547181,dsgdb9nsd_050000,5,C,4,C,"{8, 2, 3, 5}","{5: {6}, 8: {6, 7}, 3: {2}, 2: {1, 3}}",0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1.544526,,,,,,,,0,0,0,0,0,0,0,0
547182,dsgdb9nsd_050000,8,C,4,C,"{8, 2, 3, 5}","{5: {6}, 8: {6, 7}, 3: {2}, 2: {1, 3}}",0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1.515803,,,,,,,,0,0,0,0,0,0,0,0
547183,dsgdb9nsd_050000,11,H,5,C,"{4, 6}","{6: {8, 7}, 4: {8, 2, 3}}",0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091936,C,CH0,,,C,CH1,,1,0,0,0,0,0,0,0


In [13]:
def calculate(row):
    """Build the combined neighbour-group label for one row.

    Reads 'NB_left_group', 'NB_center_group' and 'NB_right_group' from `row`,
    keeps the non-empty ones, sorts them and joins them with '-'.
    Returns an empty string when all three are empty.
    """
    slot_columns = ('NB_left_group', 'NB_center_group', 'NB_right_group')
    slot_values = [row[column] for column in slot_columns]

    # Sorting makes the label independent of which slot a group landed in.
    present_groups = sorted(value for value in slot_values if value)

    return "-".join(present_groups)

In [14]:
# Row-wise: combine the three slot labels into a single sorted label.
molecular_structures['NB_all_groups'] = molecular_structures.apply(calculate, axis=1)

In [15]:
# Let's test it is correct

# Spot check: show only the neighbour features for one known molecule.
selected_columns = [
    'atom_0', 'atom_symbol_0', 'atom_1', 'atom_symbol_1', 'atom_1_1_level_NB',
    'NB_left_group', 'NB_center_group', 'NB_right_group', 'NB_all_groups',
    'NB_in_ring', 'NB_is_aromatic',
    'NB_is_primary_amide', 'NB_is_secondary_amide', 'NB_is_tertiary_amide',
    'NB_is_ester', 'NB_is_carbonyl', 'NB_is_hydroxyl',
]

df_test = molecular_structures[molecular_structures['molecule_name'].eq('dsgdb9nsd_050000')]

df_test[selected_columns]

Unnamed: 0,atom_0,atom_symbol_0,atom_1,atom_symbol_1,atom_1_1_level_NB,NB_left_group,NB_center_group,NB_right_group,NB_all_groups,NB_in_ring,NB_is_aromatic,NB_is_primary_amide,NB_is_secondary_amide,NB_is_tertiary_amide,NB_is_ester,NB_is_carbonyl,NB_is_hydroxyl
547174,1,C,0,O,{1},,,,,0,0,0,0,0,0,0,0
547175,2,C,1,C,"{0, 2}",,,,,0,0,0,0,0,0,0,0
547176,9,H,1,C,"{0, 2}",OH0,,CH1,CH1-OH0,1,0,0,0,0,0,1,0
547177,4,C,2,C,"{1, 3, 4}",,,,,0,0,0,0,0,0,0,0
547178,3,O,2,C,"{1, 3, 4}",,,,,0,0,0,0,0,0,0,0
547179,10,H,2,C,"{1, 3, 4}",CH1,OH0,CH0,CH0-CH1-OH0,1,0,0,0,0,0,0,0
547180,4,C,3,O,"{2, 4}",,,,,0,0,0,0,0,0,0,0
547181,5,C,4,C,"{8, 2, 3, 5}",,,,,0,0,0,0,0,0,0,0
547182,8,C,4,C,"{8, 2, 3, 5}",,,,,0,0,0,0,0,0,0,0
547183,11,H,5,C,"{4, 6}",CH0,,CH1,CH0-CH1,1,0,0,0,0,0,0,0


In [16]:
# Write the enriched table next to the input files. The path is built from
# `input_folder` so that loading and saving always use the same folder.
molecular_structures.to_csv(f'{input_folder}/molecular_structures_with_NB.csv', header=True, index=False)