### The Data
This book will manipulate the molecular_structures.csv file to create even better features for the different 1JHX models


In [1]:
import pandas as pd

pd.set_option('display.max_columns', None)

input_folder = './input'

structures = pd.read_csv(f'{input_folder}/structures.csv')
molecular_structures = pd.read_csv(f'{input_folder}/molecule_structures.csv')


In [3]:
structures.loc[structures['molecule_name'] == 'dsgdb9nsd_000011']

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
46,dsgdb9nsd_000011,0,C,-0.002945,1.509914,0.008673
47,dsgdb9nsd_000011,1,C,0.026083,0.003276,-0.037459
48,dsgdb9nsd_000011,2,O,0.942288,-0.65507,-0.456826
49,dsgdb9nsd_000011,3,H,0.922788,1.926342,-0.391466
50,dsgdb9nsd_000011,4,H,-0.862015,1.878525,-0.564795
51,dsgdb9nsd_000011,5,H,-0.150506,1.843934,1.042891
52,dsgdb9nsd_000011,6,H,-0.89443,-0.486434,0.357749


In [4]:
f'Structures shape: {structures.shape}'

'Structures shape: (2358657, 6)'

In [62]:
molecular_structures.loc[molecular_structures['molecule_name'] == 'dsgdb9nsd_000011']

Unnamed: 0,molecule_name,atom_0,atom_symbol_0,atom_1,atom_symbol_1,atom_1_1_level_NB,atom_1_2_level_NB,atom_0_CH3,atom_1_CH3,atom_0_CH2,atom_1_CH2,atom_0_CH1,atom_1_CH1,atom_0_CH0,atom_1_CH0,atom_0_NH2,atom_1_NH2,atom_0_NH1,atom_1_NH1,atom_0_NH0,atom_1_NH0,atom_0_OH1,atom_1_OH1,atom_0_OH0,atom_1_OH0,is_in_ring,has_aromatic_bond,is_primary_amide,is_secondary_amide,is_tertiary_amide,is_ester,is_carbonyl,bond_order,bond_length,NB_left_atom,NB_left_group,NB_center_atom,NB_center_group,NB_right_atom,NB_right_group,NB_in_ring,NB_is_aromatic,NB_is_primary_amide,NB_is_secondary_amide,NB_is_tertiary_amide,NB_is_ester,NB_is_carbonyl,NB_is_hydroxyl
34,dsgdb9nsd_000011,4,H,0,C,{1},{1: {2}},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.096696,0,0,0,0,0,0,0,0,0,0,0,0,0,0
35,dsgdb9nsd_000011,3,H,0,C,{1},{1: {2}},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091103,0,0,0,0,0,0,0,0,0,0,0,0,0,0
36,dsgdb9nsd_000011,1,C,0,C,{1},{1: {2}},0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.507624,0,0,0,0,0,0,0,0,0,0,0,0,0,0
37,dsgdb9nsd_000011,5,H,0,C,{1},{1: {2}},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.096791,0,0,0,0,0,0,0,0,0,0,0,0,0,0
38,dsgdb9nsd_000011,2,O,1,C,"{0, 2}","{2: set(), 0: set()}",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,2,1.203628,0,0,0,0,0,0,0,0,0,0,0,0,0,0
39,dsgdb9nsd_000011,6,H,1,C,"{0, 2}","{2: set(), 0: set()}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.115056,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
f'Molecular_structures shape: {molecular_structures.shape}'

'Molecular_structures shape: (1586325, 32)'

In [7]:
def map_atom_info(df, atom_idx):
    df = pd.merge(df, structures, how = 'left',
                  left_on  = ['molecule_name', f'atom_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    
    df = df.drop('atom_index', axis=1)

    df = df.drop('x', axis=1)
    df = df.drop('y', axis=1)
    df = df.drop('z', axis=1)

    df = df.rename(columns={'atom': f'atom_symbol_{atom_idx}'})
    
    return df

In [8]:
molecular_structures = map_atom_info(molecular_structures, 0)
molecular_structures = map_atom_info(molecular_structures, 1)

In [11]:
molecular_structures.columns'

list_order = ['molecule_name', 'atom_0',  'atom_symbol_0', 'atom_1', 'atom_symbol_1', 'atom_1_1_level_NB',
       'atom_1_2_level_NB', 'atom_0_CH3', 'atom_1_CH3', 'atom_0_CH2',
       'atom_1_CH2', 'atom_0_CH1', 'atom_1_CH1', 'atom_0_CH0', 'atom_1_CH0',
       'atom_0_NH2', 'atom_1_NH2', 'atom_0_NH1', 'atom_1_NH1', 'atom_0_NH0',
       'atom_1_NH0', 'atom_0_OH1', 'atom_1_OH1', 'atom_0_OH0', 'atom_1_OH0',
       'is_in_ring', 'has_aromatic_bond', 'is_primary_amide',
       'is_secondary_amide', 'is_tertiary_amide', 'is_ester', 'is_carbonyl',
       'bond_order', 'bond_length']

molecular_structures = molecular_structures[list_order]

#### Creating new features

Find neighbour atoms and functional groups

In [60]:
# 2 left oriented features

df_length = molecular_structures.shape[0]

molecular_structures['NB_left_atom'] = [0]*df_length
molecular_structures['NB_left_group'] = [0]*df_length

molecular_structures['NB_center_atom'] = [0]*df_length
molecular_structures['NB_center_group'] = [0]*df_length

molecular_structures['NB_right_atom'] = [0]*df_length
molecular_structures['NB_right_group'] = [0]*df_length

molecular_structures['NB_in_ring'] = [0]*df_length
molecular_structures['NB_is_aromatic'] = [0]*df_length

molecular_structures['NB_is_primary_amide'] = [0]*df_length
molecular_structures['NB_is_secondary_amide'] = [0]*df_length
molecular_structures['NB_is_tertiary_amide'] = [0]*df_length
molecular_structures['NB_is_ester'] = [0]*df_length
molecular_structures['NB_is_carbonyl'] = [0]*df_length
molecular_structures['NB_is_hydroxyl'] = [0]*df_length


#new_features = pd.DataFrame([NB_left_atom, NB_left_group, NB_center_atom, NB_center_group, NB_right_atom, NB_right_group,
#                            NB_in_ring, NB_is_aromatic], columns=['NB_left_atom', 'NB_left_group', 'NB_center_atom', 
#                                                                  'NB_center_group', 'NB_right_atom', 'NB_right_group',
#                                                                  'NB_in_ring', 'NB_is_aromatic'])

molecular_structures.head()

Unnamed: 0,molecule_name,atom_0,atom_symbol_0,atom_1,atom_symbol_1,atom_1_1_level_NB,atom_1_2_level_NB,atom_0_CH3,atom_1_CH3,atom_0_CH2,atom_1_CH2,atom_0_CH1,atom_1_CH1,atom_0_CH0,atom_1_CH0,atom_0_NH2,atom_1_NH2,atom_0_NH1,atom_1_NH1,atom_0_NH0,atom_1_NH0,atom_0_OH1,atom_1_OH1,atom_0_OH0,atom_1_OH0,is_in_ring,has_aromatic_bond,is_primary_amide,is_secondary_amide,is_tertiary_amide,is_ester,is_carbonyl,bond_order,bond_length,NB_left_atom,NB_left_group,NB_center_atom,NB_center_group,NB_right_atom,NB_right_group,NB_in_ring,NB_is_aromatic
0,dsgdb9nsd_000001,3,H,0,C,set(),{},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091946,0,0,0,0,0,0,0,0
1,dsgdb9nsd_000001,2,H,0,C,set(),{},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091952,0,0,0,0,0,0,0,0
2,dsgdb9nsd_000001,1,H,0,C,set(),{},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091953,0,0,0,0,0,0,0,0
3,dsgdb9nsd_000001,4,H,0,C,set(),{},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091948,0,0,0,0,0,0,0,0
4,dsgdb9nsd_000002,3,H,0,N,set(),{},0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.017208,0,0,0,0,0,0,0,0


In [52]:
#molecular_structures.loc[molecular_structures['molecule_name'] == 'dsgdb9nsd_000050']
selected_molecules = molecular_structures.iloc[246:265]


In [53]:

selected_molecules

Unnamed: 0,molecule_name,atom_0,atom_symbol_0,atom_1,atom_symbol_1,atom_1_1_level_NB,atom_1_2_level_NB,atom_0_CH3,atom_1_CH3,atom_0_CH2,atom_1_CH2,atom_0_CH1,atom_1_CH1,atom_0_CH0,atom_1_CH0,atom_0_NH2,atom_1_NH2,atom_0_NH1,atom_1_NH1,atom_0_NH0,atom_1_NH0,atom_0_OH1,atom_1_OH1,atom_0_OH0,atom_1_OH0,is_in_ring,has_aromatic_bond,is_primary_amide,is_secondary_amide,is_tertiary_amide,is_ester,is_carbonyl,bond_order,bond_length
246,dsgdb9nsd_000050,1,C,0,N,"{1, 4}","{1: {2}, 4: {3}}",0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1.372028
247,dsgdb9nsd_000050,4,C,0,N,"{1, 4}","{1: {2}, 4: {3}}",0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1.372027
248,dsgdb9nsd_000050,5,H,0,N,"{1, 4}","{1: {2}, 4: {3}}",0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.004448
249,dsgdb9nsd_000050,2,C,1,C,"{0, 2}","{2: {3}, 0: {4}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,2,1.375401
250,dsgdb9nsd_000050,6,H,1,C,"{0, 2}","{2: {3}, 0: {4}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.07811
251,dsgdb9nsd_000050,7,H,2,C,"{1, 3}","{3: {4}, 1: {0}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.079169
252,dsgdb9nsd_000050,3,C,2,C,"{1, 3}","{3: {4}, 1: {0}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1.423545
253,dsgdb9nsd_000050,8,H,3,C,"{2, 4}","{2: {1}, 4: {0}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.079169
254,dsgdb9nsd_000050,4,C,3,C,"{2, 4}","{2: {1}, 4: {0}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,2,1.375401
255,dsgdb9nsd_000050,9,H,4,C,"{0, 3}","{3: {2}, 0: {1}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.078109


In [51]:
molecular_structures_grouped = selected_molecules.groupby('molecule_name')

for name, group in molecular_structures_grouped:
    group_dict = {}
    for index, row in group.iterrows(): 
        if row.atom_1 in group_dict:
             group_dict[row.atom_1].append(index)
        else:
            group_dict[row.atom_1] = [index]
    print(group_dict)

{0: [246, 247, 248], 1: [249, 250], 2: [251, 252], 3: [253, 254], 4: [255]}
{0: [256, 257, 258], 1: [259, 260], 2: [261, 262], 3: [263], 4: [264]}


In [48]:
molecular_structures_grouped = selected_molecules.groupby('molecule_name')

print(molecular_structures_grouped.groups)
print()
print(molecular_structures_grouped.indices)
print()


for name, group in molecular_structures_grouped:
    group_dict = {}
    for index, row in group.iterrows(): 
        if row.atom_1 in group_dict:
             group_dict[row.atom_1].append(index)
        else:
            group_dict[row.atom_1] = [index]

    print(group_dict)    for index, row in group.iterrows():     
    print(f'Index: {index}', row.atom_symbol_0) 
    # Only work of H-X bonds, that is, decorate those with extra features
    if row.atom_symbol_0 == 'H':            
        parent_atom_index = row.atom_1
        print(f'Atom 0 is Hydrogen it is connected to {parent_atom_index} : {row.atom_symbol_1}')
        NB_level_1 = row.atom_1_1_level_NB
        for atom in NB_level_1:
            
            
        
    print()

{'dsgdb9nsd_000050': Int64Index([246, 247, 248, 249, 250, 251, 252, 253, 254, 255], dtype='int64'), 'dsgdb9nsd_000051': Int64Index([256, 257, 258, 259, 260, 261, 262, 263, 264], dtype='int64')}

{'dsgdb9nsd_000050': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64), 'dsgdb9nsd_000051': array([10, 11, 12, 13, 14, 15, 16, 17, 18], dtype=int64)}



KeyError: 'Column not found: 1'

In [None]:
for i, j in molecular_structures.loc[molecular_structures['molecule_name'] == 'dsgdb9nsd_000050'].iterrows(): 
    
    print(f'I: {i}', j.atom_symbol_0) 
    # Only work of H-X bonds, that is, decorate those with extra features
    if  j.atom_symbol_0 == 'H':
        print('Is Hydrogen bond')
        
    print()

#### Creating new features

Find neighbour atoms and functional groups

In [None]:
# 2 left oriented features
NB_left_atom = [0]*
NB_left_group = [0]