### The Data
This notebook manipulates the molecule_structures.csv file to create better features for the different 1JHX models.


In [1]:
import pandas as pd

# Never truncate the column list when a wide frame is displayed.
pd.set_option('display.max_columns', None)

# Folder that holds the input CSV files, relative to the notebook.
input_folder = './input'

# Atom table (one row per atom) and the bond-level feature table.
structures = pd.read_csv(input_folder + '/structures.csv')
molecular_structures = pd.read_csv(input_folder + '/molecule_structures.csv')


In [3]:
# All atom rows of one example molecule.
structures[structures['molecule_name'].eq('dsgdb9nsd_000011')]

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
46,dsgdb9nsd_000011,0,C,-0.002945,1.509914,0.008673
47,dsgdb9nsd_000011,1,C,0.026083,0.003276,-0.037459
48,dsgdb9nsd_000011,2,O,0.942288,-0.65507,-0.456826
49,dsgdb9nsd_000011,3,H,0.922788,1.926342,-0.391466
50,dsgdb9nsd_000011,4,H,-0.862015,1.878525,-0.564795
51,dsgdb9nsd_000011,5,H,-0.150506,1.843934,1.042891
52,dsgdb9nsd_000011,6,H,-0.89443,-0.486434,0.357749


In [4]:
# (rows, columns) of the atom table.
'Structures shape: ' + str(structures.shape)

'Structures shape: (2358657, 6)'

In [5]:
# All bond rows of the same example molecule.
molecular_structures.query("molecule_name == 'dsgdb9nsd_000011'")

Unnamed: 0,molecule_name,atom_0,atom_1,atom_1_1_level_NB,atom_1_2_level_NB,atom_0_CH3,atom_1_CH3,atom_0_CH2,atom_1_CH2,atom_0_CH1,atom_1_CH1,atom_0_CH0,atom_1_CH0,atom_0_NH2,atom_1_NH2,atom_0_NH1,atom_1_NH1,atom_0_NH0,atom_1_NH0,atom_0_OH1,atom_1_OH1,atom_0_OH0,atom_1_OH0,is_in_ring,has_aromatic_bond,is_primary_amide,is_secondary_amide,is_tertiary_amide,is_ester,is_carbonyl,bond_order,bond_length
34,dsgdb9nsd_000011,4,0,{1},{1: {2}},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.096696
35,dsgdb9nsd_000011,3,0,{1},{1: {2}},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.091103
36,dsgdb9nsd_000011,1,0,{1},{1: {2}},0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.507624
37,dsgdb9nsd_000011,5,0,{1},{1: {2}},1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.096791
38,dsgdb9nsd_000011,2,1,"{0, 2}","{2: set(), 0: set()}",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,2,1.203628
39,dsgdb9nsd_000011,6,1,"{0, 2}","{2: set(), 0: set()}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.115056


In [6]:
# (rows, columns) of the bond-level feature table.
'Molecular_structures shape: ' + str(molecular_structures.shape)

'Molecular_structures shape: (1586325, 32)'

In [7]:
def map_atom_info(df, atom_idx, structures_df=None):
    """Attach the element symbol of one bond endpoint to every row of ``df``.

    Rows of ``df`` are left-joined against the atom table on
    ``molecule_name`` and ``atom_{atom_idx}`` (matched to ``atom_index``);
    the matching ``atom`` value is added as ``atom_symbol_{atom_idx}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``molecule_name`` and ``atom_{atom_idx}`` columns.
    atom_idx : int or str
        Suffix selecting the endpoint column, e.g. ``0`` for ``atom_0``.
    structures_df : pandas.DataFrame, optional
        Atom table with ``molecule_name``, ``atom_index`` and ``atom``
        columns. Defaults to the notebook-level ``structures`` frame.

    Returns
    -------
    pandas.DataFrame
        A new frame: the columns of ``df`` followed by
        ``atom_symbol_{atom_idx}`` (missing where no atom matched).
    """
    if structures_df is None:
        structures_df = structures

    symbol_col = f'atom_symbol_{atom_idx}'

    # Bring along only the join keys and the symbol. Merging the full table
    # and dropping x/y/z afterwards breaks when ``df`` has its own x, y or z
    # column, because merge renames the clashing columns.
    atom_lookup = (
        structures_df[['molecule_name', 'atom_index', 'atom']]
        .rename(columns={'atom': symbol_col})
    )

    merged = pd.merge(df, atom_lookup, how='left',
                      left_on=['molecule_name', f'atom_{atom_idx}'],
                      right_on=['molecule_name', 'atom_index'])

    # atom_index only repeats atom_{atom_idx} after the join.
    return merged.drop(columns='atom_index')

In [8]:
# Add atom_symbol_0 and atom_symbol_1, one bond endpoint at a time.
for atom_idx in (0, 1):
    molecular_structures = map_atom_info(molecular_structures, atom_idx)

In [11]:
# Column order for the feature table: each atom index column is followed
# directly by its symbol column; every other column keeps its position.
list_order = ['molecule_name', 'atom_0',  'atom_symbol_0', 'atom_1', 'atom_symbol_1', 'atom_1_1_level_NB',
       'atom_1_2_level_NB', 'atom_0_CH3', 'atom_1_CH3', 'atom_0_CH2',
       'atom_1_CH2', 'atom_0_CH1', 'atom_1_CH1', 'atom_0_CH0', 'atom_1_CH0',
       'atom_0_NH2', 'atom_1_NH2', 'atom_0_NH1', 'atom_1_NH1', 'atom_0_NH0',
       'atom_1_NH0', 'atom_0_OH1', 'atom_1_OH1', 'atom_0_OH0', 'atom_1_OH0',
       'is_in_ring', 'has_aromatic_bond', 'is_primary_amide',
       'is_secondary_amide', 'is_tertiary_amide', 'is_ester', 'is_carbonyl',
       'bond_order', 'bond_length']

# Selecting with a list of names returns the columns in exactly that order.
molecular_structures = molecular_structures[list_order]

In [34]:
# Positional rows 246..264. The printed output further down shows that these
# are the rows of dsgdb9nsd_000050 and dsgdb9nsd_000051.
selected_molecules = molecular_structures.iloc[246:265, :]


In [41]:

# Display the rows picked by the positional slice.
selected_molecules

Unnamed: 0,molecule_name,atom_0,atom_symbol_0,atom_1,atom_symbol_1,atom_1_1_level_NB,atom_1_2_level_NB,atom_0_CH3,atom_1_CH3,atom_0_CH2,atom_1_CH2,atom_0_CH1,atom_1_CH1,atom_0_CH0,atom_1_CH0,atom_0_NH2,atom_1_NH2,atom_0_NH1,atom_1_NH1,atom_0_NH0,atom_1_NH0,atom_0_OH1,atom_1_OH1,atom_0_OH0,atom_1_OH0,is_in_ring,has_aromatic_bond,is_primary_amide,is_secondary_amide,is_tertiary_amide,is_ester,is_carbonyl,bond_order,bond_length
246,dsgdb9nsd_000050,1,C,0,N,"{1, 4}","{1: {2}, 4: {3}}",0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1.372028
247,dsgdb9nsd_000050,4,C,0,N,"{1, 4}","{1: {2}, 4: {3}}",0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1.372027
248,dsgdb9nsd_000050,5,H,0,N,"{1, 4}","{1: {2}, 4: {3}}",0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.004448
249,dsgdb9nsd_000050,2,C,1,C,"{0, 2}","{2: {3}, 0: {4}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,2,1.375401
250,dsgdb9nsd_000050,6,H,1,C,"{0, 2}","{2: {3}, 0: {4}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.07811
251,dsgdb9nsd_000050,7,H,2,C,"{1, 3}","{3: {4}, 1: {0}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.079169
252,dsgdb9nsd_000050,3,C,2,C,"{1, 3}","{3: {4}, 1: {0}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1.423545
253,dsgdb9nsd_000050,8,H,3,C,"{2, 4}","{2: {1}, 4: {0}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.079169
254,dsgdb9nsd_000050,4,C,3,C,"{2, 4}","{2: {1}, 4: {0}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,2,1.375401
255,dsgdb9nsd_000050,9,H,4,C,"{0, 3}","{3: {2}, 0: {1}}",0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.078109


In [40]:
# Walk the selected rows molecule by molecule and flag every row whose
# atom_symbol_0 is a hydrogen.
molecular_structures_grouped = selected_molecules.groupby('molecule_name')

for name, group in molecular_structures_grouped:
    print(name)
    print('')
    for row in group.itertuples():
        print(f'I: {row.Index}', row.atom_symbol_0)
        # Only H-X rows are meant to be decorated with extra features.
        if row.atom_symbol_0 == 'H':
            print('Is Hydrogen bond')

    print()

dsgdb9nsd_000050

I: 246 C
I: 247 C
I: 248 H
Is Hydrogen bond
I: 249 C
I: 250 H
Is Hydrogen bond
I: 251 H
Is Hydrogen bond
I: 252 C
I: 253 H
Is Hydrogen bond
I: 254 C
I: 255 H
Is Hydrogen bond

dsgdb9nsd_000051

I: 256 C
I: 257 C
I: 258 H
Is Hydrogen bond
I: 259 C
I: 260 H
Is Hydrogen bond
I: 261 H
Is Hydrogen bond
I: 262 N
I: 263 C
I: 264 H
Is Hydrogen bond



In [None]:
# Same per-row check, restricted to a single molecule selected by name.
molecule_rows = molecular_structures[molecular_structures['molecule_name'] == 'dsgdb9nsd_000050']

for row in molecule_rows.itertuples():
    print(f'I: {row.Index}', row.atom_symbol_0)
    # Only H-X rows are meant to be decorated with extra features.
    if row.atom_symbol_0 == 'H':
        print('Is Hydrogen bond')

    # Blank line after every row.
    print()

#### Creating new features

Find neighbour atoms and functional groups

In [None]:
# 2 left oriented features
NB_left_atom = []
NB_left_group = []