## First Notes

Please do not run the file as whole, as some cells include some irrelevant code that was created for testing some examples. The cells to be ignored are noted by a **BEGIN:** and **END;** notations.

### Modules Import

In [1]:
from __future__ import print_function
from rdkit import Chem

### Reading data

Reading a set of molecules an `SDMolSupplier` from rdkit.<br>
In the [getting started](https://www.rdkit.org/docs/GettingStartedInPython.html) documentation there're two these two different suppliers, but I haven't looked up the difference yet:
* `rdkit.Chem.rdmolfiles.SDMolSupplier`
* `rdkit.Chem.rdmolfiles.SmilesMolSupplier`

In [2]:
## Read the file
supplier = Chem.SDMolSupplier('data/cas_4337.sdf')

In [6]:
mol = supplier[0]
mol
list(mol.GetPropNames())
mol.GetProp("Ames test categorisation")

'mutagen'

In [4]:
## for each molecule, get the number of atoms in it
for mol in supplier:
    print(mol.GetNumAtoms())

25
13
20
10
10
14
18
24
10
24
18
26
18
13
9
17
13
23
6
10
11
13
14
12
21
9
13
13
16
28
21
30
10
10
17
19
14
17
15
11
17
39
18
16
22
17
10
14
5
21
22
4
29
14
20
8
15
18
13
23
9
12
15
9
12
13
8
22
26
13
8
14
19
6
14
14
20
9
12
17
9
21
18
11
18
9
24
51
16
19
13
18
3
21
19
21
12
40
27
18
16
15
8
27
7
26
13
20
12
7
10
9
21
21
13
11
9
22
4
8
5
21
9
20
15
13
37
7
25
14
15
22
16
6
22
10
17
21
21
7
8
15
6
14
25
21
16
10
17
13
22
16
19
29
15
25
22
5
37
33
10
17
102
21
21
18
18
20
11
21
14
11
9
9
23
21
25
8
14
5
14
25
5
12
18
39
18
10
24
13
11
19
13
22
23
19
19
17
20
5
20
19
14
19
8
9
11
18
16
21
8
36
23
14
16
4
12
6
15
20
11
10
20
25
15
6
17
20
18
13
18
16
8
22
13
10
7
25
23
9
16
14
13
9
5
5
18
12
17
20
18
10
12
21
34
3
9
18
7
35
25
15
10
9
16
22
38
7
11
9
19
24
5
12
8
8
20
22
6
11
13
23
12
23
18
20
11
14
20
6
20
40
11
43
16
52
18
13
23
11
13
12
11
18
7
8
10
19
11
8
11
26
9
11
11
16
21
18
15
13
22
18
13
13
19
9
14
14
19
15
5
10
14
10
10
10
16
12
15
16
13
19
9
19
17
15
10
20
16
21
8
12
14
14
22
1

19
20
12
19
5
21
13
20
9
6
28
17
8
22
12
16
8
4
18
12
10
56
19
22
17
19
30
8
18
7
8
12
20
28
62
14
8
13
15
18
10
16
20
11
20
50
9
20
24
14
19
14
11
18
8
4
18
9
25
18
11
37
5
12
17
9
12
14
14
17
13
14
17
18
39
5
12
15
14
6
26
10
8
10
18
13
10
11
11
22
16
32
20
7
15
11
24
6
21
11
10
18
14
19
12
10
13
8
21
10
13
18
17
20
16
8
9
6
21
11
11
14
10
19
6
13
9
12
10
20
23
11
12
15
10
12
19
10
35
22
10
22
14
14
19
27
12
10
9
11
13
9
12
11
12
7
18
22
9
18
19
14
18
8
23
15
6
6
8
17
50
20
11
5
18
7
23
7
22
16
21
17
68
16
17
13
18
17
16
20
7
24
18
14
36
6
8
12
2
12
8
10
15
15
9
5
7
18
16
9
7
12
17
10
19
16
12
24
13
22
15
12
19
7
15
9
14
22
21
11
11
17
25
18
10
15
5
8
17
11
11
6
15
18
5
11
9
36
7
20
23
14
12
11
24
23
11
13
17
18
17
19
19
14
18
12
16
9
15
11
7
17
20
6
12
16
23
17
16
9
14
12
12
8
11
21
15
15
19
15
20
11
12
20
33
28
6
19
19
9
13
6
13
9
108
13
2
4
24
17
17
8
17
24
16
19
17
13
12
19
10
20
22
12
14
7
18
14
24
32
27
7
11
17
10
15
20
49
26
4
30
17
19
12
9
17
12
19
9
10
16
41
20
19
19
15
18
2

We have 4337 molecules in the data set:

In [8]:
len(supplier)

4337

A good practice is to test each molecule to see if it was correctly read before working with it:

In [10]:
for mol in supplier:
    if mol is None:
        print("a None molecule was found!")
        break
    print(mol.GetNumAtoms())

25
13
20
10
10
14
18
24
10
24
18
26
18
13
9
17
13
23
6
10
11
13
14
12
21
9
13
13
16
28
21
30
10
10
17
19
14
17
15
11
17
39
18
16
22
17
10
14
5
21
22
4
29
14
20
8
15
18
13
23
9
12
15
9
12
13
8
22
26
13
8
14
19
6
14
14
20
9
12
17
9
21
18
11
18
9
24
51
16
19
13
18
3
21
19
21
12
40
27
18
16
15
8
27
7
26
13
20
12
7
10
9
21
21
13
11
9
22
4
8
5
21
9
20
15
13
37
7
25
14
15
22
16
6
22
10
17
21
21
7
8
15
6
14
25
21
16
10
17
13
22
16
19
29
15
25
22
5
37
33
10
17
102
21
21
18
18
20
11
21
14
11
9
9
23
21
25
8
14
5
14
25
5
12
18
39
18
10
24
13
11
19
13
22
23
19
19
17
20
5
20
19
14
19
8
9
11
18
16
21
8
36
23
14
16
4
12
6
15
20
11
10
20
25
15
6
17
20
18
13
18
16
8
22
13
10
7
25
23
9
16
14
13
9
5
5
18
12
17
20
18
10
12
21
34
3
9
18
7
35
25
15
10
9
16
22
38
7
11
9
19
24
5
12
8
8
20
22
6
11
13
23
12
23
18
20
11
14
20
6
20
40
11
43
16
52
18
13
23
11
13
12
11
18
7
8
10
19
11
8
11
26
9
11
11
16
21
18
15
13
22
18
13
13
19
9
14
14
19
15
5
10
14
10
10
10
16
12
15
16
13
19
9
19
17
15
10
20
16
21
8
12
14
14
22
1

12
9
17
12
19
9
10
16
41
20
19
19
15
18
21
5
24
8
13
21
21
11
9
19
11
18
11
30
20
12
6
26
37
15
16
22
4
18
5
26
17
28
17
14
22
8
30
18
10
17
24
15
26
20
15
16
18
11
12
13
15
18
17
14
8
61
28
17
14
11
19
16
6
21
11
10
9
7
20
13
22
19
6
17
19
13
15
19
23
7
12
15
20
12
24
9
29
15
22
6
11
12
10
25
18
18
12
24
19
7
17
30
16
16
17
21
21
14
23
13
9
14
18
24
18
11
14
7
28
14
12
23
8
14
13
6
16
8
11
24
14
17
13
20
20
20
12
19
27
22
19
20
23
19
24
14
5
21
4
7
12
6
13
16
11
19
20
21
17
10
17
9
16
25
19
10
7
12
12
17
14
11
22
19
24
13
12
46
26
19
12
31
15
18
18
7
17
28
9
17
17
32
11
16
14
16
32
15
13
11
26
14
19
21
28
7
11
5
15
18
16
13
36
8
16
21
10
12
19
9
22
19
8
33
7
11
12
21
13
21
10
11
8
14
10
20
10
19
17
12
20
28
11
19
10
16
6
24
7
14
25
13
17
22
14
6
22
17
12
5
10
5
3
10
11
13
11
10
15
20
31
4
21
14
4
27
18
10
19
11
8
25
11
4
11
5
13
7
15
23
18
18
10
24
22
16
19
25
25
12
47
10
18
20
18
9
14
18
12
11
13
27
11
8
23
17
9
15
18
9
22
17
14
9
5
13
18
11
9
8
27
13
18
15
12
5
20
4
15
24
14
22
11
2

---
*The following code can be ignored, cause this was generated for my own interest and testing.*

---
BEGIN:

In [11]:
m = Chem.MolFromSmiles('C1OC1')
for atom in m.GetAtoms():
    print(atom.GetAtomicNum())

print(m.GetBonds()[1].GetBondType())

6
8
6
SINGLE


END;

----

### Calculating Morgan fingerprints

In [11]:
import numpy as np

In [5]:
from rdkit.Chem import AllChem

**\_\_TASK\_\_:** for each molecule calculate MorganFingerprints (with radius <b>3</b>) and size **~2048** (rdkit has also a nice easy function for that)

In [None]:
info = {}
# fingerprints = [list(AllChem.GetMorganFingerprint(mol, 3, bitInfo=info)) for mol in supplier]

fingerprints_asBitVector = [AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=2048, bitInfo=info) for mol in supplier]
# fingerprints=[list(fp) for fp in fingerprints_asBitVector]



In [18]:
fingerprints = np.array(fingerprints)
fingerprints.shape
fingerprints[0]

<rdkit.DataStructs.cDataStructs.UIntSparseIntVect at 0x1112ca350>

#### Atoms contributing to the activation of Morgan fingerprints

**\_\_TASK\_\_:** Important! when you calculate the Fingerprints, save which atoms where responsible for the activation of the fingerprint (rdkit can also do that)

Information is available about the atoms that contribute to particular bits in the Morgan fingerprint via the bitInfo argument. The dictionary provided is populated with one entry per bit set in the fingerprint, the keys are the bit ids, the values are lists of (atom index, radius) tuples.

So we would need to modify the previous line of code to the following:

In [18]:
info = {}
# fingerprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=2048, bitInfo=info) for m in supplier]
fingerprints = [AllChem.GetMorganFingerprint(mol, 3, bitInfo=info) for mol in]


In [22]:
fp = fingerprints[0]
fp

<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x10d674120>