**This tutorial covers some important steps of data preparation.**

In [1]:
import pandas as pd
import torch
import rdkit
from rdkit import Chem

### Molecule Fragmentation

The first step of our data preparation is to cut each molecule into fragments. Here, we use the Murcko fragmentation method.

In [2]:
import DataGen
from DataGen import fragMol
from DataGen.fragMol import fragMol
from DataGen.fragMol import get_number

#### Generate fragments

In [3]:
# Fragment every molecule listed in the input CSV.
# The variables are deliberately NOT called `dir` / `id`: those names would
# overwrite the Python builtins for the rest of the kernel session.
frag_dir = "../test/fragmentation/"  # directory of the input file
infile = "test_fragments.csv"  # input file name
column_name = "SMILES"  # column name for SMILES
id_column = "idx"  # column name for molecule index
# NOTE(review): the CSV files read in the following cells live in the same
# directory and are presumably written by this call -- confirm in DataGen.fragMol.
fragMol(frag_dir, infile, column_name, id_column, outfile=None, cal_occurrence=True)

Finish 0


In [4]:
# Load the per-molecule table; besides the input columns it carries
# `core_list` and `side_list` (see the displayed output).
mol_addfrag_file = pd.read_csv(
    "../test/fragmentation/test_fragments_addfrags.csv"
)
mol_addfrag_file

Unnamed: 0,idx,SMILES,core_list,side_list
0,1,c1ccccc1C(=O)OCC,c1ccccc1,['CCOC=O']
1,2,C1N2C3CC4CC2C13C4,C1C2CC3N4CC3(C2)C14,['']
2,3,CN1CC1(C)C(F)(F)F,C1CN1,"['C', 'C', 'FC(F)F']"


In [5]:
# Load the fragment table (columns: mol_id, frags_id, frags_SMILES).
frag_file = pd.read_csv(
    "../test/fragmentation/fragments.csv"
)
frag_file

Unnamed: 0,mol_id,frags_id,frags_SMILES
0,1,0,CCOC=O
1,1,1,c1ccccc1
2,2,2,C1C2CC3N4CC3(C2)C14
3,3,3,C1CN1
4,3,4,C
5,3,5,FC(F)F


`mol_id` is the id of the molecule in which the corresponding fragment first appears.

#### Generate frequency for fragments file

In [6]:
infile_frag = "../test/fragmentation/test_fragments_addfrags.csv"
absolute_number, molecule_number = get_number(
    infile_frag,
    core_column="core_list",
    side_column="side_list",
)
# First line: how many times each fragment appears in the data.
# Second line: how many molecules contain each fragment.
print(absolute_number, molecule_number, sep="\n")

Finish 0
{'c1ccccc1': 1, 'CCOC=O': 1, 'C1C2CC3N4CC3(C2)C14': 1, 'C1CN1': 1, 'C': 2, 'FC(F)F': 1}
{'CCOC=O': 1, 'c1ccccc1': 1, 'C1C2CC3N4CC3(C2)C14': 1, 'C1CN1': 1, 'C': 1, 'FC(F)F': 1}


### Generate EFGs

The EFG library has been generated using our in-house package.

In [7]:
from DataGen import genEFGs
from DataGen.genEFGs import get_EFGs_dic
from DataGen.genEFGs import get_EFGs

#### Get EFGs dic for molecules

In [8]:
# Print the signature and docstring of get_EFGs_dic.
help(get_EFGs_dic)

Help on function get_EFGs_dic in module DataGen.genEFGs:

get_EFGs_dic(dataset, intype, isomeric=True)
    Get EFGs_dic, which uses EFGs as keys, and list of index of mols that have this EFG as value



In [9]:
# Collect the SMILES column of the test set as a plain Python list.
dataset = pd.read_csv("../test/fragmentation/test_fragments.csv")["SMILES"].tolist()
intype = "smiles"
# Isomeric handling is only meant for 3D input, so it is switched off here.
isomeric = False
EFGs_dic = get_EFGs_dic(dataset=dataset, intype=intype, isomeric=isomeric)

0


In [10]:
# Each key is an EFG; each value is the list of indices of the molecules that contain that EFG.
EFGs_dic

{'O=CO': [0],
 'c1ccccc1': [0],
 'CC': [0],
 'C040': [1, 2],
 'N': [1],
 'C030': [1],
 'C020': [1],
 'C1CN1': [2],
 'F': [2],
 'C010': [2]}

In [11]:
# Turn the EFG -> molecule-index lists into EFG -> number of molecules.
EFGs_dic_frequency = {
    efg: len(mol_indices)
    for efg, mol_indices in EFGs_dic.items()
}
EFGs_dic_frequency

{'O=CO': 1,
 'c1ccccc1': 1,
 'CC': 1,
 'C040': 2,
 'N': 1,
 'C030': 1,
 'C020': 1,
 'C1CN1': 1,
 'F': 1,
 'C010': 1}

In [12]:
# Persist the EFG frequency dict as the EFG library file.
# This runs before the cutoff step below, so the file holds the full, un-pruned library.
torch.save(EFGs_dic_frequency, "../test/fragmentation/EFG_lib.pt")

In [13]:
### Use a cutoff to keep only the top n% of EFGs. Large EFGs can be cut into smaller EFGs until convergence.
### NOTE: cleavage changes EFGs_dic_frequency in place (compare the output below with the dict shown earlier).
import EFGs
from EFGs import cleavage
cutoff = 0.9
cleavage(EFGs_dic_frequency, alpha=cutoff, isomericSmiles=False)
EFGs_dic_frequency

{'C040': 2}

#### Get molecule EFGs based on generated EFG lib

In [14]:
# Parse the first SMILES string of the dataset into an RDKit molecule.
mol = Chem.MolFromSmiles(dataset[0])

In [15]:
import EFGs
from EFGs import mol2frag
# The saved EFG library is a dict; its keys form the EFG vocabulary.
vocab = list(torch.load("../test/fragmentation/EFG_lib.pt").keys())
# Split the molecule into EFGs according to the given vocabulary.
a, b = mol2frag(
    mol,
    vocabulary=vocab,
    toEnd=True,
    extra_included=True,
)
a, b

(['O=CO', 'c1ccccc1'], ['CC'])

### Molecule Selection

Select molecules using different selection methods.

In [16]:
from DataGen import selectMol
from DataGen.selectMol import FreqEFGSample

#### Selecting molecules based on frequency and EFG lib

In [17]:
# Toy input: EFG -> list of molecule indices (same layout as EFGs_dic above).
EFG_dic = {
    "C": [0, 2, 3],
    "N": [1, 2],
    "F": [4, 5, 6],
}
# presumably one frequency value per molecule index (0..6) -- TODO confirm
Freq_list = [20, 1, 15, 2, 5, 10, 6]
# Sample molecules driven by a cutoff and a number of iterations.
freq_sampled = FreqEFGSample(
    input_dict=EFG_dic,
    frequency_list=Freq_list,
    cutoff=1,
    iterations=1,
)

In [18]:
# Show the indices of the sampled molecules (the recorded output also contains progress lines).
freq_sampled.sampled_index

iters:1
number of sampled index:0


[0, 5, 2]

In [19]:
# Same toy input as before, rebuilt so this cell does not depend on earlier state.
EFG_dic = {
    "C": [0, 2, 3],
    "N": [1, 2],
    "F": [4, 5, 6],
}
# presumably one frequency value per molecule index (0..6) -- TODO confirm
Freq_list = [20, 1, 15, 2, 5, 10, 6]
# Sample molecules by requesting a given number of them.
freq_sampled = FreqEFGSample(
    input_dict=EFG_dic,
    frequency_list=Freq_list,
    cutoff=1,
    numbers=3,
)

In [20]:
# Show the indices of the sampled molecules (the recorded output also contains progress lines).
freq_sampled.sampled_index

iters:1
number of sampled index:0


[0, 5, 2]