In [53]:
# Imports all the required packages
from packages import *

In [54]:
class SMILESCONVERT(InMemoryDataset):
    """In-memory graph dataset built from a CSV of SMILES strings and targets.

    The raw file ``raw_data.csv`` is expected to have one header line followed
    by rows whose first comma-separated field is a SMILES string and whose
    second field is a float target. Each molecule is written to an SDF file,
    read back through RDKit and converted into a
    :obj:`torch_geometric.data.Data` object holding one-hot/flag atom features
    (``x``), bidirectional bond features (``edge_index``/``edge_attr``), the
    target (``y``) and the zero-padded character codes of its SMILES
    (``mol_id``).

    Args:
        root (string): Root directory where the dataset should be saved.

        transform (callable, optional): A function/transform that takes in an
            :obj:`torch_geometric.data.Data` object and returns a transformed
            version. The data object will be transformed before every access.
            (default: :obj:`None`)

        pre_transform (callable, optional): A function/transform that takes in
            an :obj:`torch_geometric.data.Data` object and returns a
            transformed version. The data object will be transformed before
            being saved to disk. (default: :obj:`None`)

        pre_filter (callable, optional): A function that takes in an
            :obj:`torch_geometric.data.Data` object and returns a boolean
            value, indicating whether the data object should be included in the
            final dataset. (default: :obj:`None`)
    """

    # Lookup tables used for one-hot encoding. An atom whose symbol is not
    # listed in `types` makes `process` fail with a KeyError, so every element
    # present in the raw data has to be added here.
    types = {'H': 0, 'C': 1, 'O': 2, 'F': 3}  # atom types
    bonds = {BT.SINGLE: 0, BT.DOUBLE: 1, BT.TRIPLE: 2, BT.AROMATIC: 3}  # bond types

    # Every SMILES is stored as a fixed-length vector of character codes so
    # that all `mol_id` tensors share one shape. Increase this if `process`
    # reports a SMILES that is too long.
    smiles_pad_len = 300

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        super().__init__(root, transform, pre_transform, pre_filter)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return 'raw_data.csv'  # Returns the raw file

    @property
    def processed_file_names(self):
        return 'processed_final_data.pt'

    def _read_raw_csv(self):
        """Return the ``(smiles, target)`` pairs parsed from the raw CSV file.

        Raises:
            ValueError: If a non-blank row has fewer than two fields or its
                second field is not a valid float.
        """
        with open(self.raw_paths[0], 'r') as f:
            # splitlines() keeps the final row even when the file does not end
            # with a newline; the first line is the header.
            lines = f.read().splitlines()[1:]

        rows = []
        for line_no, line in enumerate(lines, start=2):
            if not line.strip():
                continue
            fields = line.split(",")
            if len(fields) < 2:
                raise ValueError(
                    'Line %d of %s has no target column: %r'
                    % (line_no, self.raw_paths[0], line))
            rows.append((fields[0], float(fields[1])))
        return rows

    def _write_sdf(self, rows, sdf_path):
        """Write every parsable SMILES to ``sdf_path``; return the kept targets.

        Rows whose SMILES RDKit cannot parse are reported and dropped together
        with their target so molecules and targets stay index-aligned.
        """
        kept_targets = []
        writer = Chem.SDWriter(sdf_path)
        try:
            for smiles, value in rows:
                mol = Chem.rdmolfiles.MolFromSmiles(smiles)
                if mol is None:
                    print('Invalid SMILES skipped: ' + smiles)
                    continue
                writer.write(mol)
                kept_targets.append([value])
        finally:
            # Closing flushes the file, so no artificial delay is needed
            # before it is read back.
            writer.close()
        return kept_targets

    def _atom_features(self, mol):
        """Build the per-atom feature matrix ``x`` of ``mol``.

        Columns, in order: one-hot atom type, aromatic flag, ring flag, five
        hybridization flags (SP, SP2, SP3, SP3D, SP3D2), one-hot neighbor
        count (6 classes) and one-hot hydrogen count (5 classes).
        """
        type_idx = []
        aromatic = []
        ring = []
        sp = []
        sp2 = []
        sp3 = []
        sp3d = []
        sp3d2 = []
        num_hs = []
        num_neighbors = []
        for atom in mol.GetAtoms():
            type_idx.append(self.types[atom.GetSymbol()])
            aromatic.append(1 if atom.GetIsAromatic() else 0)
            ring.append(1 if atom.IsInRing() else 0)
            hybridization = atom.GetHybridization()
            sp.append(1 if hybridization == HybridizationType.SP else 0)
            sp2.append(1 if hybridization == HybridizationType.SP2 else 0)
            sp3.append(1 if hybridization == HybridizationType.SP3 else 0)
            sp3d.append(1 if hybridization == HybridizationType.SP3D else 0)
            sp3d2.append(1 if hybridization == HybridizationType.SP3D2 else 0)
            num_hs.append(atom.GetTotalNumHs(includeNeighbors=True))
            num_neighbors.append(len(atom.GetNeighbors()))

        x1 = F.one_hot(torch.tensor(type_idx, dtype=torch.long),
                       num_classes=len(self.types))
        x2 = torch.tensor([aromatic, ring, sp, sp2, sp3, sp3d, sp3d2],
                          dtype=torch.float).t().contiguous()
        x3 = F.one_hot(torch.tensor(num_neighbors, dtype=torch.long), num_classes=6)
        x4 = F.one_hot(torch.tensor(num_hs, dtype=torch.long), num_classes=5)
        return torch.cat(
            [x1.to(torch.float), x2, x3.to(torch.float), x4.to(torch.float)],
            dim=-1)

    def _bond_features(self, mol, num_atoms):
        """Build the coalesced ``(edge_index, edge_attr)`` pair of ``mol``.

        Each bond is stored in both directions. Attribute columns, in order:
        one-hot bond type, conjugation flag, ring flag and one-hot stereo
        descriptor (6 classes).
        """
        row, col, bond_idx, conj, in_ring, stereo = [], [], [], [], [], []
        for bond in mol.GetBonds():
            start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            row += [start, end]
            col += [end, start]
            # Every per-bond value is duplicated: one entry per direction.
            bond_idx += 2 * [self.bonds[bond.GetBondType()]]
            conj += 2 * [bond.GetIsConjugated()]
            in_ring += 2 * [bond.IsInRing()]
            stereo += 2 * [int(bond.GetStereo())]

        edge_index = torch.tensor([row, col], dtype=torch.long)
        e1 = F.one_hot(torch.tensor(bond_idx, dtype=torch.long),
                       num_classes=len(self.bonds)).to(torch.float)
        e2 = torch.tensor([conj, in_ring], dtype=torch.float).t().contiguous()
        e3 = F.one_hot(torch.tensor(stereo, dtype=torch.long),
                       num_classes=6).to(torch.float)
        edge_attr = torch.cat([e1, e2, e3], dim=-1)
        return coalesce(edge_index, edge_attr, num_atoms, num_atoms)

    def _encode_smiles(self, name):
        """Return ``name`` as a ``(1, smiles_pad_len)`` float tensor of character codes.

        Raises:
            ValueError: If ``name`` is longer than ``smiles_pad_len``.
        """
        codes = [ord(c) for c in name]
        if len(codes) > self.smiles_pad_len:
            raise ValueError(
                'SMILES %r has %d characters; increase smiles_pad_len (%d)'
                % (name, len(codes), self.smiles_pad_len))
        codes += [0] * (self.smiles_pad_len - len(codes))
        return torch.tensor([codes], dtype=torch.float).contiguous()

    def process(self):
        """Convert the raw CSV into the collated dataset stored on disk.

        Raises:
            ValueError: If the number of molecules read back from the SDF file
                differs from the number of targets, or if the CSV is malformed.
        """
        rows = self._read_raw_csv()

        # The molecules take a round trip through an SDF file in the raw folder.
        sdf_path = osp.join(self.raw_dir, 'raw_data.sdf')
        # Stores the target variable as a torch tensor, one row per molecule.
        target = torch.tensor(self._write_sdf(rows, sdf_path), dtype=torch.float)

        # removeHs=False keeps any explicit hydrogens stored in the file.
        suppl = Chem.SDMolSupplier(sdf_path, removeHs=False)

        # Targets are looked up by molecule index below, so the counts must agree.
        if len(target) != len(suppl):
            raise ValueError('The Target shape and Input shape donot match')

        data_list = []
        for idx, mol in enumerate(suppl):
            if mol is None:
                print('Invalid molecule (None)')
                continue

            num_atoms = mol.GetNumAtoms()

            # Consider only those molecules with more than one atom. Need at
            # least one bond for a valid graph edge
            if num_atoms <= 1:
                print('Warning: Molecule as only 1 atom is present')
                continue

            x = self._atom_features(mol)
            edge_index, edge_attr = self._bond_features(mol, num_atoms)

            # Store target data in y with a leading batch dimension.
            y = target[idx].unsqueeze(0)

            # The SMILES is kept (as padded character codes) to identify the molecule.
            name = str(Chem.MolToSmiles(mol))
            ascii_name = self._encode_smiles(name)

            # print current molecule with target data
            print(str(name) + ': ' + str(y.item()))

            # The spatial position of atoms is not used as a feature.
            data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y,
                        mol_id=ascii_name)

            if self.pre_filter is not None and not self.pre_filter(data):
                continue
            if self.pre_transform is not None:
                data = self.pre_transform(data)
            data_list.append(data)

        torch.save(self.collate(data_list), self.processed_paths[0])


In [55]:
# Block test cell

# Stand-alone check of the CSV -> SDF conversion used by the dataset class.
molecules = []
with open('data/raw_data.csv', 'r') as f:
    # splitlines() keeps the last row even without a trailing newline;
    # the first line is the header and blank lines are ignored.
    data = [line for line in f.read().splitlines()[1:] if line.strip()]

# Since the raw data is in csv format, we use delimiter ','; column 0 is the SMILES.
molecules = [[str(x) for x in line.split(",")[0:1]] for line in data]

writer = Chem.SDWriter('data/raw_data.sdf')
try:
    for m in molecules:
        mol = Chem.rdmolfiles.MolFromSmiles(m[0])
        writer.write(mol)
finally:
    # Close explicitly so the SDF file is flushed even if a write fails.
    writer.close()
del writer

# Column 1 holds the target value; each row becomes a one-element list.
target = [[float(x) for x in line.split(",")[1:2]] for line in data]
target = torch.tensor(target, dtype=torch.float)

'''
Checking done. Implement this code and changes
'''

'\nChecking done. Implement this code and changes\n'

In [57]:
from k_gnn import transform
class MyFilter:
    """Pre-filter callable that accepts only graphs with at least two nodes."""

    def __call__(self, data):
        # Graphs with fewer than 2 nodes are rejected.
        keep = data.num_nodes > 1
        return keep
class MyPreTransform(object):
    """Pre-transform that applies ``TwoLocal`` using only the atom-type columns.

    The full node feature matrix is put back on the returned object, so the
    reduced features are visible to ``TwoLocal`` only.
    """

    def __call__(self, data):
        full_x = data.x
        # Only the one-hot atom type is handed to TwoLocal. It occupies the
        # first len(SMILESCONVERT.types) columns of x, so the slice follows
        # the type table instead of a hard-coded width that would cut off
        # the last atom type.
        data.x = full_x[:, :len(SMILESCONVERT.types)]
        data = TwoLocal()(data)   # create higher-dimensional graph (2)
        data.x = full_x
        return data

# Instantiate the dataset rooted at 'data/' with the pre-filter and
# pre-transform defined in this cell.
dataset = SMILESCONVERT(
    'data/',
    pre_transform=MyPreTransform(),
    pre_filter=MyFilter(),
)


ModuleNotFoundError: No module named 'graph_cpu'