In [2]:
import mindspore as ms
import mindspore.nn as nn
import mindspore.ops as ops


In [3]:
# Select the data

import pickle

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load an index file that comes from a trusted source.
with open('/home/user02/code/数据/crossdocked_pocket10/index.pkl', 'rb') as f:
    index = pickle.load(f)

# Reverse in place so the entries are visited from the last one to the first.
index.reverse()

# Keep only the entries whose first field contains the target pocket name.
# Entries whose first field is None are skipped.
index1 = [
    entry
    for entry in index
    if entry[0] is not None and '1B57_HUMAN_25_300_0' in entry[0]
]
print(len(index1))

# Load the configuration
# NOTE(review): the path below is absolute and machine-specific — consider making it configurable.
from utils.misc import load_config

config = load_config('/home/user02/CProMG_ms/configs/CPrMG-VQSLT.yml')

18


In [4]:
# Define the preprocessing (featurizer) classes

class FeaturizeProteinAtom(object):
    """Build a per-atom feature matrix and store it on the record.

    Each atom gets one indicator column per entry of ``atomic_numbers``
    (all zeros when the atom's element is not listed) followed by one
    column holding its backbone flag.
    """

    def __init__(self):
        super().__init__()
        # Atomic numbers the elements are compared against: H, C, N, O, S, Se
        self.atomic_numbers = ms.Tensor([1, 6, 7, 8, 16, 34], ms.int64)
        self.max_num_aa = 20

    @property
    def feature_dim(self):
        """Width of the feature matrix: one column per listed element plus the backbone column."""
        return 1 + self.atomic_numbers.size

    def __call__(self, data):
        """Compute ``data.protein_atom_feature`` and return the same ``data`` object.

        Reads ``data.protein_element`` and ``data.protein_is_backbone``.
        """
        atom_column = data.protein_element.view(-1, 1)
        element_row = self.atomic_numbers.view(1, -1)
        # Broadcast comparison gives an (N_atoms, N_elements) indicator matrix.
        element_indicator = (atom_column == element_row).to(ms.int64)
        backbone_flag = data.protein_is_backbone.view(-1, 1).long()
        data.protein_atom_feature = ops.cat([element_indicator, backbone_flag], axis=-1)
        return data
    
class FeaturizeProteinResidue(object):
    """One-hot encode the amino-acid index of every residue and store it on the record."""

    def __init__(self):
        super().__init__()
        # Depth of the one-hot encoding
        self.max_num_aa = 20

    @property
    def feature_dim(self):
        """Width of the residue feature matrix (equal to ``max_num_aa``)."""
        return self.max_num_aa

    def __call__(self, data):
        """Compute ``data.residue_feature`` from ``data.residue_amino_acid`` and return ``data``."""
        residue_index = data.residue_amino_acid.to(ms.int64)
        data.residue_feature = ops.one_hot(residue_index, self.max_num_aa)
        return data

# Instantiate the two featurizers used when building the samples
protein_featurizer = FeaturizeProteinAtom()
residue_featurizer = FeaturizeProteinResidue()

In [5]:
# Read the data
import os
import pandas as pd
from utils.protein_ligand import PDBProtein, parse_sdf_file
from utils.data import add_prefix

class KeyDict(dict):
    """A ``dict`` whose keys can also be read as attributes (``d.key``)."""

    def __getattr__(self, key):
        """Return ``self[key]``; a missing key is reported as ``AttributeError``.

        Python only calls this hook after normal attribute lookup has failed,
        so real instance attributes and dict methods take precedence.
        """
        try:
            value = self[key]
        except KeyError:
            raise AttributeError(key)
        else:
            return value
        
def pad_row(arr, max_len):
    """Zero-pad a 2-D array along its first axis to exactly ``max_len`` rows.

    Args:
        arr: data convertible with ``ms.Tensor`` that has at least two
            dimensions; it is indexed as (rows, columns).
        max_len: number of rows of the padded result.

    Returns:
        A tuple ``(padded_arr, mask)``. ``padded_arr`` has shape
        ``(max_len, n_cols)`` with the input rows first and zeros after them.
        ``mask`` has shape ``(max_len, 1)`` and is 1 on the rows that came
        from ``arr`` and 0 on the padding rows.

    Raises:
        ValueError: if ``arr`` has more than ``max_len`` rows. Without this
            check the slice assignment below fails with a shape-mismatch
            error that does not say which limit was exceeded.
    """
    arr = ms.Tensor(arr)
    n_rows, n_cols = arr.shape[0], arr.shape[1]
    if n_rows > max_len:
        raise ValueError(
            f"pad_row: input has {n_rows} rows, which exceeds max_len={max_len}"
        )

    padded_arr = ops.zeros((max_len, n_cols))
    mask = ops.zeros((max_len, 1))
    # Nothing to copy for an empty input; the result is then all padding.
    if n_rows:
        padded_arr[:n_rows, :n_cols] = arr
        mask[:n_rows, 0] = 1
    return padded_arr, mask


raw_path = './data/crossdocked'
df = pd.read_csv('./data/dock_scores.csv')
data_lis = []

# Every sample is padded to these fixed row counts so samples can be batched.
MAX_PROTEIN_ATOMS = 600
MAX_RESIDUES = 100

# Fields copied unchanged from the parsed record into each sample.
keys_to_add = ['ligand_smile', 'ligand_sas', 'ligand_logP', 'ligand_qed', 'ligand_tpsa', 'ligand_smiIndices_input', 'ligand_smiIndices_tgt', 'vina_score']

for pocket_fn, ligand_fn, _, rmsd_str in index1:
    # Parse the pocket (atom-level and residue-level views) and the ligand.
    pocket_path = os.path.join(raw_path, pocket_fn)
    pocket_dict = PDBProtein(pocket_path).to_dict_atom()
    residue_dict = PDBProtein(pocket_path).to_dict_residue()
    ligand_dict = parse_sdf_file(config, os.path.join(raw_path, ligand_fn))
    data = KeyDict(add_prefix(pocket_dict, residue_dict, ligand_dict))

    # Look up the score of this ligand: column '4' is matched against the
    # ligand file name and column '5' supplies the value.
    # NOTE(review): presumably '5' holds the docking (Vina) score — verify against the CSV.
    try:
        data['vina_score'] = float(df[df.loc[:, '4'] == ligand_fn].loc[:, '5'].item())
    except Exception:
        # Best effort: a missing or ambiguous match falls back to 0.0.
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        data['vina_score'] = 0.0

    data.num_nodes = data.protein_element.size

    # Both featurizers store their result on `data`.
    protein_featurizer(data)
    residue_featurizer(data)

    new_data = {}
    new_data['protein_atom_feature'], new_data['protein_atom_feature_mask'] = pad_row(data.protein_atom_feature, max_len=MAX_PROTEIN_ATOMS)
    new_data['residue_feature'], new_data['residue_feature_mask'] = pad_row(data.residue_feature, MAX_RESIDUES)
    for key in keys_to_add:
        new_data[key] = data[key]

    data_lis.append(new_data)
  

In [6]:
# Flatten every sample dict into a list of its values, keeping key insertion order.
# NOTE: this rebinds `data_lis`; re-running the cell on the already-flattened list fails.
data_lis = [list(sample.values()) for sample in data_lis]



In [7]:
# Field names of one sample, in the order they are inserted into each sample dict
# and later passed as column names to GeneratorDataset (displayed for reference only).
['protein_atom_feature', 'protein_atom_feature_mask', 'residue_feature', 'residue_feature_mask', 'ligand_smile', 'ligand_sas', 'ligand_logP', 'ligand_qed', 'ligand_tpsa', 'ligand_smiIndices_input', 'ligand_smiIndices_tgt', 'vina_score']

['protein_atom_feature',
 'protein_atom_feature_mask',
 'residue_feature',
 'residue_feature_mask',
 'ligand_smile',
 'ligand_sas',
 'ligand_logP',
 'ligand_qed',
 'ligand_tpsa',
 'ligand_smiIndices_input',
 'ligand_smiIndices_tgt',
 'vina_score']

In [8]:
class ProteinLigandDataset:
    """Random-access view over a list of prepared samples.

    Provides ``__len__`` and ``__getitem__`` so the object can be passed as
    the source of a ``GeneratorDataset``.
    """

    def __init__(self, data_lis):
        # Keep a reference to the caller's list; nothing is copied.
        self.data_lis = data_lis

    def __getitem__(self, index):
        """Return the sample stored at position ``index``."""
        sample = self.data_lis[index]
        return sample

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data_lis)

In [9]:
from mindspore.dataset import GeneratorDataset

dataset = ProteinLigandDataset(data_lis)

# Column names, in the same order as the values inside every sample list.
column_names = [
    'protein_atom_feature',
    'protein_atom_feature_mask',
    'residue_feature',
    'residue_feature_mask',
    'ligand_smile',
    'ligand_sas',
    'ligand_logP',
    'ligand_qed',
    'ligand_tpsa',
    'ligand_smiIndices_input',
    'ligand_smiIndices_tgt',
    'vina_score',
]

# Keep the samples in their original order and group them two per batch.
ds = GeneratorDataset(dataset, column_names, shuffle=False).batch(2)



In [None]:
# Inspect the pipeline: print every batch it produces.
for batch in ds.create_tuple_iterator():
    print(batch)

In [12]:
# Define an endlessly cycling iterator
def inf_iterator(iterable):
    """Yield the items of ``iterable`` over and over, restarting after each pass.

    A fresh iterator is requested from ``iterable`` at the start of every
    pass, so a re-iterable source (list, dataset, ...) is cycled forever.

    If a pass produces no item at all — the source is empty, or it is a
    one-shot iterator that is already exhausted — the generator stops
    instead of spinning forever without yielding. This matches how
    ``itertools.cycle`` behaves on an empty input.

    Args:
        iterable: any object that supports iteration.

    Yields:
        The items of ``iterable``, pass after pass.
    """
    while True:
        produced_any = False
        for item in iterable:
            produced_any = True
            yield item
        if not produced_any:
            # Restarting would busy-loop forever, so end the generator.
            return

In [None]:
from mindspore.nn import CrossEntropyLoss
from mindspore.experimental.optim import Adam
from mindspore.experimental.optim.lr_scheduler import ReduceLROnPlateau
# Optimizer hyper-parameters come from the `train.optimizer` section of the config.
optimizer_cfg = config.train.optimizer
optimizer = Adam(
    model.parameters(),
    lr=optimizer_cfg.lr,
    weight_decay=optimizer_cfg.weight_decay,
    betas=(optimizer_cfg.beta1, optimizer_cfg.beta2),
)

# Scheduler settings come from the `train.scheduler` section of the config.
scheduler_cfg = config.train.scheduler
scheduler = ReduceLROnPlateau(
    optimizer,
    factor=scheduler_cfg.factor,
    patience=scheduler_cfg.patience,
    min_lr=scheduler_cfg.min_lr,
)

criterion = CrossEntropyLoss()