In [39]:
import numpy as np
import json

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch import optim

from gnn_data import GNN_DATA
from pipr_models import PIPRModel

# 走通流程

In [6]:

ppi_data = GNN_DATA(ppi_path='./protein_info/protein.actions.SHS27k.STRING.pro2.txt')

54813it [00:00, 398251.08it/s]
100%|██████████| 6660/6660 [00:00<00:00, 1109595.42it/s]
100%|██████████| 6660/6660 [00:00<00:00, 1332668.51it/s]
100%|██████████| 6660/6660 [00:00<00:00, 1666113.84it/s]


In [8]:
pseq_path = "./protein_info/protein.SHS27k.sequences.dictionary.pro3.tsv"
vec_path = "./protein_info/vec5_CTC.txt"
ppi_data.get_feature_origin(pseq_path=pseq_path,
                                vec_path=vec_path)

1553it [00:00, 221888.34it/s]


protein num: 1553
protein average length: 528.5151320025757
protein max & min length: 2517, 51
acid vector dimension: 13


100%|██████████| 1553/1553 [00:00<00:00, 2878.00it/s]
100%|██████████| 1553/1553 [00:00<?, ?it/s]


In [77]:
ppi_label_list = ppi_data.ppi_label_list
ppi_label_list = ppi_label_list[:int(len(ppi_label_list) / 2)]  # ppi pair的label
ppi_list = ppi_data.ppi_list
ppi_list = ppi_list[:int(len(ppi_list) / 2)]  # ppi pair分别记录索引
name2idx = ppi_data.protein_name  # 蛋白名字-索引
idx2name = dict(zip(name2idx.values(), name2idx.keys()))  # 索引-蛋白质
protein_dict = ppi_data.protein_dict  # 蛋白质-序列向量

走通了！

In [104]:
for ppi_index, pair in enumerate(ppi_list):
    p1 = pair[0]
    p2 = pair[1]
    seq1 = protein_dict[idx2name[p1]]
    seq2 = protein_dict[idx2name[p2]]
    seq1 = torch.from_numpy(seq1).float().unsqueeze(0).transpose(1, 2)
    seq2 = torch.from_numpy(seq2).float().unsqueeze(0).transpose(1, 2)
    output = model(seq1, seq2)
    label = ppi_label_list[ppi_index]
    break


In [111]:
loss_fn = nn.BCEWithLogitsLoss()
label = torch.tensor([label])
output

tensor([[-0.1660,  0.1409,  0.1192, -0.0455,  0.1286, -0.0395, -0.0960]],
       grad_fn=<AddmmBackward0>)

In [113]:
loss_fn(output, label.float())

tensor(0.7070, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

# 写成Dataset方便loader读取

In [50]:
class SHS27K(Dataset):
    def __init__(self, ppi_path, pseq_path, vec_path, train_valid_index_path, TRAIN=True):
        super(SHS27K, self).__init__()
        self.TRAIN = TRAIN  # 是训练集还是验证集
        # 这里调用HIGH-PPI项目写好的数据接口
        ppi_data = GNN_DATA(ppi_path)
        ppi_data.get_feature_origin(pseq_path=pseq_path,
                                vec_path=vec_path)
        # 三个字典，方便通过索引获取蛋白质对应的array
        name2idx = ppi_data.protein_name  # 字典，蛋白名字-索引
        self.idx2name = dict(zip(name2idx.values(), name2idx.keys()))  # 反转键值，索引-蛋白
        self.protein_dict = ppi_data.protein_dict  # 蛋白-序列向量
        
        # ppi_list和ppi_label_list是同等长度的，相同索引是对应关系
        ppi_list = ppi_data.ppi_list
        ppi_list = ppi_list[:int(len(ppi_list) / 2)]  # e.g. [0, 1]
        ppi_label_list = ppi_data.ppi_label_list
        ppi_label_list = np.array(ppi_label_list[:int(len(ppi_label_list) / 2)])  # e.g. [0, 1]对应的multi-label
        
        # 读取HIGH-PPI的数据集划分文件
        f = open(train_valid_index_path, "r")
        train_valid_index = json.load(f)
        train_index = train_valid_index['train_index']
        valid_index = train_valid_index['valid_index']
        # 拿到划分后的训练集和验证集
        self.train_ppi_list = [ppi_list[index] for index in train_index]
        self.train_ppi_label_list = [ppi_label_list[index] for index in train_index]
        self.valid_ppi_list = [ppi_list[index] for index in valid_index]
        self.valid_ppi_label_list = [ppi_label_list[index] for index in valid_index]
        
        
    def __getitem__(self, ppi_index):
        label, p1, p2 = None, None, None
        if self.TRAIN:
            label = self.train_ppi_label_list[ppi_index]  # 标签
            p1, p2 = self.train_ppi_list[ppi_index]  # 取出ppi
        else:
            label = self.valid_ppi_label_list[ppi_index]  # 标签
            p1, p2 = self.valid_ppi_list[ppi_index]  # 取出ppi
        seq1 = self.protein_dict[self.idx2name[p1]]
        seq2 = self.protein_dict[self.idx2name[p2]]
        pro2seq = {"p1": seq1, "p2": seq2}
        
        return pro2seq, label
    
    def __len__(self):
        if self.TRAIN:
            return len(self.train_ppi_list)
        else:
            return len(self.valid_ppi_list)

In [51]:
ppi_path='./protein_info/protein.actions.SHS27k.STRING.pro2.txt'
pseq_path = "./protein_info/protein.SHS27k.sequences.dictionary.pro3.tsv"
vec_path = "./protein_info/vec5_CTC.txt"
train_valid_index_path = "./train_val_split_data/train_val_split_27.json"
train_dataset = SHS27K(ppi_path, pseq_path, vec_path, train_valid_index_path)
valid_dataset = SHS27K(ppi_path, pseq_path, vec_path, train_valid_index_path, TRAIN=False)

54813it [00:00, 285488.41it/s]
100%|██████████| 6660/6660 [00:00<00:00, 1109992.24it/s]
100%|██████████| 6660/6660 [00:00<00:00, 1332859.27it/s]
100%|██████████| 6660/6660 [00:00<00:00, 1110124.57it/s]
1553it [00:00, 221888.34it/s]


protein num: 1553
protein average length: 528.5151320025757
protein max & min length: 2517, 51
acid vector dimension: 13


100%|██████████| 1553/1553 [00:00<00:00, 3253.56it/s]
100%|██████████| 1553/1553 [00:00<00:00, 1572231.26it/s]
54813it [00:00, 398230.39it/s]
100%|██████████| 6660/6660 [00:00<00:00, 1110080.46it/s]
100%|██████████| 6660/6660 [00:00<00:00, 1665617.12it/s]
100%|██████████| 6660/6660 [00:00<00:00, 1332160.08it/s]
1553it [00:00, 221895.90it/s]


protein num: 1553
protein average length: 528.5151320025757
protein max & min length: 2517, 51
acid vector dimension: 13


100%|██████████| 1553/1553 [00:00<00:00, 2724.56it/s]
100%|██████████| 1553/1553 [00:00<00:00, 1550893.84it/s]


In [60]:
train_loader = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)
loss_fn = nn.BCEWithLogitsLoss()
model = PIPRModel(input_dim=13, hidden_dim=50, class_num=7).float()
optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True, eps=1e-5)
for pro2seq, label in train_loader:
    seq1 = pro2seq["p1"].transpose(1, 2).float()
    seq2 = pro2seq["p2"].transpose(1, 2).float()
    output = model(seq1, seq2)
    label = label.float()
    loss = loss_fn(output, label)
    print(loss)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


tensor(0.6829, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6790, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6858, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6958, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6929, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6812, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6911, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6878, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6660, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6406, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6977, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.8059, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6159, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6248, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6791, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6735, grad_fn=<BinaryCrossEntro

KeyboardInterrupt: 