In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import os
import math
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torchvision.transforms as transforms
from torch.optim.lr_scheduler import StepLR, CosineAnnealingWarmRestarts, CyclicLR,_LRScheduler
from performance_eval import whole_eval_package

In [2]:
# Create one output directory per cross-validation fold.
# os.makedirs also creates the parent './tb_log' when it is missing (os.mkdir
# would raise FileNotFoundError), and exist_ok=True makes the cell safe to re-run.
for i in range(5):
    tb_log_dir = './tb_log/Transformer_CrossValid_cross{}/'.format(i)
    os.makedirs(tb_log_dir, exist_ok=True)

In [42]:

class CosineAnnealingWarmupRestarts(_LRScheduler):
    """Learning-rate schedule: linear warmup, cosine decay, then restart.

    Within a cycle the learning rate rises linearly from ``min_lr`` to the
    cycle's ``max_lr`` during the first ``warmup_steps`` steps, then follows
    half a cosine back down to ``min_lr`` over the rest of the cycle.
    When a cycle ends the schedule restarts; the peak of cycle ``k`` is
    ``max_lr * gamma ** k``.

    Args:
        optimizer: Optimizer whose ``param_groups`` get their ``'lr'`` rewritten.
        first_cycle_steps: Length of the first cycle, in ``step()`` calls.
        cycle_mult: Growth factor applied to the cycle length at each restart.
        max_lr: Peak learning rate of the first cycle.
        min_lr: Learning rate at the start and end of every cycle; it is
            written into every param group, replacing the optimizer's own lr.
        warmup_steps: Linear warmup steps per cycle; must be smaller than
            ``first_cycle_steps``.
        gamma: Per-cycle multiplicative decay of the peak learning rate.
        last_epoch: Index of the last epoch; also seeds ``step_in_cycle``.
    """
    
    def __init__(self,
                 optimizer : torch.optim.Optimizer,
                 first_cycle_steps : int,
                 cycle_mult : float = 1.,
                 max_lr : float = 0.1,
                 min_lr : float = 0.001,
                 warmup_steps : int = 0,
                 gamma : float = 1.,
                 last_epoch : int = -1
        ):
        # NOTE(review): `assert` is stripped under `python -O`, so this
        # validation silently disappears in optimized mode.
        assert warmup_steps < first_cycle_steps
        
        self.first_cycle_steps = first_cycle_steps # length of the first cycle
        self.cycle_mult = cycle_mult # cycle-length multiplier applied at each restart
        self.base_max_lr = max_lr # peak learning rate of the first cycle
        self.max_lr = max_lr # peak learning rate of the current cycle
        self.min_lr = min_lr # floor learning rate
        self.warmup_steps = warmup_steps # number of linear warmup steps per cycle
        self.gamma = gamma # per-cycle decay factor of the peak learning rate
        
        self.cur_cycle_steps = first_cycle_steps # length of the current cycle
        self.cycle = 0 # index of the current cycle
        self.step_in_cycle = last_epoch # position inside the current cycle
        
        # The attributes above are assigned before the base constructor runs.
        # NOTE(review): presumably because the base class calls self.step()
        # during construction in some PyTorch versions — confirm.
        super(CosineAnnealingWarmupRestarts, self).__init__(optimizer, last_epoch)
        
        # Overwrite base_lrs and every param group's lr with min_lr.
        self.init_lr()

    def init_lr(self):
        """Set every param group's lr and every entry of ``base_lrs`` to ``min_lr``."""
        self.base_lrs = []
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = self.min_lr
            self.base_lrs.append(self.min_lr)
    
    def get_lr(self):
        """Return one learning rate per entry of ``base_lrs`` for the current step.

        * ``step_in_cycle == -1``: ``base_lrs`` is returned unchanged.
        * during warmup: linear interpolation from ``base_lr`` to ``max_lr``.
        * afterwards: half-cosine from ``max_lr`` back to ``base_lr`` over the
          remaining ``cur_cycle_steps - warmup_steps`` steps.
        """
        if self.step_in_cycle == -1:
            return self.base_lrs
        elif self.step_in_cycle < self.warmup_steps:
            return [(self.max_lr - base_lr)*self.step_in_cycle / self.warmup_steps + base_lr for base_lr in self.base_lrs]
        else:
            # (1 + cos(pi * t)) / 2 goes from 1 to 0 as t goes from 0 to 1,
            # where t is the fraction of the post-warmup part already elapsed.
            return [base_lr + (self.max_lr - base_lr) \
                    * (1 + math.cos(math.pi * (self.step_in_cycle-self.warmup_steps) \
                                    / (self.cur_cycle_steps - self.warmup_steps))) / 2
                    for base_lr in self.base_lrs]

    def step(self, epoch=None):
        """Advance the schedule and write the new lr into the optimizer.

        Args:
            epoch: ``None`` advances by exactly one step. Otherwise the cycle
                index and the position inside the cycle are recomputed from
                this absolute epoch value.
        """
        if epoch is None:
            epoch = self.last_epoch + 1
            self.step_in_cycle = self.step_in_cycle + 1
            if self.step_in_cycle >= self.cur_cycle_steps:
                # Restart: wrap the position and grow only the post-warmup
                # part of the cycle by cycle_mult (warmup length stays fixed).
                self.cycle += 1
                self.step_in_cycle = self.step_in_cycle - self.cur_cycle_steps
                self.cur_cycle_steps = int((self.cur_cycle_steps - self.warmup_steps) * self.cycle_mult) + self.warmup_steps
        else:
            if epoch >= self.first_cycle_steps:
                if self.cycle_mult == 1.:
                    # Constant cycle length: plain modular arithmetic.
                    # cur_cycle_steps is left untouched in this branch.
                    self.step_in_cycle = epoch % self.first_cycle_steps
                    self.cycle = epoch // self.first_cycle_steps
                else:
                    # Invert the geometric series
                    # first_cycle_steps * (cycle_mult**n - 1) / (cycle_mult - 1)
                    # to find the cycle n that contains `epoch`.
                    # NOTE(review): here the whole cycle length is scaled by
                    # cycle_mult, whereas the epoch=None branch scales only
                    # (cur_cycle_steps - warmup_steps); the two can disagree
                    # when warmup_steps > 0 — confirm which is intended.
                    n = int(math.log((epoch / self.first_cycle_steps * (self.cycle_mult - 1) + 1), self.cycle_mult))
                    self.cycle = n
                    self.step_in_cycle = epoch - int(self.first_cycle_steps * (self.cycle_mult ** n - 1) / (self.cycle_mult - 1))
                    self.cur_cycle_steps = self.first_cycle_steps * self.cycle_mult ** (n)
            else:
                # Still inside the first cycle.
                self.cur_cycle_steps = self.first_cycle_steps
                self.step_in_cycle = epoch
                
        # Peak learning rate decays geometrically with the cycle index.
        self.max_lr = self.base_max_lr * (self.gamma**self.cycle)
        self.last_epoch = math.floor(epoch)
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr

class TransformerModel(nn.Module):
    """Classifier over a fixed-length feature vector.

    The forward pass currently uses only the two linear layers
    (``input_dim -> 128 -> output_dim``). The gene2vec embedding and the
    Transformer encoder are still constructed, so their parameters remain
    part of the module, but ``forward`` does not call them.

    Args:
        input_dim: Number of input features per sample (size of ``fc1`` input).
        output_dim: Number of output classes (logits returned by ``forward``).
        num_heads: Attention heads of each encoder layer.
        hidden_dim: ``d_model`` of the encoder layers; must be divisible by
            ``num_heads``.
        num_layers: Number of stacked encoder layers.

    Raises:
        FileNotFoundError: if ``gene2vec.npy`` is not in the working directory.
    """

    def __init__(self, input_dim, output_dim, num_heads=8, hidden_dim=200, num_layers=6):
        super(TransformerModel, self).__init__()
        # Pretrained embedding table with one extra all-zero row appended.
        gene2vec_weight = np.load('gene2vec.npy')
        zero_row = np.zeros((1, gene2vec_weight.shape[1]))
        gene2vec_weight = np.concatenate((gene2vec_weight, zero_row), axis=0)
        # float32 matches the default dtype of every other layer; the previous
        # float64 table could not be fed into the float32 encoder.
        gene2vec_weight = torch.from_numpy(gene2vec_weight.astype(np.float32))

        self.embedding = nn.Embedding.from_pretrained(gene2vec_weight)
        # NOTE(review): d_model comes from hidden_dim while the embedding width
        # comes from gene2vec.npy; they must agree before the encoder path can
        # be enabled in forward().
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads),
            num_layers=num_layers
        )
        # Sized from input_dim instead of a hard-coded 1000, so the model
        # follows the constructor argument.
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)

    def forward(self, x):
        """Return class logits of shape ``(batch, output_dim)``.

        ``x`` is cast to float and passed through ``fc1`` then ``fc2``; there
        is no non-linearity between the two layers. The embedding /
        Transformer-encoder path is intentionally disabled here.
        """
        output = self.fc1(x.float())
        output = self.fc2(output)
        return output

# Custom dataset class
import numpy as np

class CustomDataset(Dataset):
    """Map-style dataset pairing rows of a feature CSV with rows of a label CSV.

    Args:
        data_file: CSV of features; the first column is dropped and every
            remaining column is cast to int64.
        label_file: CSV holding the labels.
        label_col: Positional index of the label column in ``label_file``
            (defaults to 6, the column used previously).

    Raises:
        ValueError: if the two files do not contain the same number of rows.
    """

    def __init__(self, data_file, label_file, label_col=6):
        features = pd.read_csv(data_file).iloc[:, 1:]
        labels = pd.read_csv(label_file).iloc[:, label_col]
        # __len__ is driven by the labels, so a mismatch would either drop
        # feature rows silently or raise IndexError mid-epoch.
        if len(features) != len(labels):
            raise ValueError(
                "row count mismatch: {} has {} rows but {} has {}".format(
                    data_file, len(features), label_file, len(labels)))
        # Stored as numpy int64 arrays; the default DataLoader collate turns
        # them into torch.long tensors.
        self.data = features.to_numpy().astype(np.int64)
        self.labels = labels.to_numpy().astype(np.int64)

    def __len__(self):
        """Number of samples (one per label row)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for sample ``idx``."""
        return self.data[idx], self.labels[idx]




In [45]:
def evaluate_split(model, loader, device):
    """Run ``model`` over ``loader`` and collect logits, predictions and accuracy.

    Args:
        model: Classifier returning two logits per sample.
        loader: DataLoader yielding ``(inputs, labels)`` batches; it must not
            shuffle, because rows are later matched to the label CSV by position.
        device: Device the batches are moved to.

    Returns:
        ``(frame, accuracy)`` where ``frame`` has the columns ``trans1``,
        ``trans2`` (the two logits) and ``COG_pred`` (arg-max class), and
        ``accuracy`` is a percentage.

    Raises:
        ValueError: if ``loader`` yields no batches.
    """
    model.eval()
    n_correct = 0
    n_total = 0
    batch_rows = []
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            logits = model(inputs)
            _, predicted = torch.max(logits, 1)
            # Append the predicted class as an extra column next to the logits.
            batch_rows.append(torch.cat((logits, predicted.unsqueeze(1)), dim=1).cpu())
            n_total += labels.size(0)
            n_correct += (predicted == labels).sum().item()
    if not batch_rows:
        raise ValueError("cannot evaluate an empty loader")
    frame = pd.DataFrame(torch.cat(batch_rows, dim=0).numpy(),
                         columns=['trans1', 'trans2', 'COG_pred'])
    return frame, 100 * n_correct / n_total


# Hyperparameters (identical for every fold)
input_dim = 1000  # number of features
output_dim = 2  # number of classes
num_epochs = 100
batch_size = 128
# Fall back to the CPU when no GPU is available instead of crashing on "cuda".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for i in range(5):
    fold_dir = 'lookupcsv/CrossValid/cross{}'.format(i)
    out_dir = './tb_log/Transformer_CrossValid_cross{}'.format(i)
    os.makedirs(out_dir, exist_ok=True)

    label_files = {split: '{}/{}.csv'.format(fold_dir, split)
                   for split in ('train', 'valid', 'test')}
    train_dataset = CustomDataset('{}/GWAS_train.csv'.format(fold_dir), label_files['train'])
    valid_dataset = CustomDataset('{}/GWAS_valid.csv'.format(fold_dir), label_files['valid'])
    test_dataset = CustomDataset('{}/GWAS_test.csv'.format(fold_dir), label_files['test'])

    # Data loaders. NOTE(review): the training loader is not shuffled; it is
    # reused below for evaluation, where row order must match the label CSV.
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size)

    # Model, loss, optimizer and learning-rate schedule
    model = TransformerModel(input_dim, output_dim).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    scheduler = CosineAnnealingWarmupRestarts(
        optimizer,
        first_cycle_steps=15,
        cycle_mult=2,
        max_lr=1e-4,
        min_lr=1e-6,
        warmup_steps=5,
        gamma=0.9
    )

    # Training: the scheduler is stepped once per epoch
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        if epoch % 10 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss / len(train_loader)}")
        scheduler.step()

    # Evaluation on every split, in the original order (train, test, valid).
    for split, loader in (('train', train_loader),
                          ('test', test_loader),
                          ('valid', valid_loader)):
        eval_frame, accuracy = evaluate_split(model, loader, device)
        # Each line names the split it reports (all three used to say "test set").
        print(f"Accuracy on {split} set: {accuracy:.2f}%")
        # Attach the 'ADD' column as 'COG' (aligned on the row index) and
        # replace missing values with 0.
        eval_frame['COG'] = pd.read_csv(label_files[split])['ADD']
        eval_frame = eval_frame.fillna(0)
        # Written once instead of write -> read -> rewrite. index_label keeps
        # the leading 'Unnamed: 0' column that the old round trip produced, so
        # consumers of these files see the same columns.
        eval_frame.to_csv(os.path.join(out_dir, '{}_eval.csv'.format(split)),
                          index_label='Unnamed: 0')

Epoch 1/100, Loss: 0.6595209389925003
Epoch 11/100, Loss: 0.504757322371006
Epoch 21/100, Loss: 0.4821956902742386
Epoch 31/100, Loss: 0.44254909828305244
Epoch 41/100, Loss: 0.43279799073934555
Epoch 51/100, Loss: 0.40062373504042625
Epoch 61/100, Loss: 0.36234208196401596
Epoch 71/100, Loss: 0.33920204266905785
Epoch 81/100, Loss: 0.33145179226994514
Epoch 91/100, Loss: 0.3215184435248375
Accuracy on test set: 85.41%
Accuracy on test set: 74.83%
Accuracy on test set: 75.89%
Epoch 1/100, Loss: 0.6076017618179321
Epoch 11/100, Loss: 0.49539998918771744
Epoch 21/100, Loss: 0.4719289243221283
Epoch 31/100, Loss: 0.42323895171284676
Epoch 41/100, Loss: 0.41145552322268486
Epoch 51/100, Loss: 0.37342918664216995
Epoch 61/100, Loss: 0.3304347060620785
Epoch 71/100, Loss: 0.30570539087057114
Epoch 81/100, Loss: 0.2976483516395092
Epoch 91/100, Loss: 0.28742471151053905
Accuracy on test set: 89.36%
Accuracy on test set: 75.52%
Accuracy on test set: 75.52%
Epoch 1/100, Loss: 0.6772160977125168

Unnamed: 0.1,Unnamed: 0,trans1,trans2,COG_score,COG
0,0,-0.449643,0.496058,1.0,1.0
1,1,0.120944,0.148253,1.0,1.0
2,2,-0.276933,0.228604,1.0,1.0
3,3,0.048918,0.005690,0.0,1.0
4,4,0.531295,-0.387797,0.0,1.0
...,...,...,...,...,...
420,420,1.238858,-0.906573,0.0,0.0
421,421,2.393188,-1.505869,0.0,0.0
422,422,1.454986,-1.096195,0.0,0.0
423,423,1.876705,-1.252171,0.0,0.0


In [5]:
# from gensim.models import Word2Vec
# train_data = pd.read_csv('lookupcsv/CrossValid/cross0/GWAS_train.csv').iloc[:,1:].to_numpy().T
# # df = pd.read_csv(dataset_path+"train_data.csv")
# gene_names = [str(gene) for gene in range(train_data.shape[0])]
# model = Word2Vec([gene_names], vector_size=200, window=10, min_count=0, workers=4)
# np.save('gene2vec.npy',model.wv.vectors)
# # time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# # print('-----finished -----',time) 
# test_data = pd.read_csv('lookupcsv/CrossValid/cross0/GWAS_test.csv').iloc[:,1:].to_numpy()
# test_label = pd.read_csv('lookupcsv/CrossValid/cross0/test.csv').iloc[:,5].to_numpy()
# pred = model.predict(test_data)
# (pred==test_label).sum()/len(pred)

In [2]:
import pandas as pd

In [4]:
# Load the fold-0 training table and keep every column except the first one,
# wrapped in a one-element list (one entry per task).
data_train = pd.read_csv('lookupcsv/CrossValid/cross0/train.csv')
train_data = [data_train.iloc[:, 1:]]
# The per-task preprocess_pipeline step is not applied here.