In [1]:
import pandas as pd

In [1]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import os

class PeptideDataset(Dataset):
    def __init__(self, files, split='train', transform=None):
        # 初始为空的列表，用于存储各个分割的数据
        train_data_list = []
        val_data_list = []
        test_data_list = []

        # 对每个文件单独进行划分
        for file in files:
            # 读取文件
            data = pd.read_csv(file, header=None, skiprows=1)
            # 取第一列和第二列数据
            data = data.iloc[:, :2]
            
            # 划分数据
            train_temp, temp_data = train_test_split(data, test_size=0.3, random_state=42)
            val_temp, test_temp = train_test_split(temp_data, test_size=1/3, random_state=42)
            
            # 追加到相应的列表中
            train_data_list.append(train_temp)
            val_data_list.append(val_temp)
            test_data_list.append(test_temp)

        # 合并来自所有文件的数据
        if split == 'train':
            self.data = pd.concat(train_data_list, ignore_index=True)
        elif split == 'val':
            self.data = pd.concat(val_data_list, ignore_index=True)
        elif split == 'test':
            self.data = pd.concat(test_data_list, ignore_index=True)
        
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        column1 = row[0]
        column2 = row[1]
        if self.transform:
            column1 = self.transform(column1)
            column2 = self.transform(column2)
        return (column1, column2)

# 用于字符到整数的映射
def text_transform(text):
    bos_token = 2
    # 首先将文本转换为字符的ascii值列表
    transformed_text = [ord(char) for char in text]
    return transformed_text

# 创建数据集实例
root = 'data'
files = [os.path.join(root, f) for f in os.listdir(root) if f.endswith('csv')]
train_dataset = PeptideDataset(files, split='train', transform=text_transform)
val_dataset = PeptideDataset(files, split='val', transform=text_transform)
test_dataset = PeptideDataset(files, split='test', transform=text_transform)


[4pdvGPU Msg(31121:140200917776192:libvgpu.c:870)]: Initializing.....


([89,
  72,
  84,
  69,
  89,
  82,
  69,
  73,
  67,
  65,
  75,
  84,
  68,
  69,
  78,
  73,
  65,
  89,
  76,
  78,
  89,
  72,
  68,
  89,
  84,
  87,
  65,
  86,
  76,
  65,
  89,
  69,
  87,
  89],
 [73,
  78,
  83,
  81,
  76,
  69,
  70,
  75,
  73,
  75,
  80,
  70,
  83,
  76,
  86,
  83,
  83,
  83,
  82,
  87,
  76,
  86,
  75,
  82,
  71])

In [8]:
from torch.nn.utils.rnn import pad_sequence
import torch

def collate_fn(batch):
    pad_token = 1
    bos_token = 2
    
    # 处理batch中的每个样本，样本是(column1, column2)的形式
    batch_column1 = [torch.tensor([bos_token] + item[0] + [3]) for item in batch]  # 对第一列应用转换
    batch_column2 = [torch.tensor([bos_token] + item[1] + [3]) for item in batch]  # 对第二列应用转换
    
    # 对两列数据进行padding
    column1_padded = pad_sequence([torch.tensor(x) for x in batch_column1], 
                            padding_value=pad_token, batch_first=True)
    column2_padded = pad_sequence([torch.tensor(x) for x in batch_column2], 
                            padding_value=pad_token, batch_first=True)
    
    return column1_padded.T, column2_padded.T

# 示例：使用collate_fn生成dataloader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)


In [9]:
for src, tgt in train_loader:
    print(src)
    print(tgt)
    break

tensor([[ 2,  2,  2,  ...,  2,  2,  2],
        [89, 89, 89,  ..., 89, 89, 89],
        [68, 89, 72,  ..., 72, 70, 89],
        ...,
        [87, 87, 87,  ..., 87, 87, 87],
        [89, 72, 89,  ..., 89, 89, 89],
        [ 3,  3,  3,  ...,  3,  3,  3]])
tensor([[ 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
          2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2],
        [75, 77, 77, 82, 70, 69, 86, 87, 81, 69, 82, 84, 72, 65, 84, 86, 83, 82,
         81, 77, 82, 75, 76, 69, 84, 80, 75, 83, 86, 76, 77, 70],
        [70, 40, 82, 86, 83, 71, 76, 75, 76, 72, 73, 76, 76, 76, 87, 81, 80, 71,
         68, 76, 76, 82, 84, 76, 80, 84, 77, 65, 73, 83, 81, 86],
        [69, 43, 76, 70, 78, 71, 86, 82, 75, 83, 83, 87, 83, 82, 70, 87, 76, 80,
         67, 80, 75, 77, 80, 77, 82, 82, 69, 70, 81, 83, 77, 69],
        [69, 49, 80, 69, 75, 86, 65, 76, 81, 78, 72, 69, 71, 65, 65, 71, 83, 80,
         70, 86, 71, 65, 75, 86, 83, 87, 76, 71, 76, 78, 78, 87],
        [81

  column1_padded = pad_sequence([torch.tensor(x) for x in batch_column1],
  column2_padded = pad_sequence([torch.tensor(x) for x in batch_column2],
