In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for input_file in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# !pip install torch

In [None]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import os
import torch

import torch
from torch.utils.data import Dataset
import pandas as pd
import os

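# NOTE: ProteinDataset.__getitem__ calls encode_label (defined after the class in this cell) to turn a label string into integer class ids.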

class ProteinDataset(Dataset):
    """Training dataset pairing each protein's PSSM table with its encoded labels.

    Args:
        sequences_csv: Path to a CSV with one row per protein and a ``PDB_ID`` column.
        labels_csv: Path to a CSV with ``PDB_ID`` and ``SEC_STRUCT`` columns.
        pssm_dir: Directory containing one ``<PDB_ID>_train.csv`` file per protein.

    Each item is ``(pssm_tensor, label_encoded)`` where ``pssm_tensor`` is a
    float32 tensor of shape ``(num_feature_columns, num_rows)`` of the PSSM file
    and ``label_encoded`` is whatever ``encode_label`` returns for the protein's
    ``SEC_STRUCT`` string.
    """

    # Columns of the PSSM file that are dropped before building the tensor.
    _NON_FEATURE_COLUMNS = ("RES_NUM", "AMINO_ACID")

    def __init__(self, sequences_csv, labels_csv, pssm_dir):
        self.labels_df = pd.read_csv(labels_csv)
        self.sequences_df = pd.read_csv(sequences_csv)
        self.pssm_dir = pssm_dir
        # Build the PDB_ID -> SEC_STRUCT lookup once instead of scanning the
        # whole labels frame on every __getitem__ call. keep='first' mirrors the
        # previous behaviour of taking the first matching row.
        first_rows = self.labels_df.drop_duplicates(subset='PDB_ID', keep='first')
        self._label_by_id = dict(zip(first_rows['PDB_ID'], first_rows['SEC_STRUCT']))

    def __len__(self):
        """Return the number of proteins listed in the sequences CSV."""
        return len(self.sequences_df)

    def __getitem__(self, idx):
        """Load the PSSM tensor and encoded label for the protein at row ``idx``.

        Raises:
            KeyError: If the protein's ``PDB_ID`` has no row in the labels CSV.
        """
        pdb_id = self.sequences_df.iloc[idx]['PDB_ID']
        try:
            label = self._label_by_id[pdb_id]
        except KeyError:
            raise KeyError(f"No SEC_STRUCT label found for PDB_ID {pdb_id!r}") from None

        # Read the per-protein PSSM file, skipping the non-feature columns.
        pssm_path = os.path.join(self.pssm_dir, f"{pdb_id}_train.csv")
        pssm_df = pd.read_csv(
            pssm_path,
            usecols=lambda column: column not in self._NON_FEATURE_COLUMNS,
        )
        # Coerce everything to numbers; unparsable or missing cells become 0.
        pssm_df = pssm_df.apply(pd.to_numeric, errors='coerce').fillna(0)
        # .values is (rows, feature columns); .T makes it (feature columns, rows).
        # collate_fn transposes it back before padding along the row axis.
        pssm_tensor = torch.tensor(pssm_df.values, dtype=torch.float32).T

        label_encoded = encode_label(label)

        return pssm_tensor, label_encoded

# Example label encoder; adjust the mapping to match the labels actually present in the data.
def encode_label(label):
    """Map each secondary-structure character to its integer class id.

    Args:
        label: Iterable of single-character labels (e.g. a string such as ``"HHEC"``).

    Returns:
        A 1-D ``torch.long`` tensor with one class id per character.

    Raises:
        KeyError: If a character is not one of the mapped labels.
    """
    label_dict = {'H': 0, 'E': 1, 'C': 2}  # example label dictionary
    class_ids = list(map(label_dict.__getitem__, label))
    return torch.tensor(class_ids, dtype=torch.long)



In [None]:
import torch.nn as nn
import torch.nn.functional as F

class FullyConvNet(nn.Module):
    """Fully convolutional 1-D network producing per-position class log-probabilities.

    Three Conv1d -> BatchNorm1d -> ReLU stages (64, 128, 256 channels, kernel 3,
    padding 1, so the sequence length is preserved) followed by a 1x1 convolution
    down to ``num_classes`` channels.

    Args:
        num_classes: Number of output classes per position.
        input_channels: Number of feature channels per position in the input.
            Defaults to 20, the value that used to be hard-coded.
    """

    def __init__(self, num_classes, input_channels=20):
        super(FullyConvNet, self).__init__()
        # Layer names and creation order are kept as-is so existing checkpoints
        # (state_dict keys) remain loadable.
        self.conv1 = nn.Conv1d(in_channels=input_channels, out_channels=64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(64)
        self.conv2 = nn.Conv1d(64, 128, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(128)
        self.conv3 = nn.Conv1d(128, 256, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(256)
        self.final_conv = nn.Conv1d(256, num_classes, kernel_size=1)

    def forward(self, x):
        """Run the network.

        Args:
            x: Tensor of shape ``(batch, sequence_length, input_channels)``.

        Returns:
            Log-probabilities of shape ``(batch, num_classes, sequence_length)``.
        """
        # Conv1d expects channels before length: (batch, channels, sequence_length).
        x = x.permute(0, 2, 1)

        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.final_conv(x)
        # Log-probabilities over the class axis, suitable for NLLLoss. Feeding
        # them to CrossEntropyLoss also works: its internal log_softmax leaves
        # already-normalised log-probabilities unchanged.
        return F.log_softmax(x, dim=1)


In [None]:
import matplotlib.pyplot as plt
# def calculate_accuracy(predictions, targets):
#     # Get the predicted class for each sequence
#     _, predicted_classes = predictions.max(dim=1)
#     # Compare with the target class
#     correct = (predicted_classes == targets).sum().item()
#     # Compute accuracy
#     accuracy = correct / targets.size(0)
#     return accuracy
def calculate_accuracy(outputs, labels, ignore_index=-1):
    """Return the fraction of non-ignored labels predicted correctly.

    Works for plain classification (``outputs`` of shape ``(batch, classes)``,
    ``labels`` of shape ``(batch,)``) and for per-position prediction
    (``outputs`` of shape ``(batch, classes, length)``, ``labels`` of shape
    ``(batch, length)``): the class axis is always dimension 1.

    Args:
        outputs: Class scores with the class axis at dimension 1.
        labels: Integer targets with the shape of ``outputs`` minus the class axis.
        ignore_index: Label value marking padded positions to exclude
            (``-1`` is the padding value used by ``collate_fn``).

    Returns:
        Accuracy as a float in ``[0, 1]``; ``0.0`` when there is no valid label.
    """
    predicted = outputs.detach().argmax(dim=1)
    valid = labels != ignore_index
    # Divide by the number of scored labels, not by the batch size, so that
    # per-position targets are normalised correctly.
    num_valid = valid.sum().item()
    if num_valid == 0:
        return 0.0
    correct = (predicted[valid] == labels[valid]).sum().item()
    return correct / num_valid



def _plot_training_curves(train_losses, train_accuracy):
    """Draw the per-epoch loss and accuracy curves side by side and show the figure."""
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Train Loss')
    plt.title('Loss during training')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(train_accuracy, label='Train Accuracy')
    plt.title('Accuracy during training')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.show()


def train_model(model, train_loader, criterion, optimizer, num_epochs, device, pad_label=-1):
    """Train ``model`` for ``num_epochs`` epochs, logging and plotting loss/accuracy.

    Args:
        model: Network whose output has the class axis at dimension 1
            (e.g. ``(batch, classes, length)`` as returned by ``FullyConvNet``).
        train_loader: Iterable of ``(sequences, labels)`` batches that supports ``len()``.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to.
        pad_label: Label value marking padded positions; these are excluded from
            the accuracy (``-1`` is the padding value used by ``collate_fn``).

    Returns:
        None. One line is printed per epoch and the curves are plotted at the end.
    """
    train_losses = []
    train_accuracy = []

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        total_correct = 0
        total_samples = 0

        for sequences, labels in train_loader:
            sequences, labels = sequences.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Take the arg-max over the class axis directly. Reshaping the
            # (batch, classes, length) output with view(-1, num_classes) would
            # interleave class and position values and corrupt the accuracy.
            predicted = outputs.detach().argmax(dim=1)
            mask = labels != pad_label  # skip padded positions
            total_correct += (predicted[mask] == labels[mask]).sum().item()
            total_samples += mask.sum().item()

        # Guard against an empty loader / a loader with only padded labels.
        num_batches = len(train_loader)
        epoch_loss = running_loss / num_batches if num_batches else 0.0
        epoch_accuracy = total_correct / total_samples if total_samples else 0.0
        train_losses.append(epoch_loss)
        train_accuracy.append(epoch_accuracy)

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")

    _plot_training_curves(train_losses, train_accuracy)


In [None]:
from torch.utils.data import DataLoader
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import os
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader

from torch.nn.utils.rnn import pad_sequence
import matplotlib.pyplot as plt

def collate_fn(batch):
    """Collate ``(pssm, label)`` pairs into padded batch tensors.

    Each PSSM tensor has its first two axes swapped and is then zero-padded
    along the new first axis; each label tensor is copied, cast to long and
    padded with -1 so all rows share one length.

    Args:
        batch: Sequence of ``(pssm_tensor, label_tensor)`` pairs.

    Returns:
        ``(sequences_padded, labels_padded)``, both batch-first.
    """
    pssm_batch, label_batch = zip(*batch)

    # Swap the first two axes so pad_sequence pads along what was axis 1.
    swapped = [torch.transpose(pssm, 0, 1) for pssm in pssm_batch]
    sequences_padded = pad_sequence(swapped, batch_first=True, padding_value=0)

    # Fresh long copies of the labels, padded with -1.
    label_copies = [lab.clone().detach().long() for lab in label_batch]
    labels_padded = pad_sequence(label_copies, batch_first=True, padding_value=-1)

    return sequences_padded, labels_padded



# Requires ProteinDataset, FullyConvNet, collate_fn and train_model from the cells above.

# Locations of the training CSVs and the per-protein PSSM files on Kaggle.
sequences_csv = '/kaggle/input/deep-learning-for-msc-202324/seqs_train.csv'
labels_csv = '/kaggle/input/deep-learning-for-msc-202324/labels_train.csv'
pssm_dir = '/kaggle/input/deep-learning-for-msc-202324/train/'

# Build the dataset and its loader.
train_dataset = ProteinDataset(sequences_csv, labels_csv, pssm_dir)
# The loader is given collate_fn so each batch comes back as padded tensors
# (the earlier version of this line used the default collation).
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create the model.
num_classes = 3  # three classes assumed: Helix, Sheet, Coil
model = FullyConvNet(num_classes).to(device)

# Loss and optimizer. ignore_index=-1 matches the label padding value used in collate_fn.
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = Adam(model.parameters(), lr=0.001)

# Train.
num_epochs = 5
train_model(model, train_loader, criterion, optimizer, num_epochs, device)


In [None]:
def save_model_and_optimizer(model, optimizer, save_path="model_checkpoint.pth"):
    """Write the model and optimizer state dicts to ``save_path`` as one checkpoint.

    Args:
        model: Object exposing ``state_dict()``.
        optimizer: Object exposing ``state_dict()``.
        save_path: Destination file passed to ``torch.save``.
    """
    checkpoint = {}
    checkpoint['model_state_dict'] = model.state_dict()
    checkpoint['optimizer_state_dict'] = optimizer.state_dict()
    torch.save(checkpoint, save_path)
    print(f"Model and optimizer state dicts saved to {save_path}")
    
# Save the model and optimizer state to model_checkpoint.pth (a path relative to the working directory).
save_model_and_optimizer(model, optimizer, "model_checkpoint.pth")


In [None]:
def load_model_and_optimizer(model, optimizer, load_path="model_checkpoint.pth", map_location=None):
    """Restore model and optimizer state from a checkpoint, in place.

    The checkpoint must be a dict with ``'model_state_dict'`` and
    ``'optimizer_state_dict'`` entries, as written by ``save_model_and_optimizer``.

    Args:
        model: Module whose ``load_state_dict`` is called with the saved weights.
        optimizer: Optimizer whose ``load_state_dict`` is called with the saved
            state. This also restores the saved param-group settings such as
            the learning rate.
        load_path: Checkpoint file to read.
        map_location: Forwarded to ``torch.load``; pass the target device so a
            checkpoint saved on GPU can be loaded in a CPU-only session.
            ``None`` keeps the previous behaviour.
    """
    checkpoint = torch.load(load_path, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    print(f"Model and optimizer state dicts loaded from {load_path}")
num_classes =3
model = FullyConvNet(num_classes).to(device)  # fresh model instance, moved to the device
optimizer = optim.Adam(model.parameters(), lr=1e-5)  # fresh optimizer instance

# Load the saved model and optimizer state (this repeats by hand what load_model_and_optimizer does).
checkpoint = torch.load("/kaggle/working/model_checkpoint.pth", map_location=device)  # map tensors onto the current device
model.load_state_dict(checkpoint['model_state_dict'])
# NOTE(review): load_state_dict restores the param groups stored in the checkpoint, so the
# lr=1e-5 given above is presumably replaced by the saved learning rate - confirm this is intended.
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Move any tensors held in the optimizer state onto the device.
# NOTE(review): likely redundant given map_location=device above - verify before removing.
for state in optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.to(device)

# Lists intended for loss/accuracy history.
# NOTE(review): nothing in this cell reads them; train_model keeps its own local lists.
train_losses = []
train_accuracy = []

# Continue training from the restored state.
num_epochs = 100
train_model(model, train_loader, criterion, optimizer, num_epochs, device)


## Ray Tune

In [None]:
# # !pip install ray[tune]
# !pip install -U ipywidgets

In [None]:
# !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev

In [None]:
# import torch_xla
# import torch_xla.debug.metrics as met
# import torch_xla.distributed.parallel_loader as pl
# import torch_xla.utils.utils as xu
# import torch_xla.core.xla_model as xm
# import torch_xla.distributed.xla_multiprocessing as xmp
# import torch_xla.test.test_utils as test_utils

# import warnings
# warnings.filterwarnings("ignore")

In [None]:
# from ray import tune
# from ray import train  # 导入ray.train
# import os
# import torch
# import torch.nn.functional as F
# import torch.nn as nn
# from torch.utils.data import DataLoader

# # 更新的训练函数
# def train_protein(config):
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
#     model = FullyConvNet(input_channels=41, num_classes=3).to(device)
#     optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
#     criterion = nn.CrossEntropyLoss()
    
#     # 使用ray.train.get_checkpoint()获取检查点
#     checkpoint = train.get_checkpoint()
#     if checkpoint:
#         with checkpoint.as_directory() as checkpoint_dir:
#             checkpoint_path = os.path.join(checkpoint_dir, "checkpoint.pt")
#             model_state, optimizer_state = torch.load(checkpoint_path)
#             model.load_state_dict(model_state)
#             optimizer.load_state_dict(optimizer_state)
    
#     train_loader = DataLoader(train_dataset, batch_size=int(config["batch_size"]), shuffle=True)

#     for epoch in range(10):  # 可以根据需要调整epoch数量
#         total_loss = 0
#         model.train()
#         for sequences, labels in train_loader:
#             sequences, labels = sequences.to(device), labels.to(device)
#             optimizer.zero_grad()
#             output = model(sequences)
#             loss = criterion(output, labels)
#             loss.backward()
#             optimizer.step()
#             total_loss += loss.item()

#         # 在Ray Tune中报告性能指标
#         tune.report(loss=total_loss/len(train_loader))



In [None]:
# from ray.tune import CLIReporter
# from ray.tune.schedulers import ASHAScheduler

# def tune_hyperparameters(num_samples=10, max_num_epochs=10, gpus_per_trial=1):
#     data_dir = os.path.abspath("./data")  # 根据实际情况调整数据目录
#     config = {
#         "lr": tune.loguniform(1e-4, 1e-1),
#         "batch_size": tune.choice([16, 32, 64, 128])
#     }

#     scheduler = ASHAScheduler(
#         metric="loss",
#         mode="min",
#         max_t=max_num_epochs,
#         grace_period=1,
#         reduction_factor=2)
    
#     reporter = CLIReporter(
#         metric_columns=["loss", "training_iteration"])
    
#     result = tune.run(
#         tune.with_parameters(
#             train_protein,
#             data_dir=data_dir),
#         resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
#         config=config,
#         num_samples=num_samples,
#         scheduler=scheduler,
#         progress_reporter=reporter)
    
#     best_trial = result.get_best_trial("loss", "min", "last")
#     print("Best trial config: {}".format(best_trial.config))
#     print("Best trial final validation loss: {}".format(
#         best_trial.last_result["loss"]))
    
#     # 可以根据需要加载最佳模型并进一步处理
#     # best_trained_model = FullyConvNet(input_channels=41, num_classes=3)
#     # best_checkpoint_dir = best_trial.checkpoint.value
#     # checkpoint_path = os.path.join(best_checkpoint_dir, "checkpoint")
#     # model_state, optimizer_state = torch.load(checkpoint_path)
#     # best_trained_model.load_state_dict(model_state)

# if __name__ == "__main__":
#     # 这里假设你已经定义了train_dataset或其他相关变量
#     tune_hyperparameters(num_samples=10, max_num_epochs=10, gpus_per_trial=1)
#     # 使用最佳超参数重新训练模型
    
#     best_lr = best_trial.config["lr"]
#     best_batch_size = best_trial.config["batch_size"]

#     train_loader = DataLoader(train_dataset, batch_size=best_batch_size, shuffle=True)

#     # 重新初始化模型
#     model = FullyConvNet(input_channels=41, num_classes=3).to(device)
#     optimizer = torch.optim.Adam(model.parameters(), lr=best_lr)
#     criterion = nn.CrossEntropyLoss()

#     # 重新训练模型
#     train_model(model, train_loader, criterion, optimizer, num_epochs=10, device=device)

#     # 在这里添加代码以在验证集上评估模型性能


## PREDICTION

In [None]:
import torch
import pandas as pd
import os
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import csv

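# Stale note from an earlier revision: it asked to keep the encode_amino_acid and encode_sequence helpers, but neither is defined or used in the cells shown here.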

class TestProteinDataset(Dataset):
    """Inference dataset yielding each test protein's PSSM tensor and its id.

    Args:
        sequences_csv: Path to a CSV with one row per protein and a ``PDB_ID`` column.
        test_dir: Directory containing one ``<PDB_ID>_test.csv`` PSSM file per protein.

    Each item is ``(pssm_tensor, pdb_id)`` where ``pssm_tensor`` is a float32
    tensor of shape ``(num_feature_columns, num_rows)`` of the PSSM file.
    """

    # Columns of the PSSM file that are dropped before building the tensor.
    _NON_FEATURE_COLUMNS = ("RES_NUM", "AMINO_ACID")

    def __init__(self, sequences_csv, test_dir):
        self.sequences_df = pd.read_csv(sequences_csv)
        self.pssm_dir = test_dir

    def __getitem__(self, idx):
        """Load the PSSM tensor for the protein at row ``idx`` of the sequences CSV."""
        pdb_id = self.sequences_df.iloc[idx]['PDB_ID']

        # Read the per-protein PSSM file, skipping the non-feature columns.
        pssm_path = os.path.join(self.pssm_dir, f"{pdb_id}_test.csv")
        pssm_df = pd.read_csv(
            pssm_path,
            usecols=lambda column: column not in self._NON_FEATURE_COLUMNS,
        )
        # Coerce everything to numbers; unparsable or missing cells become 0.
        pssm_df = pssm_df.apply(pd.to_numeric, errors='coerce').fillna(0)
        # .values is (rows, feature columns); .T makes it (feature columns, rows).
        pssm_tensor = torch.tensor(pssm_df.values, dtype=torch.float32).T

        return pssm_tensor, pdb_id

    def __len__(self):
        """Return the number of proteins listed in the sequences CSV."""
        return len(self.sequences_df)


def predict_collate_fn(batch):
    """Collate ``(pssm_tensor, pdb_id)`` pairs into one zero-padded batch.

    The input tensors are channel-first, ``(channels, length)``, and lengths may
    differ between proteins. Padding is therefore applied along the length
    axis, and the result is returned in the same channel-first layout as before.

    Args:
        batch: Sequence of ``(pssm_tensor, pdb_id)`` pairs.

    Returns:
        ``(sequences_padded, pdb_ids)`` where ``sequences_padded`` has shape
        ``(batch, channels, max_length)`` and ``pdb_ids`` is a tuple of ids.
    """
    sequences, pdb_ids = zip(*batch)
    # pad_sequence pads along the first axis, so put length first, pad, then
    # restore the channel-first layout. Padding the (channels, length) tensors
    # directly fails as soon as two proteins in a batch differ in length.
    length_first = [seq.transpose(0, 1) for seq in sequences]
    padded = pad_sequence(length_first, batch_first=True, padding_value=0)
    sequences_padded = padded.permute(0, 2, 1).contiguous()
    return sequences_padded, pdb_ids


def predict(model, loader, device):
    """Run ``model`` over ``loader`` and collect per-position class indices per id.

    Args:
        model: Module called on each batch after its last two axes are swapped.
        loader: Iterable of ``(sequences, pdb_ids)`` batches.
        device: Device the sequences are moved to.

    Returns:
        Dict mapping each id to a list of predicted class indices, one per
        position of the model output. Padded positions are not removed, so
        batches are expected to hold a single protein.
    """
    model.eval()
    predictions = {}

    with torch.no_grad():
        for sequences, pdb_ids in loader:
            # Swap the last two axes before handing the batch to the model.
            batch_input = sequences.to(device).permute(0, 2, 1)
            scores = model(batch_input)

            for row, pdb_id in enumerate(pdb_ids):
                # Index of the highest score along the first axis of this item's output.
                best_class = torch.max(scores[row], dim=0).indices
                predictions.setdefault(pdb_id, []).extend(best_class.cpu().numpy())

    return predictions

# Rebuild the network and load the trained weights for inference.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# map_location lets a checkpoint saved on GPU be loaded in a CPU-only session.
checkpoint = torch.load('/kaggle/working/model_checkpoint.pth', map_location=device)
num_classes = 3
model = FullyConvNet(num_classes).to(device)
model.load_state_dict(checkpoint['model_state_dict'])

test_dataset = TestProteinDataset(sequences_csv='/kaggle/input/deep-learning-for-msc-202324/seqs_test.csv', test_dir='/kaggle/input/deep-learning-for-msc-202324/test/')
# batch_size stays at 1: predict() keeps every output position, so padded
# positions from larger batches would end up in the submission.
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=predict_collate_fn)

predictions = predict(model, test_loader, device)
print(len(predictions))

# Decode class indices with the exact inverse of the training-time encoding.
# Deriving it from encode_label keeps the two in sync (H->0, E->1, C->2);
# the previous hard-coded decoding (0->C, 1->H, else E) did not match it.
ss_letters = 'HEC'
index_to_label = {
    int(code): letter
    for letter, code in zip(ss_letters, encode_label(ss_letters))
}

output_path = '/kaggle/working/submission.csv'

with open(output_path, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['ID', 'STRUCTURE'])
    for pdb_id, pred_labels in predictions.items():
        # Residue ids are 1-based: <PDB_ID>_<position>.
        for residue_index, residue_prediction in enumerate(pred_labels, start=1):
            residue_id = f'{pdb_id}_{residue_index}'
            prediction_label = index_to_label[int(residue_prediction)]
            # One row per residue.
            csvwriter.writerow([residue_id, prediction_label])

print(f"Predictions have been saved to {output_path}")
