# Procedure

### 1) Predict Protein structure using just Amino Acid Sequence using best approach (model 3 with 7  layers)
### 2) Analyze results and see if it align with the ones in paper
### 3) Predict protein structure dictation by merging primary structure and one protein property using best approach
### 4) Analyze results and see if it align with the ones in paper

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Libraries

In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, random_split
from torch.utils.data import Dataset
import torch.nn as nn
import torch.optim as optim
import numpy as np
import os

In [2]:
os.chdir("/content/drive/MyDrive/Project")

### Data Preparation, preprocessing

In [3]:
class ProteinDataset(Dataset):
    def __init__(self, feat_dir, label_dir):

        data = []
        with open(feat_dir, 'r') as file:
            print (feat_dir)
            for line in file:
                if not line.startswith('#'):
                    x = [float(x) if x.replace('.','',1).isdigit() else x for x in line.split()]
                    x.extend([0]* (2060 - len(x)))
                    new_lis = []
                    for it in x:
                        lis = [0]*24
                        lis[int(it)] = 1
                        new_lis.append(lis)
                    data.append(new_lis)
        self.features=data

        label = []
        with open(label_dir, 'r') as file:
            for line in file:
                if not line.startswith('#'):
                    x = [float(x) if x.replace('.','',1).isdigit() else x for x in line.split()]
                    x.extend([0]*(2060-len(x)))
                    label.append(x)

        self.labels = label

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return torch.tensor(self.features[idx]).transpose(0,1).float(), torch.tensor(self.labels[idx])

In [4]:
big = ProteinDataset(feat_dir="4Protein 3-2/data/aa1.dat",label_dir="4Protein 3-2/data/dssp.lab.tag.dat")

4Protein 3-2/data/aa1.dat


### Function to split the dataset into training, validation, and test sets

In [5]:
def split_data(dataset):
    total_size = len(dataset)
    train_size = int(0.6 * total_size)
    valid_size = int(0.1 * total_size)
    test_size = total_size - train_size - valid_size

    train_dataset, valid_dataset, test_dataset = random_split(dataset, [train_size, valid_size, test_size])

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)

    return train_loader, valid_loader, test_loader

### Define the CNN Model

In [27]:
import torch
import torch.nn as nn

class CustomModel(nn.Module):
    def __init__(self, in_channels):
        super(CustomModel, self).__init__()

        # Convolutional 1D Layer 1
        self.conv1 = nn.Conv1d(in_channels=in_channels, out_channels=256, kernel_size=17, padding=8)  # padding to keep length the same
        self.prelu1 = nn.PReLU()

        # Convolutional 1D Layer 2
        self.conv2 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=16, padding=8)  # corrected padding to maintain length
        self.prelu2 = nn.PReLU()

        # Convolutional 1D Layer 3
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=15, padding=7)  # padding as (kernel_size-1)/2
        self.prelu3 = nn.PReLU()

        # Dropout layer
        self.dropout = nn.Dropout(p=0.5)

        # Fully Connected Layer
        self.fc = nn.Linear(64 * 2060, 4)  # Ensuring this matches the flattened size
        self.prelu_fc = nn.PReLU()

    def forward(self, x):
        x = self.prelu1(self.conv1(x))
        x = self.prelu2(self.conv2(x))
        x = self.prelu3(self.conv3(x))
        x = self.dropout(x)
        x = x.view(x.size(0), -1)  # Ensure flattening to the correct size
        x = self.prelu_fc(self.fc(x))
        return x

### Train the Model with an appropriate optimizer and loss function, monitoring the validation loss to avoid overfitting

#### for big

In [7]:
train_loader, valid_loader, test_loader = split_data(big)

In [28]:
model = CustomModel(in_channels=24)

In [29]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


epochs = 10
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for data, target in train_loader:
        print(data.shape)
        print(target.shape)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_loader)

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for data, target in valid_loader:
            output = model(data)
            loss = criterion(output, target)
            val_loss += loss.item()
    val_loss /= len(valid_loader)

    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')

test_loader = DataLoader(test_loader, batch_size=10, shuffle=False)

model.eval()
test_loss = 0
correct = 0
total = 0
with torch.no_grad():
    for data, target in test_loader:
        output = model(data)
        loss = criterion(output, target)
        test_loss += loss.item()
        _, predicted = torch.max(output.data, 1)
        total += target.size(0)
        correct += (predicted == target).sum().item()
test_loss /= len(test_loader)
test_accuracy = 100 * correct / total

print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%')

torch.Size([64, 24, 2060])
torch.Size([64, 2060])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x131904 and 131840x4)

#### for small

### Evaluate the Model

### Test the model's performance on the test set

### Record key metrics like accuracy and loss