<a href="https://colab.research.google.com/github/karim-khlf/HTML_CSS_Website/blob/main/protein_family_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/bayes-group-diffusion/protein_families/" + splits["train"])
df.head(10)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Unnamed: 0,sequence,family
0,MVDISLIMGSESDRAIANRAVSVIEKTKYTYEVMVISAHRNPDELD...,PurE
1,MGGKSDLEHLQPAIDILAELRIPHEVRIVSAHRTPDWMMEYASSAE...,PurE
2,MSAAPVAVIMGSQSDWTTMRHCADTLEALGIAFETLIVSAHRTPDR...,PurE
3,MENPPVGIIMGSQSDWATMREAATLLDELGIAYEAKIVSAHRTPDR...,PurE
4,MTKKVGIIMGSDSDLPIVEKAINTLVEYDVPFEVHVFSAHRTPDEA...,PurE
5,MGSDSDWTVMIAAAALLREFGVAYEVEVLSAHRTPQKMLAYGMDAR...,PurE
6,MTNKYKVAVVMGSDSDFPILKKCIKILKEFSVETEVHVISAHRTPS...,PurE
7,MQVAIFFGSKSDTEVMRGAANALKEFGIEYKAFVLSAHRVPEKLEE...,PurE
8,MPEVAILMGSKSDTSIAEKTSMALEEAGIAHEMRVISAHRNPDELD...,PurE
9,MTSNHSAPLVGIVMGSRSDWETMQHAAQKLDALGVPYEVKVVSAHR...,PurE


In [2]:
missing_sequences = sum(seq is None or seq == "" for seq in df["sequence"])
missing_families  = sum(fam is None or fam == "" for fam in df["family"])
print(missing_sequences, missing_families)


0 0


In [3]:
lengths = [len(seq) for seq in df["sequence"]]

print("Minimum length:", min(lengths))
print("Maximum length:", max(lengths))
print("Average length:", sum(lengths)/len(lengths))

Minimum length: 55
Maximum length: 1617
Average length: 182.85264644142342


In [4]:
# define the amino-acide vocabulary
AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWYBJZX-"
aa_to_idx = {aa: i+1 for i, aa in enumerate(AMINO_ACIDS)}
aa_to_idx["X"] = len(aa_to_idx) + 1 #hdy fi 7alet amino-acids unconnu
print(aa_to_idx)

{'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20, 'B': 21, 'J': 22, 'Z': 23, 'X': 26, '-': 25}


In [10]:
# padding and truncating
import torch

MAX_LEN = 200 #since the average is 182

def encode_sequence(seq, aa_to_idx, max_len=MAX_LEN):
    # Convert amino acids to integers
    encoded = [aa_to_idx.get(aa, aa_to_idx["X"]) for aa in seq]

    # Truncate if too long
    encoded = encoded[:max_len]

    # Pad if too short
    if len(encoded) < max_len:
        encoded += [0] * (max_len - len(encoded))  # 0 = padding

    return torch.tensor(encoded, dtype=torch.long)

sequences = df["sequence"]
labels  =df["family"]
encoded_sequences = [encode_sequence(seq, aa_to_idx, max_len=200) for seq in sequences]

In [14]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

print("Unique families:", len(label_encoder.classes_))


Unique families: 8


In [16]:
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import train_test_split



class ProteinDataset(Dataset):
    def __init__(self, sequences, labels, aa_to_idx, max_len=200):
        self.sequences = sequences
        self.labels = labels
        self.aa_to_idx = aa_to_idx
        self.max_len = max_len

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]
        label = self.labels[idx]
        return encode_sequence(seq, self.aa_to_idx, self.max_len), torch.tensor(label, dtype=torch.long)

train_seqs, val_seqs, train_labels, val_labels = train_test_split(
    sequences.tolist(),
    encoded_labels.tolist(),
    test_size=0.2,
    random_state=42,
    stratify=encoded_labels   # keeps family distribution balanced
)

train_dataset = ProteinDataset(
    sequences=train_seqs,
    labels=train_labels,
    aa_to_idx=aa_to_idx,
    max_len=200
)
val_dataset = ProteinDataset(
    sequences=val_seqs,
    labels=val_labels,
    aa_to_idx=aa_to_idx,
    max_len=200
)


train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)


for batch_sequences, batch_labels in train_loader:
    print(batch_sequences.shape)  # torch.Size([32, 200])
    print(batch_labels.shape)     # torch.Size([32])
    break



torch.Size([32, 200])
torch.Size([32])


In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ProteinCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(ProteinCNN, self).__init__()

        # Embedding for amino acids
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # Convolutional layers
        self.conv1 = nn.Conv1d(in_channels=embed_dim, out_channels=8, kernel_size=5, padding=2)
        self.conv2 = nn.Conv1d(in_channels=8, out_channels=16, kernel_size=7, padding=3)

        # Fully connected layers
        self.fc1 = nn.Linear(16, 8)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(8, num_classes)

    def forward(self, x):
        # x shape: (batch_size, seq_len)
        x = self.embedding(x)                 # (batch_size, seq_len, embed_dim)
        x = x.permute(0, 2, 1)                # (batch_size, embed_dim, seq_len) for Conv1d

        x = F.relu(self.conv1(x))             # (batch_size, 128, seq_len)
        x = F.max_pool1d(x, kernel_size=2)    # downsample

        x = F.relu(self.conv2(x))             # (batch_size, 256, seq_len/2)
        x = F.adaptive_max_pool1d(x, 1)       # global max pooling → (batch_size, 256, 1)

        x = x.squeeze(-1)                     # (batch_size, 256)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)                      # (batch_size, 128)
        x = self.fc2(x)                       # (batch_size, num_classes)

        return x


model = ProteinCNN(vocab_size=25, embed_dim=50, num_classes=len(label_encoder.classes_))



In [28]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [33]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss, correct, total = 0, 0, 0

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)
    for sequences, labels in loop:
        sequences, labels = sequences.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(sequences)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = outputs.max(1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

        loop.set_postfix(loss=total_loss/len(train_loader), acc=correct/total)

    train_acc = correct / total
    train_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    val_correct, val_total, val_loss = 0, 0, 0
    with torch.no_grad():
        for sequences, labels in val_loader:
            sequences, labels = sequences.to(device), labels.to(device)
            outputs = model(sequences)
            loss = loss_fn(outputs, labels)
            val_loss += loss.item()
            _, predicted = outputs.max(1)
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)

    val_acc = val_correct / val_total
    val_loss = val_loss / len(val_loader)

    print(f"Epoch {epoch+1} | "
          f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")




Epoch 1 | Train Loss: 0.9761, Train Acc: 0.6500 | Val Loss: 0.3344, Val Acc: 0.8705




Epoch 2 | Train Loss: 0.7164, Train Acc: 0.7567 | Val Loss: 0.2891, Val Acc: 0.9207




Epoch 3 | Train Loss: 0.6151, Train Acc: 0.7807 | Val Loss: 0.2736, Val Acc: 0.9203




Epoch 4 | Train Loss: 0.5554, Train Acc: 0.7929 | Val Loss: 0.2088, Val Acc: 0.9257




Epoch 5 | Train Loss: 0.5239, Train Acc: 0.7970 | Val Loss: 0.1754, Val Acc: 0.9431




Epoch 6 | Train Loss: 0.5144, Train Acc: 0.7818 | Val Loss: 0.1741, Val Acc: 0.9427




Epoch 7 | Train Loss: 0.4686, Train Acc: 0.8174 | Val Loss: 0.1400, Val Acc: 0.9498




Epoch 8 | Train Loss: 0.4470, Train Acc: 0.8341 | Val Loss: 0.1502, Val Acc: 0.9468




Epoch 9 | Train Loss: 0.4374, Train Acc: 0.8365 | Val Loss: 0.1312, Val Acc: 0.9608




Epoch 10 | Train Loss: 0.4292, Train Acc: 0.8418 | Val Loss: 0.1221, Val Acc: 0.9665
