In [3]:
!pip install git+https://github.com/exobyte-labs/betamark.git
!pip install genomic-benchmarks torch
!pip install scikit-learn
!pip install --upgrade jupyter ipywidgets



Collecting git+https://github.com/exobyte-labs/betamark.git
  Cloning https://github.com/exobyte-labs/betamark.git to /tmp/pip-req-build-2iyokf29
  Running command git clone --filter=blob:none --quiet https://github.com/exobyte-labs/betamark.git /tmp/pip-req-build-2iyokf29

  Resolved https://github.com/exobyte-labs/betamark.git to commit 5a07b805cf38d01f2412ed7468604de83be5e740
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting jupyter
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting ipywidgets
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting notebook (from jupyter)
  Downloading notebook-7.2.2-py3-none-any.whl.metadata (10 kB)
Collecting jupyter-console (from jupyter)
  Downloading jupyter_console-6.6.3-py3-none-any.whl.metadata (5.8 kB)
Collecting nbconvert (from jupyter)
  Downloading nbconvert-7.16.4-p

In [11]:
import torch  # Main library for building and training neural networks
import torch.nn as nn  # Provides essential neural network layers
import torch.optim as optim  # Contains optimizers to update model parameters
from torch.utils.data import DataLoader  # For batching and shuffling datasets
from genomic_benchmarks.dataset_getters.pytorch_datasets import HumanOcrEnsembl  # Genomic dataset
from betamark import ocr  # OCR model, potentially for additional text processing

In [12]:
# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using {device} device")


Using cuda device


In [13]:
# Build the train and test splits (dataset version 0), in that order.
train_dset, test_dset = (
    HumanOcrEnsembl(split=split_name, version=0) for split_name in ('train', 'test')
)

# Mini-batches of 32: the training data is reshuffled on every pass, while the
# test data keeps its original order.
train_loader = DataLoader(train_dset, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dset, shuffle=False, batch_size=32)


In [14]:
class GenomicCNN(nn.Module):
    """1-D CNN over embedded token sequences that outputs per-class sigmoid scores.

    Pipeline: embedding -> [conv(k=5) -> ReLU -> max-pool(2) -> dropout] x 2
    -> flatten -> linear -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table; every input index
            must be smaller than this value.
        embed_dim: Size of each embedding vector (also conv1's input channels).
        num_classes: Number of output units of the final linear layer.
        seq_len: Length of the (already padded/truncated) input sequences.
            Defaults to 500, the length the original layer sizes were written for.

    Raises:
        ValueError: If ``seq_len`` is too short to survive both conv/pool stages.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, seq_len=500):
        super(GenomicCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # Two convolution + pooling stages with growing channel counts.
        self.conv1 = nn.Conv1d(in_channels=embed_dim, out_channels=128, kernel_size=5)
        self.pool1 = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=5)
        self.pool2 = nn.MaxPool1d(kernel_size=2)
        self.dropout = nn.Dropout(0.5)  # 50% dropout after each pooling stage

        # Track the sequence length stage by stage. An unpadded conv with kernel 5
        # removes 4 positions and each pool halves the length (rounding down).
        # The halving must be applied to the length *before* multiplying by the
        # channel count, otherwise odd intermediate lengths give the wrong size.
        after_pool1 = (seq_len - 5 + 1) // 2
        after_pool2 = (after_pool1 - 5 + 1) // 2
        if after_pool2 < 1:
            raise ValueError(
                f"seq_len={seq_len} is too short for two conv(k=5)+pool(2) stages"
            )
        self.fc = nn.Linear(256 * after_pool2, num_classes)

    def forward(self, x):
        """Map a (batch, seq_len) tensor of token indices to (batch, num_classes) scores in [0, 1]."""
        # Embedding yields (batch, seq, embed); Conv1d wants (batch, channels, seq).
        x = self.embedding(x).permute(0, 2, 1)
        x = self.dropout(self.pool1(torch.relu(self.conv1(x))))
        x = self.dropout(self.pool2(torch.relu(self.conv2(x))))
        x = x.view(x.size(0), -1)  # Flatten channels x positions for the linear layer
        x = self.fc(x)
        return torch.sigmoid(x)  # Probabilities, as expected by nn.BCELoss


In [15]:
# Nucleotide -> integer index. Index 0 is shared by 'N', by unknown characters
# and by padding (see sequence_to_tensor).
vocab = {'A': 1, 'C': 2, 'G': 3, 'T': 4, 'N': 0}
# The embedding table needs one row per index up to the largest one in use, so
# size it from the maximum index rather than from the number of dictionary keys
# (the two only coincide while the indices are exactly 0..len(vocab)-1).
vocab_size = max(vocab.values()) + 1
embed_dim = 200  # Width of each nucleotide embedding vector
num_classes = 1  # Single sigmoid output for binary classification

# Build the model on the selected device, then the loss, optimizer and LR schedule.
model = GenomicCNN(vocab_size, embed_dim, num_classes).to(device)
criterion = nn.BCELoss()  # Expects probabilities, which the model's sigmoid provides
optimizer = optim.AdamW(model.parameters(), lr=0.001)
# Halve the learning rate when the monitored value has not decreased for more
# than 3 consecutive scheduler steps.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)


In [16]:
def sequence_to_tensor(sequence, vocab, max_length=500):
    """Encode a character sequence as a fixed-length tensor of vocabulary indices.

    Args:
        sequence: Iterable of characters (e.g. a nucleotide string).
        vocab: Mapping from character to integer index; characters missing
            from it are encoded as 0.
        max_length: Length of the returned tensor. Longer inputs are cut off
            at the end, shorter ones are right-padded with 0.

    Returns:
        A 1-D ``torch.long`` tensor with ``max_length`` entries.
    """
    # Encode every character, then keep at most `max_length` leading entries.
    encoded = [vocab.get(symbol, 0) for symbol in sequence][:max_length]
    # Right-pad with zeros; a non-positive count contributes nothing.
    padding = [0] * (max_length - len(encoded))
    return torch.tensor(encoded + padding, dtype=torch.long)


In [19]:
epochs = 20  # How many full passes to make over the training data
best_accuracy = 0  # Highest training accuracy (in percent) seen so far

for epoch in range(epochs):
    # Training mode keeps dropout active for the whole epoch.
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for x, y in train_loader:
        # Encode the raw sequences of this batch into one (batch, 500) index tensor.
        x = torch.stack([sequence_to_tensor(seq, vocab) for seq in x]).to(device)
        # BCE loss needs float targets shaped like the model output: (batch, 1).
        y = y.to(device).float()
        targets = y.unsqueeze(1)

        # Standard optimisation step: reset grads, forward, loss, backward, update.
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()

        # Running statistics for the epoch summary.
        total_loss += loss.item()
        predicted = (output > 0.5).float()  # Threshold probabilities at 0.5
        correct += (predicted == targets).sum().item()
        total += y.size(0)

    # Epoch summary: mean per-batch loss and training accuracy in percent.
    accuracy = 100 * correct / total
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}, Accuracy: {accuracy:.2f}%")

    # Keep a checkpoint whenever the training accuracy improves.
    # NOTE(review): selection uses accuracy on the training batches (measured with
    # dropout active), not on held-out data — confirm this is intended.
    improved = accuracy > best_accuracy
    if improved:
        best_accuracy = accuracy
        torch.save(model.state_dict(), "best_genomic_cnn_model.pth")
        print(f"New best model saved with accuracy: {best_accuracy:.2f}%")

    # Let the plateau scheduler see this epoch's mean training loss.
    scheduler.step(total_loss / len(train_loader))


Epoch 1/20, Loss: 0.6364, Accuracy: 63.40%
New best model saved with accuracy: 63.40%
Epoch 2/20, Loss: 0.6295, Accuracy: 64.28%
New best model saved with accuracy: 64.28%
Epoch 3/20, Loss: 0.6236, Accuracy: 64.88%
New best model saved with accuracy: 64.88%
Epoch 4/20, Loss: 0.6203, Accuracy: 65.26%
New best model saved with accuracy: 65.26%
Epoch 5/20, Loss: 0.6183, Accuracy: 65.50%
New best model saved with accuracy: 65.50%
Epoch 6/20, Loss: 0.6164, Accuracy: 65.74%
New best model saved with accuracy: 65.74%
Epoch 7/20, Loss: 0.6147, Accuracy: 65.82%
New best model saved with accuracy: 65.82%
Epoch 8/20, Loss: 0.6138, Accuracy: 65.93%
New best model saved with accuracy: 65.93%
Epoch 9/20, Loss: 0.6125, Accuracy: 66.01%
New best model saved with accuracy: 66.01%
Epoch 10/20, Loss: 0.6119, Accuracy: 66.22%
New best model saved with accuracy: 66.22%
Epoch 11/20, Loss: 0.6114, Accuracy: 66.15%
Epoch 12/20, Loss: 0.6097, Accuracy: 66.34%
New best model saved with accuracy: 66.34%
Epoch 13

In [23]:
from betamark import ocr

# Define the prediction function for OCR binary classification
def model_predict(sequence):
    """Classify one sequence with the trained model.

    Args:
        sequence: A single character sequence, encoded via ``sequence_to_tensor``
            with the notebook-level ``vocab``.

    Returns:
        ``1`` if the model's output for the sequence is above 0.5, else ``0``.
    """
    # Encode the sequence and prepend a batch axis of size 1 -> [1, max_length].
    encoded = sequence_to_tensor(sequence, vocab)
    input_tensor = encoded.unsqueeze(0).to(device)

    # Evaluation mode switches dropout off; no gradients are needed for inference.
    model.eval()
    with torch.no_grad():
        score = model(input_tensor)

    # Threshold the single output value into a hard 0/1 label.
    return int((score > 0.5).item())


# Run the `betamark` OCR evaluation, handing it `model_predict` as the
# user-supplied prediction function.
# NOTE(review): the recorded output of this cell shows a progress bar over only
# 2 items and a result of {'acc': 0.5}; presumably the function is called once
# per evaluation sequence — confirm against the betamark documentation before
# reading much into that accuracy.
ocr.run_eval(user_func=model_predict)

100%|██████████| 2/2 [00:00<00:00, 620.73it/s]


{'acc': 0.5}