In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Load data
train_df = pd.read_csv('train.dat', delimiter='\t', names=["labels", "sequence"])
test_df = pd.read_csv('test.dat', delimiter='\t', names=["sequence"])

# Vectorize sequences
vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 4))
X_train = vectorizer.fit_transform(train_df['sequence']).toarray()
# Convert labels from -1 and 1 to 0 and 1
y_train = ((train_df['labels'] + 1) / 2).values  # Converts -1 to 0 and 1 to 1

X_test = vectorizer.transform(test_df['sequence']).toarray()

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

# Create Datasets and DataLoaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)


In [2]:
import torch.nn as nn

class SequenceClassifier(nn.Module):
    def __init__(self, num_features):
        super(SequenceClassifier, self).__init__()
        self.fc1 = nn.Linear(num_features, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()  # Use Sigmoid for binary classification

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        x = self.sigmoid(x)
        return x

model = SequenceClassifier(num_features=X_train.shape[1])


In [3]:
import torch.optim as optim

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')


Epoch 1, Loss: 0.24848389625549316
Epoch 2, Loss: 0.062069643288850784
Epoch 3, Loss: 0.021208621561527252
Epoch 4, Loss: 0.0073590646497905254
Epoch 5, Loss: 0.0023329881951212883
Epoch 6, Loss: 0.0020144085865467787
Epoch 7, Loss: 0.00046521524200215936
Epoch 8, Loss: 0.00039137215935625136
Epoch 9, Loss: 0.00016243994468823075
Epoch 10, Loss: 0.00017104078142438084


In [9]:
from sklearn.metrics import matthews_corrcoef

# Split the training data to include a validation set
X_train_val, X_val, y_train_val, y_val = train_test_split(X_train_tensor, y_train_tensor, test_size=0.2, random_state=42)

# Create DataLoaders for the training and validation sets
train_dataset = TensorDataset(X_train_val, y_train_val)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

validation_dataset = TensorDataset(X_val, y_val)
validation_loader = DataLoader(validation_dataset, batch_size=64, shuffle=False)

# Reinitialize the model to ensure clean start
model = SequenceClassifier(num_features=X_train.shape[1])

# Setup the optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

# Training and validation loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    
    # Validation loop
    model.eval()
    val_losses = []
    val_predictions = []
    val_targets = []
    with torch.no_grad():
        for inputs, labels in validation_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_losses.append(loss.item())
            val_predictions.extend(outputs.squeeze().tolist())
            val_targets.extend(labels.squeeze().tolist())

    # Calculate validation metrics
    val_predictions_bin = torch.tensor(val_predictions) > 0.5  # Binarize predictions
    val_targets = torch.tensor(val_targets)
    average_val_loss = sum(val_losses) / len(val_losses)
    mcc = matthews_corrcoef(val_targets.numpy(), val_predictions_bin.numpy())

    print(f'Epoch {epoch+1}, Train Loss: {loss.item()}, Val Loss: {average_val_loss}, Val MCC: {mcc}')


Epoch 1, Train Loss: 0.13789811730384827, Val Loss: 0.18859618306159973, Val MCC: 0.0
Epoch 2, Train Loss: 0.04857424646615982, Val Loss: 0.09058097153902053, Val MCC: 0.8984015288974675
Epoch 3, Train Loss: 0.017839808017015457, Val Loss: 0.051862097531557086, Val MCC: 0.902241594022416
Epoch 4, Train Loss: 0.00837368331849575, Val Loss: 0.04152483530342579, Val MCC: 0.9253957342682618
Epoch 5, Train Loss: 0.004786792676895857, Val Loss: 0.040681752283126114, Val MCC: 0.9502139849488279
Epoch 6, Train Loss: 0.003310178406536579, Val Loss: 0.04159699734300375, Val MCC: 0.9502139849488279
Epoch 7, Train Loss: 0.0025816955603659153, Val Loss: 0.04235920375213027, Val MCC: 0.9502139849488279
Epoch 8, Train Loss: 0.002181407529860735, Val Loss: 0.04272624896839261, Val MCC: 0.9502139849488279
Epoch 9, Train Loss: 0.00181833584792912, Val Loss: 0.043630997510626914, Val MCC: 0.9502139849488279
Epoch 10, Train Loss: 0.0016453900607302785, Val Loss: 0.04383325569797307, Val MCC: 0.95021398494

In [10]:
# Step 5: Predict on Test Data
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor)
    predicted_labels = (predictions > 0.5).float() * 2 - 1  # Using 0.5 as the threshold and adjusting to -1 and 1

# Optionally, save these predictions to a CSV file or similar
predicted_labels_np = predicted_labels.numpy()
np.savetxt("test_predictions.txt", predicted_labels_np, delimiter=",", fmt="%d")

