In [1]:
%%capture
# Install the third-party packages this notebook relies on; %%capture keeps
# pip's console output out of the notebook.
!pip install transformers
!pip install datasets
# Biopython parses the .fas (FASTA) files so the records do not have to be
# extracted by hand.
!pip install biopython

In [12]:
import pandas as pd
import torch
import os
import torch.nn as nn
import torch.optim as optim
from Bio import SeqIO
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoModel, AutoTokenizer

Our dataset is a folder containing ten FASTA files, each of which holds many named sequences. In addition to the sequence data, we add one extra column that serves as the class label: the name of the file a sequence came from, which indicates the family the sequence belongs to.

In [13]:
class SequenceBuilder:
    """Collect the FASTA records found in a folder into one labelled DataFrame.

    Every regular file directly inside ``folder_path`` is parsed as FASTA and
    its file name is used as the class label of the records it contains.
    """

    # Column names of the DataFrame returned by sequence_builder().
    _COLUMNS = ['seq name', 'seq', 'class or file name']

    def __init__(self, folder_path):
        """Remember the folder whose files will be parsed.

        Args:
            folder_path: Path of the directory that holds the FASTA files.
        """
        self.folder_path = folder_path

    def sequence_builder(self):
        """Parse every file in the folder and return one row per record id.

        Records are keyed by their id, so when the same id occurs more than
        once only the last occurrence is kept.

        Returns:
            pandas.DataFrame with the columns 'seq name', 'seq' and
            'class or file name'; empty (but with those columns) when the
            folder contains no records.

        Raises:
            OSError: If ``folder_path`` cannot be listed.
        """
        sequences = {}

        # os.listdir() returns entries in arbitrary order. Sorting makes the
        # row order, and the label that wins for a duplicated record id,
        # identical on every run and every machine.
        for filename in sorted(os.listdir(self.folder_path)):
            file_path = os.path.join(self.folder_path, filename)
            if not os.path.isfile(file_path):  # Exclude directories
                continue
            for record in SeqIO.parse(file_path, "fasta"):
                sequences[record.id] = (str(record.seq), filename)

        rows = [[name, seq, label] for name, (seq, label) in sequences.items()]
        return pd.DataFrame(rows, columns=self._COLUMNS)


In [14]:
# Example usage: build a DataFrame straight from the FASTA folder.
folder_path = "/content/drive/MyDrive/cs612_sequences"

# The folder lives on Google Drive. Mount the drive first if that has not
# happened yet, so this cell also works right after a kernel restart.
if not os.path.isdir('/content/drive/MyDrive'):
    from google.colab import drive
    drive.mount('/content/drive')

builder = SequenceBuilder(folder_path)
df = builder.sequence_builder()
# Make sure to change the directory before saving, or the CSV will go exactly
# to the .fas file location.
#df.to_csv('Sequence DataFrame.csv', index = False)

In [None]:
# Reads the dataset as a CSV. Any change to how the CSV is loaded belongs here.
class CSVHandler():
    """Small wrapper around ``pandas.read_csv`` bound to a default file path."""

    def __init__(self, file_path):
        """Store the default CSV location.

        Args:
            file_path: Path of the CSV file read when read_csv() is called
                without an argument.
        """
        self.file_path = file_path

    def read_csv(self, file_path=None):
        """Load a CSV file into a DataFrame.

        Args:
            file_path: Optional path overriding the one given to the
                constructor. Previously this argument was accepted but
                silently ignored; it is now honoured.

        Returns:
            pandas.DataFrame with the file's contents.
        """
        path = self.file_path if file_path is None else file_path
        return pd.read_csv(path)


In [16]:
file_path = '/content/drive/MyDrive/new_dataset.csv'
csvhandler = CSVHandler(file_path)

# The DataFrame used from here on (replaces any `df` built in an earlier cell).
df = csvhandler.read_csv(file_path)

# Split the data into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# `tokenizer` was used below without ever being defined, so this cell raised a
# NameError on a fresh kernel. Load the tokenizer that matches the
# 'bert-base-uncased' checkpoint used by the model and the predictor.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# NOTE(review): each sequence is passed as one unspaced string to an English
# WordPiece vocabulary. The tokenizer may collapse most of a sequence into
# unknown tokens, which would be consistent with the near-chance accuracy
# logged by the training cell. Confirm by inspecting
# tokenizer.tokenize(train_df['seq'].iloc[0]).

# Tokenize and encode the sequences
train_encodings = tokenizer(train_df['seq'].tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_df['seq'].tolist(), truncation=True, padding=True)

# Prepare the labels: fit the encoder on the training split only, then reuse
# the same mapping for the validation split.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df['class or file name'].tolist())
val_labels = label_encoder.transform(val_df['class or file name'].tolist())


In [None]:
# Map encoded labels back to original classes
train_class_names = label_encoder.inverse_transform(train_labels)
val_class_names = label_encoder.inverse_transform(val_labels)

# Distinct (encoded label, class name) pairs present in the training split.
unique_labels = set(zip(train_labels, train_class_names))

# Print the unique labels and class names. Sets iterate in arbitrary order,
# so sort by encoded label to get the same, readable listing on every run.
for label, class_name in sorted(unique_labels):
    print(f"Encoded label: {label}, Class name: {class_name}")


In [17]:
# Define the dataset
class SequenceDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels."""

    def __init__(self, encodings, labels):
        """Keep references to the encodings mapping and the label sequence.

        Args:
            encodings: Mapping from field name to an indexable collection
                holding one entry per sample.
            labels: Indexable collection with one label per sample.
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [18]:
# Create instances of the dataset
# Each one pairs the tokenizer encodings of a split with that split's encoded
# labels, so the two splits can be indexed sample by sample.
train_dataset = SequenceDataset(train_encodings, train_labels)
val_dataset = SequenceDataset(val_encodings, val_labels)

In [19]:
# Define the model
class MyModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head."""

    def __init__(self, num_classes, pretrained_name='bert-base-uncased', dropout_prob=0.1):
        """Build the encoder and the classification head.

        Args:
            num_classes: Number of output classes (size of the logits vector).
            pretrained_name: Checkpoint name or path handed to
                ``AutoModel.from_pretrained``. Defaults to the checkpoint the
                notebook originally hard-coded, so existing calls behave the
                same; exposing it allows swapping in another encoder.
            dropout_prob: Dropout probability applied to the pooled output
                before the linear layer.
        """
        super().__init__()
        # Attribute names are kept as-is so previously saved state_dicts
        # still load.
        self.bert = AutoModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout_prob)
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalised class scores (logits) for a batch.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask tensor passed to the encoder.

        Returns:
            Output of the linear head applied to the encoder's
            ``pooler_output`` after dropout.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(encoded.pooler_output)
        return self.linear(pooled)


In [20]:
# Seed PyTorch's random number generators before anything stochastic happens
# (classification-head initialisation, DataLoader shuffling, dropout) so runs
# are repeatable. 42 matches the random_state used for the train/val split.
torch.manual_seed(42)

# Set the device to use
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create an instance of the model and move it to the device
model = MyModel(num_classes=len(label_encoder.classes_)).to(device)

# Create data loaders (only the training data is shuffled)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)  # Adjust learning rate if needed

# Training loop settings
num_epochs = 10
# NOTE(review): Trainer keeps its own best_val_accuracy attribute; this
# module-level copy is not read by any code in this notebook.
best_val_accuracy = 0.0  # Track the best validation accuracy



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
import torch

class Trainer:
    """Runs the train/validate cycle for a classifier and keeps the best weights.

    The model is called as ``model(input_ids, attention_mask)`` and every
    batch must be a mapping with the keys 'input_ids', 'attention_mask' and
    'labels'.
    """

    def __init__(self, model, train_loader, val_loader, criterion, optimizer, device,
                 label_encoder=None, checkpoint_path='best_model.pt'):
        """Store the training components.

        Args:
            model: Module to train.
            train_loader: Iterable of training batches.
            val_loader: Iterable of validation batches.
            criterion: Loss called as ``criterion(outputs, labels)``.
            optimizer: Optimizer updating ``model``'s parameters.
            device: Device the batch tensors are moved to.
            label_encoder: Optional object with ``inverse_transform`` used to
                show class names in the sample printout. When omitted, a
                module-level ``label_encoder`` is used if one exists (the
                original behaviour), otherwise raw indices are printed.
            checkpoint_path: Where the best-scoring weights are saved.
        """
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.criterion = criterion
        self.optimizer = optimizer
        self.device = device
        self.label_encoder = label_encoder
        self.checkpoint_path = checkpoint_path
        self.best_val_accuracy = 0.0

    def train(self, num_epochs):
        """Train for ``num_epochs`` epochs, validating after each one.

        Prints the per-epoch metrics and a few sample predictions, and saves
        the weights whenever the validation accuracy improves.

        Returns:
            A list with one dict per epoch holding 'train_loss',
            'train_accuracy', 'val_loss' and 'val_accuracy'.
        """
        history = []
        for epoch in range(num_epochs):
            train_loss, train_accuracy = self._run_epoch(self.train_loader, training=True)
            val_loss, val_accuracy = self._run_epoch(self.val_loader, training=False)

            print(f"Epoch {epoch+1}/{num_epochs}:")
            print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_accuracy:.2%}")
            print(f"Validation Loss: {val_loss:.4f} | Validation Acc: {val_accuracy:.2%}")

            # Save the model if it has the best validation accuracy so far
            if val_accuracy > self.best_val_accuracy:
                torch.save(self.model.state_dict(), self.checkpoint_path)
                self.best_val_accuracy = val_accuracy

            self._print_sample_predictions()

            history.append({
                'train_loss': train_loss,
                'train_accuracy': train_accuracy,
                'val_loss': val_loss,
                'val_accuracy': val_accuracy,
            })
        return history

    def _run_epoch(self, loader, training):
        """Run one pass over ``loader``; return (mean batch loss, accuracy)."""
        self.model.train(training)  # train(False) is the same as eval()
        total_loss = 0.0
        correct = 0
        seen = 0
        num_batches = 0

        # Gradients are only tracked while training.
        with torch.set_grad_enabled(training):
            for batch in loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                if training:
                    self.optimizer.zero_grad()

                outputs = self.model(input_ids, attention_mask)
                loss = self.criterion(outputs, labels)

                if training:
                    loss.backward()
                    self.optimizer.step()

                total_loss += loss.item()
                correct += (outputs.argmax(dim=1) == labels).sum().item()
                seen += labels.size(0)
                num_batches += 1

        # An empty loader yields zeros instead of a ZeroDivisionError.
        mean_loss = total_loss / num_batches if num_batches else 0.0
        accuracy = correct / seen if seen else 0.0
        return mean_loss, accuracy

    def _print_sample_predictions(self, limit=5):
        """Debugging aid: print predictions and targets for the first val batch."""
        batch = next(iter(self.val_loader), None)
        if batch is None:  # nothing to show for an empty validation set
            return

        self.model.eval()
        with torch.no_grad():
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            labels = batch['labels'].to(self.device)
            predicted = self.model(input_ids, attention_mask).argmax(dim=1)

        # Convert label indices to class labels
        predicted_classes = self._decode(predicted.cpu().numpy())
        target_classes = self._decode(labels.cpu().numpy())

        print("Sample Predictions:")
        print("Predicted:", predicted_classes[:limit])
        print("Target:", target_classes[:limit])

    def _decode(self, indices):
        """Translate class indices to class names when an encoder is available."""
        encoder = self.label_encoder
        if encoder is None:
            # Backward compatibility: the original code read the notebook's
            # global `label_encoder` directly.
            encoder = globals().get('label_encoder')
        if encoder is None:
            return indices
        return encoder.inverse_transform(indices)

# Wire the model, data loaders, loss and optimizer into a Trainer and run the
# training loop for num_epochs epochs.
trainer = Trainer(model, train_loader, val_loader, criterion, optimizer, device)
trainer.train(num_epochs)


Epoch 1/10:
Train Loss: 2.3023 | Train Acc: 15.49%
Validation Loss: 2.2829 | Validation Acc: 13.24%
Epoch 2/10:
Train Loss: 2.2923 | Train Acc: 16.30%
Validation Loss: 2.3052 | Validation Acc: 13.24%
Epoch 3/10:
Train Loss: 2.2575 | Train Acc: 15.71%
Validation Loss: 2.2865 | Validation Acc: 13.24%
Epoch 4/10:
Train Loss: 2.2622 | Train Acc: 16.96%
Validation Loss: 2.2609 | Validation Acc: 18.82%
Epoch 5/10:
Train Loss: 2.2517 | Train Acc: 17.48%
Validation Loss: 2.2906 | Validation Acc: 11.76%
Epoch 6/10:
Train Loss: 2.2640 | Train Acc: 16.22%
Validation Loss: 2.3038 | Validation Acc: 13.24%
Epoch 7/10:
Train Loss: 2.2552 | Train Acc: 15.93%
Validation Loss: 2.2553 | Validation Acc: 18.82%
Epoch 8/10:
Train Loss: 2.2525 | Train Acc: 17.40%
Validation Loss: 2.2665 | Validation Acc: 13.24%
Epoch 9/10:
Train Loss: 2.2507 | Train Acc: 16.37%
Validation Loss: 2.3115 | Validation Acc: 13.24%
Epoch 10/10:
Train Loss: 2.2695 | Train Acc: 16.45%
Validation Loss: 2.2655 | Validation Acc: 13.24%

In [None]:
# Colab only: mount Google Drive so the weights can be written to it.
from google.colab import drive
drive.mount('/content/drive')

# NOTE(review): this stores the weights `model` holds at this point, i.e.
# after the last training epoch. The best-validation weights are the ones the
# Trainer writes to 'best_model.pt'. Confirm which of the two is wanted here.
model_path = '/content/drive/MyDrive/NEW_model_with_dash.pt'
torch.save(model.state_dict(), model_path)


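You can run the code below separately from the training cells, as long as a trained `model` and the fitted `label_encoder` are already defined in the session.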

In [None]:
import torch
from transformers import AutoTokenizer
from sklearn.preprocessing import LabelEncoder

class Predictor:
    """Predicts the class name of a single sequence with a trained model."""

    def __init__(self, model, label_encoder, tokenizer=None):
        """Prepare the model for inference.

        Args:
            model: Trained module called as ``model(input_ids, attention_mask)``
                and returning one row of logits per input.
            label_encoder: Object whose ``inverse_transform`` maps class
                indices back to class names.
            tokenizer: Optional tokenizer to use. When omitted, the
                'bert-base-uncased' tokenizer is loaded on the first call to
                predict() and reused afterwards.
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # The inputs are moved to self.device in predict(), so the model has
        # to live on the same device or the forward pass fails.
        self.model = model.to(self.device)
        self.label_encoder = label_encoder
        self.tokenizer = tokenizer

    def predict(self, sequence):
        """Return the predicted class name for one sequence.

        Args:
            sequence: The sequence to classify, as a single string.

        Returns:
            The class name produced by ``label_encoder.inverse_transform`` for
            the highest-scoring class.
        """
        # Load the tokenizer once instead of on every call.
        if self.tokenizer is None:
            self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

        # Tokenize and encode the input sequence
        inputs = self.tokenizer(sequence, truncation=True, padding=True, return_tensors='pt')

        # Move the input tensors to the appropriate device
        inputs = {key: val.to(self.device) for key, val in inputs.items()}

        # Set the model to evaluation mode
        self.model.eval()

        # Perform the prediction. Softmax is monotonic, so taking the argmax
        # of the raw logits selects the same class.
        with torch.no_grad():
            logits = self.model(inputs['input_ids'], inputs['attention_mask'])
            predicted_label = torch.argmax(logits, dim=1).item()

        # Convert the predicted label index to the original class name
        return self.label_encoder.inverse_transform([predicted_label])[0]


In [None]:
#EXAMPLE:
# Instantiate the predictor class with the trained model and the fitted
# label encoder (both must already exist in the session).
predictor = Predictor(model, label_encoder)

# Perform the prediction
# NOTE(review): the sequence contains a run of '-' characters, presumably
# alignment gaps. Confirm the training sequences are formatted the same way.
random_sequence = 'MAEELSWKQDGATLHFFGELDGVTVNSLWQQREKMVTGINLFELSGLTRVDTAGLALLIHLTAIVARQGNKIELAAATDNLRTLAQLYNLPEALLPH----KTVEITNKLGMHARPAMKLFELVQSFDAEVMLRNEAGTEAEASSVIALLMLDSAKGGHIEIEVTGPEEEQALAAVIALFNAGFDED'
predicted_class = predictor.predict(random_sequence)

print("Predicted class:", predicted_class)



Encoded label: 1, Class name: ISCA_ISCR_20_id90.fas

Encoded label: 7, Class name: YDBL_YNBE_20_id90.fas

Encoded label: 8, Class name: YEFM_YOEB_20_id90.fas

Encoded label: 6, Class name: YAGP_YAHO_20_id90.fas

Encoded label: 9, Class name: YFIB_YJAB_20_id90.fas

Encoded label: 2, Class name: MLAB_PTSO_20_id90.fas

Encoded label: 3, Class name: RNPA_YBCJ_20_id90.fas

Encoded label: 0, Class name: CSPD_IF1_20_id90.fas

Encoded label: 5, Class name: TESA_THIO_20_id90.fas

Encoded label: 4, Class name: SLYX_TUSB_20_id90.fas

