In [5]:
%%capture
!pip install transformers
!pip install pytorch-lightning
!pip install datasets
# install this module for extracting info from fas file instead of doing by hand
!pip install biopython

In [6]:
#base of the model
import transformers
import torch
import torch.nn as nn
#for data manipulation
import pandas as pd
import os
import torch.optim as optim
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from transformers import AutoModel, AutoTokenizer
from transformers import BertModel, AutoTokenizer, BertTokenizer
from Bio import SeqIO
from torch.utils.data import Dataset, DataLoader
#for spliting the data in train and test
from sklearn import model_selection
from torch import nn, optim


In [7]:
#change the directory to your directory where your sequences are
#%cd /content/drive/MyDrive/cs612_sequences

Our dataset comprises a folder containing ten FASTA files, each file consisting of numerous sequence names referred to as "sequences." In addition to the existing data, we are introducing an additional class as a label. This label will be represented by the file names, indicating the family to which the various sequences belong.

In [8]:
class SequenceBuilder:
    def __init__(self, folder_path):
        self.folder_path = folder_path

    def sequence_builder(self):
        sequences = {}
        data = []

        for filename in os.listdir(self.folder_path):
            file_path = os.path.join(self.folder_path, filename)
            if os.path.isfile(file_path):  # Exclude directories
                for record in SeqIO.parse(file_path, "fasta"):
                    sequence_name = record.id
                    sequence = str(record.seq)
                    sequences[sequence_name] = [sequence, filename]

        # Build a dataframe
        for key, value in sequences.items():
            data.append([key, value[0], value[1]])
        df = pd.DataFrame(data, columns=['seq name', 'seq', 'class or file name'])
        return df


In [9]:
#build a dataframe# Example usage
folder_path =  "/content/drive/MyDrive/cs612_sequences"
builder = SequenceBuilder(folder_path)
df = builder.sequence_builder()

#make sure to change the directory before saving or it will go exacly to the fas file location
#df.to_csv('Sequence DataFrame.csv', index = False)

In [10]:
df = pd.read_csv('/content/drive/MyDrive/Sequence DataFrame.csv')

In [11]:
df.head()

Unnamed: 0,seq name,seq,class or file name
0,,MSITLSDSAAARVNTFLANRGKGFGLRLGVRTSGCSGMAYVLEFVD...,ISCA_ISCR_20_id90.fas
1,B629741_C4GI86_C4GI79_7,---------------------------DYQEKKLREQGVAVQGDQI...,YFIB_YJAB_20_id90.fas
2,B439851_B5FLC1_B5FLB9_2,------------------------------------TGVSVTRSGI...,YFIB_YJAB_20_id90.fas
3,B929713_H1NJT0_H1NJT3_3,---------------------------DRQAEEIKQEKVERVGEGI...,YFIB_YJAB_20_id90.fas
4,B350702_A2VW97_A2VXA5_8,------------------------------------TGTQVTEQPL...,YFIB_YJAB_20_id90.fas


In [12]:
#THIS BLOCK IS OK

df_seq_list = df[['seq']]
labels = df['class or file name']

max_len = 317

# Split the data into training and validation sets
train_X, val_X, train_y, val_y = train_test_split(df_seq_list, labels, test_size=0.2, random_state=42)

# Save the training data to a CSV file
train_X.to_csv('train_data.csv', index=False)
val_X.to_csv('val_data.csv', index=False)

print(len(train_X),len(train_y))

1356 1356


In [13]:
class DataHandler(Dataset):
    def __init__(self, data, labels, tokenizer, max_len=317):
        self.data = data
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(labels)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sequence = self.data.iloc[index]['seq']
        label = self.labels.iloc[index]

        # Tokenize the sequence
        tokens = self.tokenizer.encode_plus(
            sequence,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Encode the label
        label_encoded = self.label_encoder.transform([label])[0]

        # One-hot encode the label
        label_one_hot = to_categorical(label_encoded, num_classes=10)

        return {
            'input_ids': tokens['input_ids'].flatten(),
            'attention_mask': tokens['attention_mask'].flatten(),
            'labels': label_one_hot
        }


In [14]:
#to use the pretrained model/weights
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

data_path = 'train_data.csv'  # Path to the CSV file
datahandler_train = DataHandler(train_X, train_y, tokenizer)
datahandler_val = DataHandler(val_X, val_y, tokenizer)

Now that we have initialized the DataHandler and prepared the training data, we can proceed with training your model

In [15]:
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Define your MyModel class
class MyModel(nn.Module):
    def __init__(self, checkpoint, num_classes):
        super(MyModel, self).__init__()
        self.bert = AutoModel.from_pretrained(checkpoint)
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

        # Accessing BERT model's configuration
        self.bert_config = self.bert.config

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        logits = self.linear(pooled_output)
        return logits


# build the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = MyModel('bert-base-uncased', 10)
model.bert_config.hidden_dropout_prob = 0.2



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
#we can change everything here to customize the model
model.bert_config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.2,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.30.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [17]:
class TrainingLoop:
    def __init__(self, model, datahandler_train, datahandler_valid, batch_size, loss_fn, optimizer):
        self.model = model
        self.train_loader = DataLoader(datahandler_train, batch_size=batch_size, shuffle=True)
        self.valid_loader = DataLoader(datahandler_valid, batch_size=batch_size, shuffle=False)
        self.loss_fn = loss_fn
        self.optimizer = optimizer

    def train(self):
        self.model.train()  # Set the model to training mode

        for batch in self.train_loader:
            # Training code

          self.model.eval()  # Set the model to evaluation mode
          total_loss = 0.0
          total_correct = 0
          total_samples = 0

        with torch.no_grad():
            for batch in self.valid_loader:
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels']

                outputs = self.model(input_ids, attention_mask)

                loss = self.loss_fn(outputs, torch.argmax(labels, dim=1))
                total_loss += loss.item()

                predictions = torch.argmax(outputs, dim=1)
                total_correct += (predictions == torch.argmax(labels, dim=1)).sum().item()
                total_samples += labels.size(0)

        avg_loss = total_loss / len(self.valid_loader)
        accuracy = total_correct / total_samples

        print('Validation Loss:', avg_loss)
        print('Validation Accuracy:', accuracy)


In [19]:
from torch import nn, optim
from torch.utils.data import DataLoader

batch_size = 32
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

training_loop = TrainingLoop(model, datahandler_train, datahandler_val, batch_size, loss_fn, optimizer)


In [20]:
training_loop.train()


Validation Loss: 2.3823482990264893
Validation Accuracy: 0.1
