In [24]:
%%capture
!pip install transformers
!pip install datasets
# install this module for extracting info from fas file instead of doing by hand
!pip install biopython

In [25]:
#imports
import pandas as pd
import torch
import os
import torch.nn as nn
import torch.optim as optim
from Bio import SeqIO
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoModel, AutoTokenizer, BertTokenizer

Our dataset comprises a folder containing ten FASTA files, each file consisting of numerous sequence names referred to as "sequences." In addition to the existing data, we are introducing an additional class as a label. This label will be represented by the file names, indicating the family to which the various sequences belong.

In [26]:
class SequenceBuilder:
    def __init__(self, folder_path):
        self.folder_path = folder_path

    def sequence_builder(self):
        sequences = {}
        data = []

        for filename in os.listdir(self.folder_path):
            file_path = os.path.join(self.folder_path, filename)
            if os.path.isfile(file_path):  # Exclude directories
                for record in SeqIO.parse(file_path, "fasta"):
                    sequence_name = record.id
                    sequence = str(record.seq)
                    sequences[sequence_name] = [sequence, filename]

        # Build a dataframe
        for key, value in sequences.items():
            data.append([key, value[0], value[1]])
        df = pd.DataFrame(data, columns=['seq name', 'seq', 'class or file name'])
        return df


In [27]:
#build a dataframe# Example usage
folder_path =  "/content/drive/MyDrive/cs612_sequences"
builder = SequenceBuilder(folder_path)
df = builder.sequence_builder()
#make sure to change the directory before saving or it will go exacly to the fas file location
#df.to_csv('Sequence DataFrame.csv', index = False)

In [28]:
#reads the file as a csv. if we ever wanted to change anything about the csv, we have to add it here
class CSVHandler():
  def __init__(self,file_path):
    self.file_path = file_path

  def read_csv(self, file_path):
     df = pd.read_csv(self.file_path)
     return df


In [29]:
#if you what to remove dashes in your file
"""file_path = '/content/drive/MyDrive/new_dataset.csv'
csvhandler = CSVHandler(file_path)

#the dataframe that will be used:
df = csvhandler.read_csv(file_path)

# Specify the column name that contains the "-" characters
column_name = 'seq'

# Remove the "-" character from each element in the column
df[column_name] = df[column_name].str.replace('-', '')

# Save the modified dataset to a new file
df.to_csv('modified_dataset.csv', index=False)
"""

In [40]:
file_path = '/content/drive/MyDrive/new_dataset.csv'
csvhandler = CSVHandler(file_path)

#the dataframe that will be used:
df = csvhandler.read_csv(file_path)

# Split the data into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenize and encode the sequences
train_encodings = tokenizer(train_df['seq'].tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_df['seq'].tolist(), truncation=True, padding=True)

# Prepare the labels
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df['class or file name'].tolist())
val_labels = label_encoder.transform(val_df['class or file name'].tolist())


In [32]:
# Map encoded labels back to original classes
train_class_names = label_encoder.inverse_transform(train_labels)
val_class_names = label_encoder.inverse_transform(val_labels)
"""
# Print the mapping
for label, class_name in zip(train_labels, train_class_names):
    print(f"Encoded label: {label}, Class name: {class_name}")
"""# Create a set to store unique labels and class names
unique_labels = set()
for label, class_name in zip(train_labels, train_class_names):
    unique_labels.add((label, class_name))

# Print the unique labels and class names
for label, class_name in unique_labels:
    print(f"Encoded label: {label}, Class name: {class_name}")


Encoded label: 9, Class name: YFIB_YJAB_20_id90.fas
Encoded label: 2, Class name: MLAB_PTSO_20_id90.fas
Encoded label: 8, Class name: YEFM_YOEB_20_id90.fas
Encoded label: 5, Class name: TESA_THIO_20_id90.fas
Encoded label: 0, Class name: CSPD_IF1_20_id90.fas
Encoded label: 3, Class name: RNPA_YBCJ_20_id90.fas
Encoded label: 7, Class name: YDBL_YNBE_20_id90.fas
Encoded label: 6, Class name: YAGP_YAHO_20_id90.fas
Encoded label: 4, Class name: SLYX_TUSB_20_id90.fas
Encoded label: 1, Class name: ISCA_ISCR_20_id90.fas


In [33]:
# Define the dataset
class SequenceDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [34]:
# Create instances of the dataset
train_dataset = SequenceDataset(train_encodings, train_labels)
val_dataset = SequenceDataset(val_encodings, val_labels)

In [35]:
# Define the model
class MyModel(nn.Module):
    def __init__(self, num_classes):
        super(MyModel, self).__init__()
        self.bert = AutoModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.linear(pooled_output)
        return logits


In [36]:
# Set the device to use
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create an instance of the model and move it to the device
model = MyModel(num_classes=len(label_encoder.classes_)).to(device)

# Create data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)  # Adjust learning rate if needed

# Training loop
num_epochs = 10
best_val_accuracy = 0.0  # Track the best validation accuracy



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [37]:
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = torch.max(outputs, dim=1)
        train_correct += (predicted == labels).sum().item()
        train_total += labels.size(0)

    train_loss /= len(train_loader)
    train_accuracy = train_correct / train_total

    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            val_loss += loss.item()
            _, predicted = torch.max(outputs, dim=1)
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)

    val_loss /= len(val_loader)
    val_accuracy = val_correct / val_total

    print(f"Epoch {epoch+1}/{num_epochs}:")
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_accuracy:.2%}")
    print(f"Validation Loss: {val_loss:.4f} | Validation Acc: {val_accuracy:.2%}")

    # Save the model if it has the best validation accuracy so far
    if val_accuracy > best_val_accuracy:
        torch.save(model.state_dict(), 'best_model.pt')
        best_val_accuracy = val_accuracy

    # Debugging: Print some predictions and targets
    model.eval()
    with torch.no_grad():
        sample_inputs = next(iter(val_loader))
        input_ids = sample_inputs['input_ids'].to(device)
        attention_mask = sample_inputs['attention_mask'].to(device)
        labels = sample_inputs['labels'].to(device)

        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)

        # Convert label indices to class labels
        predicted_classes = label_encoder.inverse_transform(predicted.cpu().numpy())
        target_classes = label_encoder.inverse_transform(labels.cpu().numpy())

        print("Sample Predictions:")
        print("Predicted:", predicted_classes[:5])
        print("Target:", target_classes[:5])

Epoch 1/10:
Train Loss: 1.7370 | Train Acc: 45.57%
Validation Loss: 1.3136 | Validation Acc: 63.25%
Sample Predictions:
Predicted: ['RNPA_YBCJ_20_id90.fas' 'YEFM_YOEB_20_id90.fas' 'YEFM_YOEB_20_id90.fas'
 'RNPA_YBCJ_20_id90.fas' 'YAGP_YAHO_20_id90.fas']
Target: ['RNPA_YBCJ_20_id90.fas' 'YDBL_YNBE_20_id90.fas' 'YEFM_YOEB_20_id90.fas'
 'RNPA_YBCJ_20_id90.fas' 'SLYX_TUSB_20_id90.fas']
Epoch 2/10:
Train Loss: 1.1305 | Train Acc: 67.52%
Validation Loss: 0.9594 | Validation Acc: 72.52%
Sample Predictions:
Predicted: ['RNPA_YBCJ_20_id90.fas' 'YEFM_YOEB_20_id90.fas' 'YEFM_YOEB_20_id90.fas'
 'RNPA_YBCJ_20_id90.fas' 'YAGP_YAHO_20_id90.fas']
Target: ['RNPA_YBCJ_20_id90.fas' 'YDBL_YNBE_20_id90.fas' 'YEFM_YOEB_20_id90.fas'
 'RNPA_YBCJ_20_id90.fas' 'SLYX_TUSB_20_id90.fas']
Epoch 3/10:
Train Loss: 0.7650 | Train Acc: 81.77%
Validation Loss: 0.6150 | Validation Acc: 88.74%
Sample Predictions:
Predicted: ['RNPA_YBCJ_20_id90.fas' 'YDBL_YNBE_20_id90.fas' 'YEFM_YOEB_20_id90.fas'
 'RNPA_YBCJ_20_id90.fas' '

In [38]:
from google.colab import drive
drive.mount('/content/drive')

model_path = '/content/drive/MyDrive/NEW_model_with_dash.pt'
torch.save(model.state_dict(), model_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [39]:
import torch
from transformers import AutoTokenizer
from sklearn.preprocessing import LabelEncoder

# Random sequence to predict
random_sequence = "-------------------------PYPQIVLFGDSLFQGCAHVDGFSFQASLQCHVMRRFDVVNRGFSGWNTANALKYLPDIIAPPQLKYLLVLLGANDAVTGVPLAEYKQNLLKIVTHPNITAHKPLVTPP--PIDTGAKISAEYTQAARDVAAEVPVTLIDLWAALHPGGAAL-LPDGLHMSGEGYKVFYKIVVPHIGQEY---------VPLTAEKFQTLVTMTQDPWFVKFYAPWCHHCQAMAPNWQQLAKEMKGKLNIGEVNCDVESRLCKDVRLRGYPTILFFKGGERV-EYDGLRGLGDFVHYAEKA--"
# Tokenize and encode the input sequence
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer(random_sequence, truncation=True, padding=True, return_tensors='pt')

# Move the input tensors to the appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
inputs = {key: val.to(device) for key, val in inputs.items()}

# Load your trained model
model = MyModel(num_classes=10)  # Replace with your trained model definition
model.load_state_dict(torch.load(model_path))  # Replace with the path to your trained model

# Move the model to the appropriate device
model = model.to(device)

# Set the model to evaluation mode
model.eval()

# Initialize and fit the label encoder
label_encoder = LabelEncoder()
label_encoder.fit(train_labels)  # Replace `train_labels` with the training labels used during training

# Create a dictionary mapping class numbers to class names
class_names = label_encoder.classes_
class_number_to_name = {class_number: class_name for class_number, class_name in enumerate(class_names)}

# Perform the prediction
with torch.no_grad():
    logits = model(inputs['input_ids'], inputs['attention_mask'])
    probabilities = torch.softmax(logits, dim=1)
    predicted_label = torch.argmax(probabilities, dim=1).item()

# Convert the predicted label index to the original class name
predicted_class = class_number_to_name[predicted_label]

print("Predicted class:", predicted_class)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Predicted class: 5
